vendor/clang/clang-trunk-r238337

author: Dimitry Andric <dim@FreeBSD.org> 2015-05-27 18:47:56 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2015-05-27 18:47:56 +0000
commit: 5e20cdd81c44a443562a09007668ffdf76c455af (patch)
tree: dbbd4047878da71c1a706e26ce05b4e7791b14cc /test/OpenMP
parent: d5f23b0b7528b5c3caed1ba14f897cc4aaa9e3c3 (diff)
181 files changed, 12328 insertions, 1249 deletions
diff --git a/test/OpenMP/atomic_ast_print.cpp b/test/OpenMP/atomic_ast_print.cpp
index e75d291eab51..961c1e69b1fc 100644
--- a/test/OpenMP/atomic_ast_print.cpp
+++ b/test/OpenMP/atomic_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/atomic_capture_codegen.cpp b/test/OpenMP/atomic_capture_codegen.cpp
new file mode 100644
index 000000000000..12f2f3ac4e81
--- /dev/null
+++ b/test/OpenMP/atomic_capture_codegen.cpp
@@ -0,0 +1,1018 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+_Bool bv, bx;
+char cv, cx;
+unsigned char ucv, ucx;
+short sv, sx;
+unsigned short usv, usx;
+int iv, ix;
+unsigned int uiv, uix;
+long lv, lx;
+unsigned long ulv, ulx;
+long long llv, llx;
+unsigned long long ullv, ullx;
+float fv, fx;
+double dv, dx;
+long double ldv, ldx;
+_Complex int civ, cix;
+_Complex float cfv, cfx;
+_Complex double cdv, cdx;
+
+typedef int int4 __attribute__((__vector_size__(16)));
+int4 int4x;
+
+struct BitFields {
+  int : 32;
+  int a : 31;
+} bfx;
+
+struct BitFields_packed {
+  int : 32;
+  int a : 31;
+} __attribute__ ((__packed__)) bfx_packed;
+
+struct BitFields2 {
+  int : 31;
+  int a : 1;
+} bfx2;
+
+struct BitFields2_packed {
+  int : 31;
+  int a : 1;
+} __attribute__ ((__packed__)) bfx2_packed;
+
+struct BitFields3 {
+  int : 11;
+  int a : 14;
+} bfx3;
+
+struct BitFields3_packed {
+  int : 11;
+  int a : 14;
+} __attribute__ ((__packed__)) bfx3_packed;
+
+struct BitFields4 {
+  short : 16;
+  int a: 1;
+  long b : 7;
+} bfx4;
+
+struct BitFields4_packed {
+  short : 16;
+  int a: 1;
+  long b : 7;
+} __attribute__ ((__packed__)) bfx4_packed;
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+float2 float2x;
+
+register int rix __asm__("0");
+
+int main() {
+// CHECK: [[PREV:%.+]] = atomicrmw add i8* @{{.+}}, i8 1 monotonic
+// CHECK: store i8 [[PREV]], i8* @{{.+}},
+#pragma omp atomic capture
+  bv = bx++;
+// CHECK: atomicrmw add i8* @{{.+}}, i8 1 monotonic
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i8 %{{.+}}, i8* @{{.+}},
+#pragma omp atomic capture
+  cv = ++cx;
+// CHECK: [[PREV:%.+]] = atomicrmw sub i8* @{{.+}}, i8 1 monotonic
+// CHECK: store i8 [[PREV]], i8* @{{.+}},
+#pragma omp atomic capture
+  ucv = ucx--;
+// CHECK: atomicrmw sub i16* @{{.+}}, i16 1 monotonic
+// CHECK: sub nsw i32 %{{.+}}, 1
+// CHECK: store i16 %{{.+}}, i16* @{{.+}},
+#pragma omp atomic capture
+  sv = --sx;
+// CHECK: [[USV:%.+]] = load i16, i16* @{{.+}},
+// CHECK: [[EXPR:%.+]] = zext i16 [[USV]] to i32
+// CHECK: [[X:%.+]] = load atomic i16, i16* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i16 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[CONV:%.+]] = zext i16 [[EXPECTED]] to i32
+// CHECK: [[ADD:%.+]] = add nsw i32 [[CONV]], [[EXPR]]
+// CHECK: [[DESIRED_CALC:%.+]] = trunc i32 [[ADD]] to i16
+// CHECK: store i16 [[DESIRED_CALC]], i16* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i16, i16* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i16* [[X_ADDR]], i16 [[EXPECTED]], i16 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i16, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i16, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i16 [[DESIRED_CALC]], i16* @{{.+}},
+#pragma omp atomic capture
+  sv = usx += usv;
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i32, i32* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED_CALC:%.+]] = mul nsw i32 [[EXPECTED]], [[EXPR]]
+// CHECK: store i32 [[DESIRED_CALC]], i32* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[X_ADDR]], i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[DESIRED_CALC]], i32* @{{.+}},
+#pragma omp atomic capture
+  uiv = ix *= iv;
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[PREV:%.+]] = atomicrmw sub i32* @{{.+}}, i32 [[EXPR]] monotonic
+// CHECK: store i32 [[PREV]], i32* @{{.+}},
+#pragma omp atomic capture
+  {iv = uix; uix -= uiv;}
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i32, i32* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED_CALC:%.+]] = shl i32 [[EXPECTED]], [[EXPR]]
+// CHECK: store i32 [[DESIRED_CALC]], i32* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[X_ADDR]], i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[DESIRED_CALC]], i32* @{{.+}},
+#pragma omp atomic capture
+  {ix <<= iv; uiv = ix;}
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i32, i32* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED_CALC:%.+]] = lshr i32 [[EXPECTED]], [[EXPR]]
+// CHECK: store i32 [[DESIRED_CALC]], i32* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[X_ADDR]], i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[DESIRED_CALC]], i32* @{{.+}},
+#pragma omp atomic capture
+  iv = uix >>= uiv;
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i64, i64* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED:%.+]] = sdiv i64 [[EXPECTED]], [[EXPR]]
+// CHECK: store i64 [[DESIRED]], i64* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* [[X_ADDR]], i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i64 [[EXPECTED]], i64* @{{.+}},
+#pragma omp atomic capture
+  {ulv = lx; lx /= lv;}
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[OLD:%.+]] = atomicrmw and i64* @{{.+}}, i64 [[EXPR]] monotonic
+// CHECK: [[DESIRED:%.+]] = and i64 [[OLD]], [[EXPR]]
+// CHECK:  store i64 [[DESIRED]], i64* @{{.+}},
+#pragma omp atomic capture
+  {ulx &= ulv; lv = ulx;}
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[OLD:%.+]] = atomicrmw xor i64* @{{.+}}, i64 [[EXPR]] monotonic
+// CHECK: [[DESIRED:%.+]] = xor i64 [[OLD]], [[EXPR]]
+// CHECK:  store i64 [[DESIRED]], i64* @{{.+}},
+#pragma omp atomic capture
+  ullv = llx ^= llv;
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[OLD:%.+]] = atomicrmw or i64* @{{.+}}, i64 [[EXPR]] monotonic
+// CHECK: [[DESIRED:%.+]] = or i64 [[OLD]], [[EXPR]]
+// CHECK:  store i64 [[DESIRED]], i64* @{{.+}},
+#pragma omp atomic capture
+  llv = ullx |= ullv;
+// CHECK: [[EXPR:%.+]] = load float, float* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i32, i32*  bitcast (float* [[X_ADDR:@.+]] to i32*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[TEMP_I:%.+]] = bitcast float* [[TEMP:%.+]] to i32*
+// CHECK: [[OLD:%.+]] = bitcast i32 [[EXPECTED]] to float
+// CHECK: [[ADD:%.+]] = fadd float [[OLD]], [[EXPR]]
+// CHECK: store float [[ADD]], float* [[TEMP]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[TEMP_I]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* bitcast (float* [[X_ADDR]] to i32*), i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[CAST:%.+]] = fpext float [[ADD]] to double
+// CHECK: store double [[CAST]], double* @{{.+}},
+#pragma omp atomic capture
+  dv = fx = fx + fv;
+// CHECK: [[EXPR:%.+]] = load double, double* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i64, i64*  bitcast (double* [[X_ADDR:@.+]] to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[TEMP_I:%.+]] = bitcast double* [[TEMP:%.+]] to i64*
+// CHECK: [[OLD:%.+]] = bitcast i64 [[EXPECTED]] to double
+// CHECK: [[SUB:%.+]] = fsub double [[EXPR]], [[OLD]]
+// CHECK: store double [[SUB]], double* [[TEMP]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP_I]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (double* [[X_ADDR]] to i64*), i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[CAST:%.+]] = fptrunc double [[OLD]] to float
+// CHECK: store float [[CAST]], float* @{{.+}},
+#pragma omp atomic capture
+  {fv = dx; dx = dv - dx;}
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i128, i128*  bitcast (x86_fp80* [[X_ADDR:@.+]] to i128*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i128 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP:%.+]] to i128*
+// CHECK: store i128 [[EXPECTED]], i128* [[BITCAST]]
+// CHECK: [[BITCAST1:%.+]] = bitcast x86_fp80* [[TEMP1:%.+]] to i128*
+// CHECK: store i128 [[EXPECTED]], i128* [[BITCAST1]]
+// CHECK: [[OLD:%.+]] = load x86_fp80, x86_fp80* [[TEMP1]]
+// CHECK: [[MUL:%.+]] = fmul x86_fp80 [[OLD]], [[EXPR]]
+// CHECK: store x86_fp80 [[MUL]], x86_fp80* [[TEMP]]
+// CHECK: [[DESIRED:%.+]] = load i128, i128* [[BITCAST]]
+// CHECK: [[RES:%.+]] = cmpxchg i128* bitcast (x86_fp80* [[X_ADDR]] to i128*), i128 [[EXPECTED]], i128 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i128, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i128, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[CAST:%.+]] = fptrunc x86_fp80 [[MUL]] to double
+// CHECK: store double [[CAST]], double* @{{.+}},
+#pragma omp atomic capture
+  {ldx = ldx * ldv; dv = ldx;}
+// CHECK: [[EXPR_RE:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[LD_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[LD_RE:%.+]] = load i32, i32* [[LD_RE_ADDR]]
+// CHECK: [[LD_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[LD_IM:%.+]] = load i32, i32* [[LD_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store i32 [[NEW_RE:%.+]], i32* [[X_RE_ADDR]]
+// CHECK: store i32 [[NEW_IM:%.+]], i32* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { i32, i32 }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 0, i32 0)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[RE_CAST:%.+]] = sitofp i32 [[NEW_RE]] to float
+// CHECK: [[IM_CAST:%.+]] = sitofp i32 [[NEW_IM]] to float
+// CHECK: store float [[RE_CAST]], float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 0),
+// CHECK: store float [[IM_CAST]], float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 1),
+#pragma omp atomic capture
+  cfv = cix = civ / cix;
+// CHECK: [[EXPR_RE:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[BITCAST:%.+]] = bitcast { float, float }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 8, i8* bitcast ({ float, float }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[X_RE_OLD:%.+]] = load float, float* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[X_IM_OLD:%.+]] = load float, float* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store float [[NEW_RE:%.+]], float* [[X_RE_ADDR]]
+// CHECK: store float [[NEW_IM:%.+]], float* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { float, float }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { float, float }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 8, i8* bitcast ({ float, float }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 0, i32 0)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[RE_CAST:%.+]] = fptosi float [[X_RE_OLD]] to i32
+// CHECK: [[IM_CAST:%.+]] = fptosi float [[X_IM_OLD]] to i32
+// CHECK: store i32 [[RE_CAST]], i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 0),
+// CHECK: store i32 [[IM_CAST]], i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 1),
+#pragma omp atomic capture
+  {civ = cfx; cfx = cfv + cfx;}
+// CHECK: [[EXPR_RE:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[BITCAST:%.+]] = bitcast { double, double }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 16, i8* bitcast ({ double, double }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 5)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load double, double* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load double, double* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store double [[NEW_RE:%.+]], double* [[X_RE_ADDR]]
+// CHECK: store double [[NEW_IM:%.+]], double* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { double, double }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { double, double }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 16, i8* bitcast ({ double, double }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 5, i32 5)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[RE_CAST:%.+]] = fptrunc double [[NEW_RE]] to float
+// CHECK: [[IM_CAST:%.+]] = fptrunc double [[NEW_IM]] to float
+// CHECK: store float [[RE_CAST]], float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 0),
+// CHECK: store float [[IM_CAST]], float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 1),
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic capture seq_cst
+  {cdx = cdx - cdv; cfv = cdx;}
+// CHECK: [[BV:%.+]] = load i8, i8* @{{.+}}
+// CHECK: [[BOOL:%.+]] = trunc i8 [[BV]] to i1
+// CHECK: [[EXPR:%.+]] = zext i1 [[BOOL]] to i64
+// CHECK: [[OLD:%.+]] = atomicrmw and i64* @{{.+}}, i64 [[EXPR]] monotonic
+// CHECK: [[DESIRED:%.+]] = and i64 [[OLD]], [[EXPR]]
+// CHECK: store i64 [[DESIRED]], i64* @{{.+}},
+#pragma omp atomic capture
+  ulv = ulx = ulx & bv;
+// CHECK: [[CV:%.+]]  = load i8, i8* @{{.+}}, align 1
+// CHECK: [[EXPR:%.+]] = sext i8 [[CV]] to i32
+// CHECK: [[X:%.+]] = load atomic i8, i8* [[BX_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i8 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[OLD_BOOL:%.+]] = trunc i8 [[EXPECTED]] to i1
+// CHECK: [[X_RVAL:%.+]] = zext i1 [[OLD_BOOL]] to i32
+// CHECK: [[AND:%.+]] = and i32 [[EXPR]], [[X_RVAL]]
+// CHECK: [[CAST:%.+]] = icmp ne i32 [[AND]], 0
+// CHECK: [[NEW:%.+]] = zext i1 [[CAST]] to i8
+// CHECK: store i8 [[NEW]], i8* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i8, i8* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i8* [[BX_ADDR]], i8 [[EXPECTED]], i8 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD:%.+]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[OLD_I8:%.+]] = zext i1 [[OLD_BOOL]] to i8
+// CHECK: store i8 [[OLD_I8]], i8* @{{.+}},
+#pragma omp atomic capture
+  {bv = bx; bx = cv & bx;}
+// CHECK: [[UCV:%.+]]  = load i8, i8* @{{.+}},
+// CHECK: [[EXPR:%.+]] = zext i8 [[UCV]] to i32
+// CHECK: [[X:%.+]] = load atomic i8, i8* [[CX_ADDR:@.+]] seq_cst
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i8 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[X_RVAL:%.+]] = sext i8 [[EXPECTED]] to i32
+// CHECK: [[ASHR:%.+]] = ashr i32 [[X_RVAL]], [[EXPR]]
+// CHECK: [[NEW:%.+]] = trunc i32 [[ASHR]] to i8
+// CHECK: store i8 [[NEW]], i8* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i8, i8* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i8* [[CX_ADDR]], i8 [[EXPECTED]], i8 [[DESIRED]] seq_cst seq_cst
+// CHECK: [[OLD_X:%.+]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i8 [[NEW]], i8* @{{.+}},
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic capture, seq_cst
+  {cx = cx >> ucv; cv = cx;}
+// CHECK: [[SV:%.+]]  = load i16, i16* @{{.+}},
+// CHECK: [[EXPR:%.+]] = sext i16 [[SV]] to i32
+// CHECK: [[X:%.+]] = load atomic i64, i64* [[ULX_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[X_RVAL:%.+]] = trunc i64 [[EXPECTED]] to i32
+// CHECK: [[SHL:%.+]] = shl i32 [[EXPR]], [[X_RVAL]]
+// CHECK: [[NEW:%.+]] = sext i32 [[SHL]] to i64
+// CHECK: store i64 [[NEW]], i64* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* [[ULX_ADDR]], i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i64 [[NEW]], i64* @{{.+}},
+#pragma omp atomic capture
+  ulv = ulx = sv << ulx;
+// CHECK: [[USV:%.+]]  = load i16, i16* @{{.+}},
+// CHECK: [[EXPR:%.+]] = zext i16 [[USV]] to i64
+// CHECK: [[X:%.+]] = load atomic i64, i64* [[LX_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED:%.+]] = srem i64 [[EXPECTED]], [[EXPR]]
+// CHECK: store i64 [[DESIRED]], i64* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* [[LX_ADDR]], i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i64 [[EXPECTED]], i64* @{{.+}},
+#pragma omp atomic capture
+  {lv = lx; lx = lx % usv;}
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}}
+// CHECK: [[OLD:%.+]] = atomicrmw or i32* @{{.+}}, i32 [[EXPR]] seq_cst
+// CHECK: [[DESIRED:%.+]] = or i32 [[EXPR]], [[OLD]]
+// CHECK: store i32 [[DESIRED]], i32* @{{.+}},
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic seq_cst, capture
+  {uix = iv | uix; uiv = uix;}
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}}
+// CHECK: [[OLD:%.+]] = atomicrmw and i32* @{{.+}}, i32 [[EXPR]] monotonic
+// CHECK: [[DESIRED:%.+]] = and i32 [[OLD]], [[EXPR]]
+// CHECK: store i32 [[DESIRED]], i32* @{{.+}},
+#pragma omp atomic capture
+  iv = ix = ix & uiv;
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[OLD_RE:%.+]] = load i32, i32* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[OLD_IM:%.+]] = load i32, i32* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store i32 %{{.+}}, i32* [[X_RE_ADDR]]
+// CHECK: store i32 %{{.+}}, i32* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { i32, i32 }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 0, i32 0)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[OLD_RE]], i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 0),
+// CHECK: store i32 [[OLD_IM]], i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 1),
+#pragma omp atomic capture
+  {civ = cix; cix = lv + cix;}
+// CHECK: [[ULV:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[EXPR:%.+]] = uitofp i64 [[ULV]] to float
+// CHECK: [[X:%.+]] = load atomic i32, i32*  bitcast (float* [[X_ADDR:@.+]] to i32*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[TEMP_I:%.+]] = bitcast float* [[TEMP:%.+]] to i32*
+// CHECK: [[OLD:%.+]] = bitcast i32 [[EXPECTED]] to float
+// CHECK: [[MUL:%.+]] = fmul float [[OLD]], [[EXPR]]
+// CHECK: store float [[MUL]], float* [[TEMP]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[TEMP_I]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* bitcast (float* [[X_ADDR]] to i32*), i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store float [[MUL]], float* @{{.+}},
+#pragma omp atomic capture
+  {fx = fx * ulv; fv = fx;}
+// CHECK: [[LLV:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[EXPR:%.+]] = sitofp i64 [[LLV]] to double
+// CHECK: [[X:%.+]] = load atomic i64, i64*  bitcast (double* [[X_ADDR:@.+]] to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[TEMP_I:%.+]] = bitcast double* [[TEMP:%.+]] to i64*
+// CHECK: [[OLD:%.+]] = bitcast i64 [[EXPECTED]] to double
+// CHECK: [[DIV:%.+]] = fdiv double [[OLD]], [[EXPR]]
+// CHECK: store double [[DIV]], double* [[TEMP]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP_I]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (double* [[X_ADDR]] to i64*), i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store double [[DIV]], double* @{{.+}},
+#pragma omp atomic capture
+  dv = dx /= llv;
+// CHECK: [[ULLV:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[EXPR:%.+]] = uitofp i64 [[ULLV]] to x86_fp80
+// CHECK: [[X:%.+]] = load atomic i128, i128*  bitcast (x86_fp80* [[X_ADDR:@.+]] to i128*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i128 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[TEMP_I1:%.+]] = bitcast x86_fp80* [[TEMP1:%.+]] to i128*
+// CHECK: store i128 [[EXPECTED]], i128* [[TEMP_I1]],
+// CHECK: [[TEMP_I:%.+]] = bitcast x86_fp80* [[TEMP:%.+]] to i128*
+// CHECK: store i128 [[EXPECTED]], i128* [[TEMP_I]],
+// CHECK: [[OLD:%.+]] = load x86_fp80, x86_fp80* [[TEMP]],
+// CHECK: [[SUB:%.+]] = fsub x86_fp80 [[OLD]], [[EXPR]]
+// CHECK: store x86_fp80 [[SUB]], x86_fp80* [[TEMP1]]
+// CHECK: [[DESIRED:%.+]] = load i128, i128* [[TEMP_I1]]
+// CHECK: [[RES:%.+]] = cmpxchg i128* bitcast (x86_fp80* [[X_ADDR]] to i128*), i128 [[EXPECTED]], i128 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i128, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i128, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store x86_fp80 [[OLD]], x86_fp80* @{{.+}},
+#pragma omp atomic capture
+  {ldv = ldx; ldx -= ullv;}
+// CHECK: [[EXPR:%.+]] = load float, float* @{{.+}},
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load i32, i32* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load i32, i32* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store i32 [[NEW_RE:%.+]], i32* [[X_RE_ADDR]]
+// CHECK: store i32 [[NEW_IM:%.+]], i32* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { i32, i32 }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 0, i32 0)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[NEW_RE]], i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 0),
+// CHECK: store i32 [[NEW_IM]], i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 1),
+#pragma omp atomic capture
+  {cix = fv / cix; civ = cix;}
+// CHECK: [[EXPR:%.+]] = load double, double* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i16, i16* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i16 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[CONV:%.+]] = sext i16 [[EXPECTED]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CONV]] to double
+// CHECK: [[ADD:%.+]] = fadd double [[X_RVAL]], [[EXPR]]
+// CHECK: [[NEW:%.+]] = fptosi double [[ADD]] to i16
+// CHECK: store i16 [[NEW]], i16* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i16, i16* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i16* [[X_ADDR]], i16 [[EXPECTED]], i16 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i16, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i16, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i16 [[NEW]], i16* @{{.+}},
+#pragma omp atomic capture
+  sv = sx = sx + dv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}},
+// CHECK: [[XI8:%.+]] = load atomic i8, i8* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i8 [ [[XI8]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[BOOL_EXPECTED:%.+]] = trunc i8 [[EXPECTED]] to i1
+// CHECK: [[CONV:%.+]] = zext i1 [[BOOL_EXPECTED]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CONV]] to x86_fp80
+// CHECK: [[MUL:%.+]] = fmul x86_fp80 [[EXPR]], [[X_RVAL]]
+// CHECK: [[BOOL_DESIRED:%.+]] = fcmp une x86_fp80 [[MUL]], 0xK00000000000000000000
+// CHECK: [[DESIRED:%.+]] = zext i1 [[BOOL_DESIRED]] to i8
+// CHECK: store i8 [[DESIRED]], i8* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i8, i8* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i8* [[X_ADDR]], i8 [[EXPECTED]], i8 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[EXPECTED_I8:%.+]] = zext i1 [[BOOL_EXPECTED]] to i8
+// CHECK: store i8 [[EXPECTED_I8]], i8* @{{.+}},
+#pragma omp atomic capture
+  {bv = bx; bx = ldv * bx;}
+// CHECK: [[EXPR_RE:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* [[CIV_ADDR:@.+]], i32 0, i32 0),
+// CHECK: [[EXPR_IM:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* [[CIV_ADDR]], i32 0, i32 1),
+// CHECK: [[XI8:%.+]] = load atomic i8, i8* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i8 [ [[XI8]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[BOOL_EXPECTED:%.+]] = trunc i8 [[EXPECTED]] to i1
+// CHECK: [[X_RVAL:%.+]] = zext i1 [[BOOL_EXPECTED]] to i32
+// CHECK: [[SUB_RE:%.+]] = sub i32 [[EXPR_RE:%.+]], [[X_RVAL]]
+// CHECK: [[SUB_IM:%.+]] = sub i32 [[EXPR_IM:%.+]], 0
+// CHECK: icmp ne i32 [[SUB_RE]], 0
+// CHECK: icmp ne i32 [[SUB_IM]], 0
+// CHECK: [[BOOL_DESIRED:%.+]] = or i1
+// CHECK: [[DESIRED:%.+]] = zext i1 [[BOOL_DESIRED]] to i8
+// CHECK: store i8 [[DESIRED]], i8* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i8, i8* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i8* [[X_ADDR]], i8 [[EXPECTED]], i8 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[DESIRED_I8:%.+]] = zext i1 [[BOOL_DESIRED]] to i8
+// CHECK: store i8 [[DESIRED_I8]], i8* @{{.+}},
+#pragma omp atomic capture
+  {bx = civ - bx; bv = bx;}
+// CHECK: [[EXPR_RE:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[X:%.+]] = load atomic i16, i16* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i16 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[CONV:%.+]] = zext i16 [[EXPECTED]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CONV]] to float
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[TEMP:%.+]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load float, float* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[TEMP]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load float, float* [[X_IM_ADDR]]
+// CHECK: [[NEW:%.+]] = fptoui float [[X_RE]] to i16
+// CHECK: store i16 [[NEW]], i16* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i16, i16* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i16* [[X_ADDR]], i16 [[EXPECTED]], i16 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i16, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i16, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i16 [[NEW]], i16* @{{.+}},
+#pragma omp atomic capture
+  usv = usx /= cfv;
+// CHECK: [[EXPR_RE:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[X:%.+]] = load atomic i64, i64* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[X_RVAL:%.+]] = sitofp i64 [[EXPECTED]] to double
+// CHECK: [[ADD_RE:%.+]] = fadd double [[X_RVAL]], [[EXPR_RE]]
+// CHECK: [[ADD_IM:%.+]] = fadd double 0.000000e+00, [[EXPR_IM]]
+// CHECK: [[DESIRED:%.+]] = fptosi double [[ADD_RE]] to i64
+// CHECK: store i64 [[DESIRED]], i64* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* [[X_ADDR]], i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i64 [[EXPECTED]], i64* @{{.+}},
+#pragma omp atomic capture
+  {llv = llx; llx += cdv;}
+// CHECK: [[IDX:%.+]] = load i16, i16* @{{.+}}
+// CHECK: load i8, i8*
+// CHECK: [[VEC_ITEM_VAL:%.+]] = zext i1 %{{.+}} to i32
+// CHECK: [[I128VAL:%.+]] = load atomic i128, i128* bitcast (<4 x i32>* [[DEST:@.+]] to i128*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_I128:%.+]] = phi i128 [ [[I128VAL]], %{{.+}} ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[TEMP_I:%.+]] = bitcast <4 x i32>* [[TEMP:%.+]] to i128*
+// CHECK: store i128 [[OLD_I128]], i128* [[TEMP_I]],
+// CHECK: [[LD:%.+]] = bitcast i128 [[OLD_I128]] to <4 x i32>
+// CHECK: store <4 x i32> [[LD]], <4 x i32>* [[TEMP1:%.+]],
+// CHECK: [[VEC_VAL:%.+]] = load <4 x i32>, <4 x i32>* [[TEMP1]]
+// CHECK: [[ITEM:%.+]] = extractelement <4 x i32> [[VEC_VAL]], i16 [[IDX]]
+// CHECK: [[OR:%.+]] = or i32 [[ITEM]], [[VEC_ITEM_VAL]]
+// CHECK: [[VEC_VAL:%.+]] = load <4 x i32>, <4 x i32>* [[TEMP]]
+// CHECK: [[NEW_VEC_VAL:%.+]] = insertelement <4 x i32> [[VEC_VAL]], i32 [[OR]], i16 [[IDX]]
+// CHECK: store <4 x i32> [[NEW_VEC_VAL]], <4 x i32>* [[TEMP]]
+// CHECK: [[NEW_I128:%.+]] = load i128, i128* [[TEMP_I]],
+// CHECK: [[RES:%.+]] = cmpxchg i128* bitcast (<4 x i32>* [[DEST]] to i128*), i128 [[OLD_I128]], i128 [[NEW_I128]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL:%.+]] = extractvalue { i128, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[OR]], i32* @{{.+}},
+#pragma omp atomic capture
+  {int4x[sv] |= bv; iv = int4x[sv];}
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* bitcast (i8* getelementptr (i8, i8* bitcast (%struct.BitFields* @{{.+}} to i8*), i64 4) to i32*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP1:%.+]],
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i32 [[A_LD]], 1
+// CHECK: [[A_ASHR:%.+]] = ashr i32 [[A_SHL]], 1
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[A_ASHR]] to x86_fp80
+// CHECK: [[SUB:%.+]] = fsub x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[SUB]] to i32
+// CHECK: [[NEW_VAL:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[BF_VALUE:%.+]] = and i32 [[CONV]], 2147483647
+// CHECK: [[BF_CLEAR:%.+]] = and i32 [[NEW_VAL]], -2147483648
+// CHECK: [[BF_SET:%.+]] = or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 [[BF_SET]], i32* [[TEMP1]],
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* bitcast (i8* getelementptr (i8, i8* bitcast (%struct.BitFields* @{{.+}} to i8*), i64 4) to i32*), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[CONV]], i32* @{{.+}},
+#pragma omp atomic capture
+  iv = bfx.a = bfx.a - ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[LDTEMP:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 4, i8* getelementptr (i8, i8* bitcast (%struct.BitFields_packed* @{{.+}} to i8*), i64 4), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD:%.+]] = load i32, i32* [[LDTEMP]],
+// CHECK: store i32 [[OLD]], i32* [[TEMP1:%.+]],
+// CHECK: [[OLD:%.+]] = load i32, i32* [[LDTEMP]],
+// CHECK: store i32 [[OLD]], i32* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i32 [[A_LD]], 1
+// CHECK: [[A_ASHR:%.+]] = ashr i32 [[A_SHL]], 1
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[A_ASHR]] to x86_fp80
+// CHECK: [[MUL:%.+]] = fmul x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[MUL]] to i32
+// CHECK: [[NEW_VAL:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[BF_VALUE:%.+]] = and i32 [[CONV]], 2147483647
+// CHECK: [[BF_CLEAR:%.+]] = and i32 [[NEW_VAL]], -2147483648
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[TEMP1]]
+// CHECK: [[BITCAST_TEMP_OLD_BF_ADDR:%.+]] = bitcast i32* [[LDTEMP]] to i8*
+// CHECK: [[BITCAST_TEMP_NEW_BF_ADDR:%.+]] = bitcast i32* [[TEMP1]] to i8*
+// CHECK: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 4, i8* getelementptr (i8, i8* bitcast (%struct.BitFields_packed* @{{.+}} to i8*), i64 4), i8* [[BITCAST_TEMP_OLD_BF_ADDR]], i8* [[BITCAST_TEMP_NEW_BF_ADDR]], i32 0, i32 0)
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[A_ASHR]], i32* @{{.+}},
+#pragma omp atomic capture
+  {iv = bfx_packed.a; bfx_packed.a *= ldv;}
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* getelementptr inbounds (%struct.BitFields2, %struct.BitFields2* @{{.+}}, i32 0, i32 0) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP1:%.+]],
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[A_ASHR:%.+]] = ashr i32 [[A_LD]], 31
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[A_ASHR]] to x86_fp80
+// CHECK: [[SUB:%.+]] = fsub x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[SUB]] to i32
+// CHECK: [[NEW_VAL:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i32 [[CONV]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i32 [[BF_AND]], 31
+// CHECK: [[BF_CLEAR:%.+]] = and i32 [[NEW_VAL]], 2147483647
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i32* getelementptr inbounds (%struct.BitFields2, %struct.BitFields2* @{{.+}}, i32 0, i32 0), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[CONV]], i32* @{{.+}},
+#pragma omp atomic capture
+  {bfx2.a -= ldv; iv = bfx2.a;}
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr (i8, i8* bitcast (%struct.BitFields2_packed* @{{.+}} to i8*), i64 3) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST_NEW:%.+]] = bitcast i32* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST_NEW]],
+// CHECK: [[BITCAST:%.+]] = bitcast i32* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST]],
+// CHECK: [[A_LD:%.+]] = load i8, i8* [[BITCAST]],
+// CHECK: [[A_ASHR:%.+]] = ashr i8 [[A_LD]], 7
+// CHECK: [[CAST:%.+]] = sext i8 [[A_ASHR]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CAST]] to x86_fp80
+// CHECK: [[DIV:%.+]] = fdiv x86_fp80 [[EXPR]], [[X_RVAL]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[DIV]] to i32
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[NEW_VAL]] to i8
+// CHECK: [[BF_LD:%.+]] = load i8, i8* [[BITCAST_NEW]],
+// CHECK: [[BF_AND:%.+]] = and i8 [[TRUNC]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i8 [[BF_AND]], 7
+// CHECK: [[BF_CLEAR:%.+]] = and i8 %{{.+}}, 127
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[BITCAST_NEW]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[BITCAST_NEW]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr (i8, i8* bitcast (%struct.BitFields2_packed* @{{.+}} to i8*), i64 3), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[NEW_VAL]], i32* @{{.+}},
+#pragma omp atomic capture
+  iv = bfx2_packed.a = ldv / bfx2_packed.a;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* getelementptr inbounds (%struct.BitFields3, %struct.BitFields3* @{{.+}}, i32 0, i32 0) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP1:%.+]],
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i32 [[A_LD]], 7
+// CHECK: [[A_ASHR:%.+]] = ashr i32 [[A_SHL]], 18
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[A_ASHR]] to x86_fp80
+// CHECK: [[DIV:%.+]] = fdiv x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[DIV]] to i32
+// CHECK: [[BF_LD:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i32 [[NEW_VAL]], 16383
+// CHECK: [[BF_VALUE:%.+]] = shl i32 [[BF_AND]], 11
+// CHECK: [[BF_CLEAR:%.+]] = and i32 %{{.+}}, -33552385
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i32* getelementptr inbounds (%struct.BitFields3, %struct.BitFields3* @{{.+}}, i32 0, i32 0), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[A_ASHR]], i32* @{{.+}},
+#pragma omp atomic capture
+  {iv = bfx3.a; bfx3.a /= ldv;}
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[LDTEMP:%.+]] = bitcast i32* %{{.+}} to i24*
+// CHECK: [[BITCAST:%.+]] = bitcast i24* [[LDTEMP]] to i8*
+// CHECK: call void @__atomic_load(i64 3, i8* getelementptr (i8, i8* bitcast (%struct.BitFields3_packed* @{{.+}} to i8*), i64 1), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD:%.+]] = load i24, i24* [[LDTEMP]],
+// CHECK: store i24 [[OLD]], i24* [[BITCAST2:%.+]],
+// CHECK: [[OLD:%.+]] = load i24, i24* [[LDTEMP]],
+// CHECK: store i24 [[OLD]], i24* [[BITCAST1:%.+]],
+// CHECK: [[A_LD:%.+]] = load i24, i24* [[BITCAST1]],
+// CHECK: [[A_SHL:%.+]] = shl i24 [[A_LD]], 7
+// CHECK: [[A_ASHR:%.+]] = ashr i24 [[A_SHL]], 10
+// CHECK: [[CAST:%.+]] = sext i24 [[A_ASHR]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CAST]] to x86_fp80
+// CHECK: [[ADD:%.+]] = fadd x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[ADD]] to i32
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[NEW_VAL]] to i24
+// CHECK: [[BF_LD:%.+]] = load i24, i24* [[BITCAST2]],
+// CHECK: [[BF_AND:%.+]] = and i24 [[TRUNC]], 16383
+// CHECK: [[BF_VALUE:%.+]] = shl i24 [[BF_AND]], 3
+// CHECK: [[BF_CLEAR:%.+]] = and i24 [[BF_LD]], -131065
+// CHECK: or i24 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i24 %{{.+}}, i24* [[BITCAST2]]
+// CHECK: [[BITCAST_TEMP_OLD_BF_ADDR:%.+]] = bitcast i24* [[LDTEMP]] to i8*
+// CHECK: [[BITCAST_TEMP_NEW_BF_ADDR:%.+]] = bitcast i24* [[BITCAST2]] to i8*
+// CHECK: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 3, i8* getelementptr (i8, i8* bitcast (%struct.BitFields3_packed* @{{.+}} to i8*), i64 1), i8* [[BITCAST_TEMP_OLD_BF_ADDR]], i8* [[BITCAST_TEMP_NEW_BF_ADDR]], i32 0, i32 0)
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[NEW_VAL]], i32* @{{.+}},
+#pragma omp atomic capture
+  {bfx3_packed.a += ldv; iv = bfx3_packed.a;}
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i64, i64* bitcast (%struct.BitFields4* @{{.+}} to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i64 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i64 [[OLD_BF_VALUE]], i64* [[TEMP1:%.+]],
+// CHECK: store i64 [[OLD_BF_VALUE]], i64* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i64 [[A_LD]], 47
+// CHECK: [[A_ASHR:%.+]] = ashr i64 [[A_SHL:%.+]], 63
+// CHECK: [[A_CAST:%.+]] = trunc i64 [[A_ASHR:%.+]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CAST:%.+]] to x86_fp80
+// CHECK: [[MUL:%.+]] = fmul x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[MUL]] to i32
+// CHECK: [[ZEXT:%.+]] = zext i32 [[NEW_VAL]] to i64
+// CHECK: [[BF_LD:%.+]] = load i64, i64* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i64 [[ZEXT]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i64 [[BF_AND]], 16
+// CHECK: [[BF_CLEAR:%.+]] = and i64 [[BF_LD]], -65537
+// CHECK: or i64 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i64 %{{.+}}, i64* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i64, i64* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (%struct.BitFields4* @{{.+}} to i64*), i64 [[OLD_BF_VALUE]], i64 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[NEW_VAL]], i32* @{{.+}},
+#pragma omp atomic capture
+  iv = bfx4.a = bfx4.a * ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %{{.+}} ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST1:%.+]] = bitcast i32* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST1]],
+// CHECK: [[BITCAST:%.+]] = bitcast i32* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST]],
+// CHECK: [[A_LD:%.+]] = load i8, i8* [[BITCAST]],
+// CHECK: [[A_SHL:%.+]] = shl i8 [[A_LD]], 7
+// CHECK: [[A_ASHR:%.+]] = ashr i8 [[A_SHL:%.+]], 7
+// CHECK: [[CAST:%.+]] = sext i8 [[A_ASHR:%.+]] to i32
+// CHECK: [[CONV:%.+]] = sitofp i32 [[CAST]] to x86_fp80
+// CHECK: [[SUB: %.+]] = fsub x86_fp80 [[CONV]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[SUB:%.+]] to i32
+// CHECK: [[NEW_VAL:%.+]] = trunc i32 [[CONV]] to i8
+// CHECK: [[BF_LD:%.+]] = load i8, i8* [[BITCAST1]],
+// CHECK: [[BF_VALUE:%.+]] = and i8 [[NEW_VAL]], 1
+// CHECK: [[BF_CLEAR:%.+]] = and i8 [[BF_LD]], -2
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[BITCAST1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[BITCAST1]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store i32 [[CAST]], i32* @{{.+}},
+#pragma omp atomic capture
+  {iv = bfx4_packed.a; bfx4_packed.a -= ldv;}
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i64, i64* bitcast (%struct.BitFields4* @{{.+}} to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i64 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i64 [[OLD_BF_VALUE]], i64* [[TEMP1:%.+]],
+// CHECK: store i64 [[OLD_BF_VALUE]], i64* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i64 [[A_LD]], 40
+// CHECK: [[A_ASHR:%.+]] = ashr i64 [[A_SHL:%.+]], 57
+// CHECK: [[CONV:%.+]] = sitofp i64 [[A_ASHR]] to x86_fp80
+// CHECK: [[DIV:%.+]] = fdiv x86_fp80 [[CONV]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[DIV]] to i64
+// CHECK: [[BF_LD:%.+]] = load i64, i64* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i64 [[CONV]], 127
+// CHECK: [[BF_VALUE:%.+]] = shl i64 [[BF_AND:%.+]], 17
+// CHECK: [[BF_CLEAR:%.+]] = and i64 [[BF_LD]], -16646145
+// CHECK: [[VAL:%.+]] = or i64 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i64 [[VAL]], i64* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i64, i64* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (%struct.BitFields4* @{{.+}} to i64*), i64 [[OLD_BF_VALUE]], i64 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[NEW_VAL:%.+]] = trunc i64 [[CONV]] to i32
+// CHECK: store i32 [[NEW_VAL]], i32* @{{.+}},
+#pragma omp atomic capture
+  {bfx4.b /= ldv; iv = bfx4.b;}
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST1:%.+]] = bitcast i64* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST1]],
+// CHECK: [[BITCAST:%.+]] = bitcast i64* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST]],
+// CHECK: [[A_LD:%.+]] = load i8, i8* [[BITCAST]],
+// CHECK: [[A_ASHR:%.+]] = ashr i8 [[A_LD]], 1
+// CHECK: [[CAST:%.+]] = sext i8 [[A_ASHR]] to i64
+// CHECK: [[CONV:%.+]] = sitofp i64 [[CAST]] to x86_fp80
+// CHECK: [[ADD:%.+]] = fadd x86_fp80 [[CONV]], [[EXPR]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[ADD]] to i64
+// CHECK: [[TRUNC:%.+]] = trunc i64 [[NEW_VAL]] to i8
+// CHECK: [[BF_LD:%.+]] = load i8, i8* [[BITCAST1]],
+// CHECK: [[BF_AND:%.+]] = and i8 [[TRUNC]], 127
+// CHECK: [[BF_VALUE:%.+]] = shl i8 [[BF_AND]], 1
+// CHECK: [[BF_CLEAR:%.+]] = and i8 [[BF_LD]], 1
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[BITCAST1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[BITCAST1]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: [[NEW_VAL_I32:%.+]] = trunc i64 [[NEW_VAL]] to i32
+// CHECK: store i32 [[NEW_VAL_I32]], i32* @{{.+}},
+#pragma omp atomic capture
+  iv = bfx4_packed.b += ldv;
+// CHECK: load i64, i64*
+// CHECK: [[EXPR:%.+]] = uitofp i64 %{{.+}} to float
+// CHECK: [[I64VAL:%.+]] = load atomic i64, i64* bitcast (<2 x float>* [[DEST:@.+]] to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_I64:%.+]] = phi i64 [ [[I64VAL]], %{{.+}} ], [ [[FAILED_I64_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast <2 x float>* [[LDTEMP1:%.+]] to i64*
+// CHECK: store i64 [[OLD_I64]], i64* [[BITCAST]],
+// CHECK: [[OLD_VEC_VAL:%.+]] = bitcast i64 [[OLD_I64]] to <2 x float>
+// CHECK: store <2 x float> [[OLD_VEC_VAL]], <2 x float>* [[LDTEMP:%.+]],
+// CHECK: [[VEC_VAL:%.+]] = load <2 x float>, <2 x float>* [[LDTEMP]]
+// CHECK: [[X:%.+]] = extractelement <2 x float> [[VEC_VAL]], i64 0
+// CHECK: [[VEC_ITEM_VAL:%.+]] = fsub float [[EXPR]], [[X]]
+// CHECK: [[VEC_VAL:%.+]] = load <2 x float>, <2 x float>* [[LDTEMP1]],
+// CHECK: [[NEW_VEC_VAL:%.+]] = insertelement <2 x float> [[VEC_VAL]], float [[VEC_ITEM_VAL]], i64 0
+// CHECK: store <2 x float> [[NEW_VEC_VAL]], <2 x float>* [[LDTEMP1]]
+// CHECK: [[NEW_I64:%.+]] = load i64, i64* [[BITCAST]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (<2 x float>* [[DEST]] to i64*), i64 [[OLD_I64]], i64 [[NEW_I64]] monotonic monotonic
+// CHECK: [[FAILED_I64_OLD_VAL:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: store float [[X]], float* @{{.+}},
+#pragma omp atomic capture
+  {fv = float2x.x; float2x.x = ulv - float2x.x;}
+// CHECK: [[EXPR:%.+]] = load double, double* @{{.+}},
+// CHECK: [[OLD_VAL:%.+]] = call i32 @llvm.read_register.i32([[REG:metadata ![0-9]+]])
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[OLD_VAL]] to double
+// CHECK: [[DIV:%.+]] = fdiv double [[EXPR]], [[X_RVAL]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi double [[DIV]] to i32
+// CHECK: call void @llvm.write_register.i32([[REG]], i32 [[NEW_VAL]])
+// CHECK: store i32 [[NEW_VAL]], i32* @{{.+}},
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic capture seq_cst
+  {rix = dv / rix; iv = rix;}
+// CHECK: [[OLD_VAL:%.+]] = atomicrmw xchg i32* @{{.+}}, i32 5 monotonic
+// CHECK: call void @llvm.write_register.i32([[REG]], i32 [[OLD_VAL]])
+#pragma omp atomic capture
+  {rix = ix; ix = 5;}
+  return 0;
+}
+#endif
diff --git a/test/OpenMP/atomic_codegen.cpp b/test/OpenMP/atomic_codegen.cpp
new file mode 100644
index 000000000000..2ce1f9450b3f
--- /dev/null
+++ b/test/OpenMP/atomic_codegen.cpp
@@ -0,0 +1,114 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -x c++ -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
+// expected-no-diagnostics
+
+int a;
+int b;
+
+struct St {
+  St() {}
+  ~St() {}
+  int &get() { return a; }
+};
+
+// CHECK-LABEL: parallel_atomic_ewc
+void parallel_atomic_ewc() {
+#pragma omp parallel
+  {
+      // CHECK: invoke void @_ZN2StC1Ev(%struct.St* [[TEMP_ST_ADDR:%.+]])
+      // CHECK: [[SCALAR_ADDR:%.+]] = invoke dereferenceable(4) i32* @_ZN2St3getEv(%struct.St* [[TEMP_ST_ADDR]])
+      // CHECK: [[SCALAR_VAL:%.+]] = load atomic i32, i32* [[SCALAR_ADDR]] monotonic
+      // CHECK: store i32 [[SCALAR_VAL]], i32* @b
+      // CHECK: invoke void @_ZN2StD1Ev(%struct.St* [[TEMP_ST_ADDR]])
+#pragma omp atomic read
+      b = St().get();
+      // CHECK-DAG: invoke void @_ZN2StC1Ev(%struct.St* [[TEMP_ST_ADDR:%.+]])
+      // CHECK-DAG: [[SCALAR_ADDR:%.+]] = invoke dereferenceable(4) i32* @_ZN2St3getEv(%struct.St* [[TEMP_ST_ADDR]])
+      // CHECK-DAG: [[B_VAL:%.+]] = load i32, i32* @b
+      // CHECK: store atomic i32 [[B_VAL]], i32* [[SCALAR_ADDR]] monotonic
+      // CHECK: invoke void @_ZN2StD1Ev(%struct.St* [[TEMP_ST_ADDR]])
+#pragma omp atomic write
+      St().get() = b;
+      // CHECK: invoke void @_ZN2StC1Ev(%struct.St* [[TEMP_ST_ADDR:%.+]])
+      // CHECK: [[SCALAR_ADDR:%.+]] = invoke dereferenceable(4) i32* @_ZN2St3getEv(%struct.St* [[TEMP_ST_ADDR]])
+      // CHECK: [[B_VAL:%.+]] = load i32, i32* @b
+      // CHECK: [[OLD_VAL:%.+]] = load atomic i32, i32* [[SCALAR_ADDR]] monotonic,
+      // CHECK: br label %[[OMP_UPDATE:.+]]
+      // CHECK: [[OMP_UPDATE]]
+      // CHECK: [[OLD_PHI_VAL:%.+]] = phi i32 [ [[OLD_VAL]], %{{.+}} ], [ [[NEW_OLD_VAL:%.+]], %[[OMP_UPDATE]] ]
+      // CHECK: [[NEW_VAL:%.+]] = srem i32 [[OLD_PHI_VAL]], [[B_VAL]]
+      // CHECK: store i32 [[NEW_VAL]], i32* [[TEMP:%.+]],
+      // CHECK: [[NEW_VAL:%.+]] = load i32, i32* [[TEMP]],
+      // CHECK: [[RES:%.+]] = cmpxchg i32* [[SCALAR_ADDR]], i32 [[OLD_PHI_VAL]], i32 [[NEW_VAL]] monotonic monotonic
+      // CHECK: [[NEW_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+      // CHECK: [[COND:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+      // CHECK: br i1 [[COND]], label %[[OMP_DONE:.+]], label %[[OMP_UPDATE]]
+      // CHECK: [[OMP_DONE]]
+      // CHECK: invoke void @_ZN2StD1Ev(%struct.St* [[TEMP_ST_ADDR]])
+#pragma omp atomic
+      St().get() %= b;
+      // CHECK: invoke void @_ZN2StC1Ev(%struct.St* [[TEMP_ST_ADDR:%.+]])
+      // CHECK: [[SCALAR_ADDR:%.+]] = invoke dereferenceable(4) i32* @_ZN2St3getEv(%struct.St* [[TEMP_ST_ADDR]])
+      // CHECK: [[B_VAL:%.+]] = load i32, i32* @b
+      // CHECK: [[OLD_VAL:%.+]] = load atomic i32, i32* [[SCALAR_ADDR]] monotonic,
+      // CHECK: br label %[[OMP_UPDATE:.+]]
+      // CHECK: [[OMP_UPDATE]]
+      // CHECK: [[OLD_PHI_VAL:%.+]] = phi i32 [ [[OLD_VAL]], %{{.+}} ], [ [[NEW_OLD_VAL:%.+]], %[[OMP_UPDATE]] ]
+      // CHECK: [[NEW_CALC_VAL:%.+]] = srem i32 [[OLD_PHI_VAL]], [[B_VAL]]
+      // CHECK: store i32 [[NEW_CALC_VAL]], i32* [[TEMP:%.+]],
+      // CHECK: [[NEW_VAL:%.+]] = load i32, i32* [[TEMP]],
+      // CHECK: [[RES:%.+]] = cmpxchg i32* [[SCALAR_ADDR]], i32 [[OLD_PHI_VAL]], i32 [[NEW_VAL]] monotonic monotonic
+      // CHECK: [[NEW_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+      // CHECK: [[COND:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+      // CHECK: br i1 [[COND]], label %[[OMP_DONE:.+]], label %[[OMP_UPDATE]]
+      // CHECK: [[OMP_DONE]]
+      // CHECK: store i32 [[NEW_CALC_VAL]], i32* @a,
+      // CHECK: invoke void @_ZN2StD1Ev(%struct.St* [[TEMP_ST_ADDR]])
+#pragma omp atomic capture
+      a = St().get() %= b;
+    }
+}
+
+int &foo() { return a; }
+
+// TERM_DEBUG-LABEL: parallel_atomic
+void parallel_atomic() {
+#pragma omp parallel
+  {
+#pragma omp atomic read
+    // TERM_DEBUG-NOT: __kmpc_global_thread_num
+    // TERM_DEBUG:     invoke {{.*}}foo{{.*}}()
+    // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+    // TERM_DEBUG:     load atomic i32, i32* @{{.+}} monotonic, {{.*}}!dbg [[READ_LOC:![0-9]+]]
+    foo() = a;
+#pragma omp atomic write
+    // TERM_DEBUG-NOT: __kmpc_global_thread_num
+    // TERM_DEBUG:     invoke {{.*}}foo{{.*}}()
+    // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+    // TERM_DEBUG-NOT: __kmpc_global_thread_num
+    // TERM_DEBUG:     store atomic i32 {{%.+}}, i32* @{{.+}} monotonic, {{.*}}!dbg [[WRITE_LOC:![0-9]+]]
+    a = foo();
+#pragma omp atomic update
+    // TERM_DEBUG-NOT: __kmpc_global_thread_num
+    // TERM_DEBUG:     invoke {{.*}}foo{{.*}}()
+    // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+    // TERM_DEBUG-NOT: __kmpc_global_thread_num
+    // TERM_DEBUG:     atomicrmw add i32* @{{.+}}, i32 %{{.+}} monotonic, {{.*}}!dbg [[UPDATE_LOC:![0-9]+]]
+    a += foo();
+#pragma omp atomic capture
+    // TERM_DEBUG-NOT: __kmpc_global_thread_num
+    // TERM_DEBUG:     invoke {{.*}}foo{{.*}}()
+    // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+    // TERM_DEBUG-NOT: __kmpc_global_thread_num
+    // TERM_DEBUG:     [[OLD_VAL:%.+]] = atomicrmw add i32* @{{.+}}, i32 %{{.+}} monotonic, {{.*}}!dbg [[CAPTURE_LOC:![0-9]+]]
+    // TERM_DEBUG:     store i32 [[OLD_VAL]], i32* @b,
+    {b = a; a += foo(); }
+  }
+  // TERM_DEBUG:     [[TERM_LPAD]]
+  // TERM_DEBUG:     call void @__clang_call_terminate
+  // TERM_DEBUG:     unreachable
+}
+// TERM_DEBUG-DAG: [[READ_LOC]] = !DILocation(line: [[@LINE-33]],
+// TERM_DEBUG-DAG: [[WRITE_LOC]] = !DILocation(line: [[@LINE-28]],
+// TERM_DEBUG-DAG: [[UPDATE_LOC]] = !DILocation(line: [[@LINE-22]],
+// TERM_DEBUG-DAG: [[CAPTURE_LOC]] = !DILocation(line: [[@LINE-16]],
diff --git a/test/OpenMP/atomic_messages.c b/test/OpenMP/atomic_messages.c
index ae490ee0841a..8182465494ba 100644
--- a/test/OpenMP/atomic_messages.c
+++ b/test/OpenMP/atomic_messages.c
@@ -1,17 +1,19 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 int foo() {
 L1:
   foo();
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
     foo();
     goto L1; // expected-error {{use of undeclared label 'L1'}}
   }
   goto L2; // expected-error {{use of undeclared label 'L2'}}
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
     foo();
   L2:
@@ -100,3 +102,266 @@ int writeS() {
 
   return a.a;
 }
+
+int updateint() {
+  int a = 0, b = 0;
+// Test for atomic update
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  ;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected built-in binary or unary operator}}
+  foo();
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected built-in binary operator}}
+  a = b;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  a = b || a;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  a = a && b;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = (float)a + b;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = 2 * b;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = b + *&a;
+#pragma omp atomic update
+  *&a = *&a +  2;
+#pragma omp atomic update
+  a++;
+#pragma omp atomic
+  ++a;
+#pragma omp atomic update
+  a--;
+#pragma omp atomic
+  --a;
+#pragma omp atomic update
+  a += b;
+#pragma omp atomic
+  a %= b;
+#pragma omp atomic update
+  a *= b;
+#pragma omp atomic
+  a -= b;
+#pragma omp atomic update
+  a /= b;
+#pragma omp atomic
+  a &= b;
+#pragma omp atomic update
+  a ^= b;
+#pragma omp atomic
+  a |= b;
+#pragma omp atomic update
+  a <<= b;
+#pragma omp atomic
+  a >>= b;
+#pragma omp atomic update
+  a = b + a;
+#pragma omp atomic
+  a = a * b;
+#pragma omp atomic update
+  a = b - a;
+#pragma omp atomic
+  a = a / b;
+#pragma omp atomic update
+  a = b & a;
+#pragma omp atomic
+  a = a ^ b;
+#pragma omp atomic update
+  a = b | a;
+#pragma omp atomic
+  a = a << b;
+#pragma omp atomic
+  a = b >> a;
+  // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'update' clause}}
+#pragma omp atomic update update
+  a /= b;
+
+  return 0;
+}
+
+int captureint() {
+  int a = 0, b = 0, c = 0;
+// Test for atomic capture
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected compound statement}}
+  ;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  foo();
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected built-in binary or unary operator}}
+  a = b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = b || a;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  b = a = a && b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = (float)a + b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = 2 * b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = b + *&a;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected exactly two expression statements}}
+  { a = b; }
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected exactly two expression statements}}
+  {}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of the first expression}}
+  {a = b;a = b;}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of the first expression}}
+  {a = b; a = b || a;}
+#pragma omp atomic capture
+  {b = a; a = a && b;}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = (float)a + b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = 2 * b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = b + *&a;
+#pragma omp atomic capture
+  c = *&a = *&a +  2;
+#pragma omp atomic capture
+  c = a++;
+#pragma omp atomic capture
+  c = ++a;
+#pragma omp atomic capture
+  c = a--;
+#pragma omp atomic capture
+  c = --a;
+#pragma omp atomic capture
+  c = a += b;
+#pragma omp atomic capture
+  c = a %= b;
+#pragma omp atomic capture
+  c = a *= b;
+#pragma omp atomic capture
+  c = a -= b;
+#pragma omp atomic capture
+  c = a /= b;
+#pragma omp atomic capture
+  c = a &= b;
+#pragma omp atomic capture
+  c = a ^= b;
+#pragma omp atomic capture
+  c = a |= b;
+#pragma omp atomic capture
+  c = a <<= b;
+#pragma omp atomic capture
+  c = a >>= b;
+#pragma omp atomic capture
+  c = a = b + a;
+#pragma omp atomic capture
+  c = a = a * b;
+#pragma omp atomic capture
+  c = a = b - a;
+#pragma omp atomic capture
+  c = a = a / b;
+#pragma omp atomic capture
+  c = a = b & a;
+#pragma omp atomic capture
+  c = a = a ^ b;
+#pragma omp atomic capture
+  c = a = b | a;
+#pragma omp atomic capture
+  c = a = a << b;
+#pragma omp atomic capture
+  c = a = b >> a;
+#pragma omp atomic capture
+  { c = *&a; *&a = *&a +  2;}
+#pragma omp atomic capture
+  { *&a = *&a +  2; c = *&a;}
+#pragma omp atomic capture
+  {c = a; a++;}
+#pragma omp atomic capture
+  {++a;c = a;}
+#pragma omp atomic capture
+  {c = a;a--;}
+#pragma omp atomic capture
+  {--a;c = a;}
+#pragma omp atomic capture
+  {c = a; a += b;}
+#pragma omp atomic capture
+  {a %= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a *= b;}
+#pragma omp atomic capture
+  {a -= b;c = a;}
+#pragma omp atomic capture
+  {c = a; a /= b;}
+#pragma omp atomic capture
+  {a &= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a ^= b;}
+#pragma omp atomic capture
+  {a |= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a <<= b;}
+#pragma omp atomic capture
+  {a >>= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b + a;}
+#pragma omp atomic capture
+  {a = a * b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b - a;}
+#pragma omp atomic capture
+  {a = a / b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b & a;}
+#pragma omp atomic capture
+  {a = a ^ b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b | a;}
+#pragma omp atomic capture
+  {a = a << b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b >> a;}
+#pragma omp atomic capture
+  {c = a; a = foo();}
+  // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'capture' clause}}
+#pragma omp atomic capture capture
+  b = a /= b;
+
+  return 0;
+}
+
diff --git a/test/OpenMP/atomic_messages.cpp b/test/OpenMP/atomic_messages.cpp
index a6c07ad9e289..c3e02bc96e86 100644
--- a/test/OpenMP/atomic_messages.cpp
+++ b/test/OpenMP/atomic_messages.cpp
@@ -1,17 +1,19 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 int foo() {
 L1:
   foo();
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
     foo();
     goto L1; // expected-error {{use of undeclared label 'L1'}}
   }
   goto L2; // expected-error {{use of undeclared label 'L2'}}
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
     foo();
   L2:
@@ -150,17 +152,91 @@ int write() {
 
 template <class T>
 T update() {
-  T a, b = 0;
+  T a = 0, b = 0, c = 0;
 // Test for atomic update
 #pragma omp atomic update
-  // expected-error@+1 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   ;
 // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'update' clause}}
 #pragma omp atomic update update
   a += b;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected built-in binary operator}}
+  a = b;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  a = b || a;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  a = a && b;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = float(a) + b;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = 2 * b;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = b + *&a;
+#pragma omp atomic
+  *&a = b * *&a;
+#pragma omp atomic update
+  a++;
+#pragma omp atomic
+  ++a;
+#pragma omp atomic update
+  a--;
+#pragma omp atomic
+  --a;
+#pragma omp atomic update
+  a += b;
+#pragma omp atomic
+  a %= b;
+#pragma omp atomic update
+  a *= b;
+#pragma omp atomic
+  a -= b;
+#pragma omp atomic update
+  a /= b;
+#pragma omp atomic
+  a &= b;
+#pragma omp atomic update
+  a ^= b;
+#pragma omp atomic
+  a |= b;
+#pragma omp atomic update
+  a <<= b;
+#pragma omp atomic
+  a >>= b;
+#pragma omp atomic update
+  a = b + a;
+#pragma omp atomic
+  a = a * b;
+#pragma omp atomic update
+  a = b - a;
+#pragma omp atomic
+  a = a / b;
+#pragma omp atomic update
+  a = b & a;
+#pragma omp atomic
+  a = a ^ b;
+#pragma omp atomic update
+  a = b | a;
+#pragma omp atomic
+  a = a << b;
+#pragma omp atomic
+  a = b >> a;
 
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   ;
 
   return T();
@@ -170,14 +246,85 @@ int update() {
   int a, b = 0;
 // Test for atomic update
 #pragma omp atomic update
-  // expected-error@+1 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   ;
 // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'update' clause}}
 #pragma omp atomic update update
   a += b;
-
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected built-in binary operator}}
+  a = b;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  a = b || a;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  a = a && b;
+#pragma omp atomic update
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = float(a) + b;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = 2 * b;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  a = b + *&a;
+#pragma omp atomic update
+  a++;
+#pragma omp atomic
+  ++a;
+#pragma omp atomic update
+  a--;
+#pragma omp atomic
+  --a;
+#pragma omp atomic update
+  a += b;
+#pragma omp atomic
+  a %= b;
+#pragma omp atomic update
+  a *= b;
+#pragma omp atomic
+  a -= b;
+#pragma omp atomic update
+  a /= b;
+#pragma omp atomic
+  a &= b;
+#pragma omp atomic update
+  a ^= b;
+#pragma omp atomic
+  a |= b;
+#pragma omp atomic update
+  a <<= b;
+#pragma omp atomic
+  a >>= b;
+#pragma omp atomic update
+  a = b + a;
+#pragma omp atomic
+  a = a * b;
+#pragma omp atomic update
+  a = b - a;
+#pragma omp atomic
+  a = a / b;
+#pragma omp atomic update
+  a = b & a;
+#pragma omp atomic
+  a = a ^ b;
+#pragma omp atomic update
+  a = b | a;
+#pragma omp atomic
+  a = a << b;
+#pragma omp atomic
+  a = b >> a;
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   ;
 
   return update<int>();
@@ -185,33 +332,345 @@ int update() {
 
 template <class T>
 T capture() {
-  T a, b = 0;
+  T a = 0, b = 0, c = 0;
 // Test for atomic capture
 #pragma omp atomic capture
-  // expected-error@+1 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
-  ++a;
-#pragma omp atomic capture
-  // expected-error@+1 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected compound statement}}
   ;
-// expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'capture' clause}}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  foo();
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected built-in binary or unary operator}}
+  a = b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = b || a;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  b = a = a && b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = (float)a + b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = 2 * b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = b + *&a;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected exactly two expression statements}}
+  { a = b; }
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected exactly two expression statements}}
+  {}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of the first expression}}
+  {a = b;a = b;}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of the first expression}}
+  {a = b; a = b || a;}
+#pragma omp atomic capture
+  {b = a; a = a && b;}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = (float)a + b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = 2 * b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = b + *&a;
+#pragma omp atomic capture
+  c = *&a = *&a +  2;
+#pragma omp atomic capture
+  c = a++;
+#pragma omp atomic capture
+  c = ++a;
+#pragma omp atomic capture
+  c = a--;
+#pragma omp atomic capture
+  c = --a;
+#pragma omp atomic capture
+  c = a += b;
+#pragma omp atomic capture
+  c = a %= b;
+#pragma omp atomic capture
+  c = a *= b;
+#pragma omp atomic capture
+  c = a -= b;
+#pragma omp atomic capture
+  c = a /= b;
+#pragma omp atomic capture
+  c = a &= b;
+#pragma omp atomic capture
+  c = a ^= b;
+#pragma omp atomic capture
+  c = a |= b;
+#pragma omp atomic capture
+  c = a <<= b;
+#pragma omp atomic capture
+  c = a >>= b;
+#pragma omp atomic capture
+  c = a = b + a;
+#pragma omp atomic capture
+  c = a = a * b;
+#pragma omp atomic capture
+  c = a = b - a;
+#pragma omp atomic capture
+  c = a = a / b;
+#pragma omp atomic capture
+  c = a = b & a;
+#pragma omp atomic capture
+  c = a = a ^ b;
+#pragma omp atomic capture
+  c = a = b | a;
+#pragma omp atomic capture
+  c = a = a << b;
+#pragma omp atomic capture
+  c = a = b >> a;
+#pragma omp atomic capture
+  { c = *&a; *&a = *&a +  2;}
+#pragma omp atomic capture
+  { *&a = *&a +  2; c = *&a;}
+#pragma omp atomic capture
+  {c = a; a++;}
+#pragma omp atomic capture
+  {++a;c = a;}
+#pragma omp atomic capture
+  {c = a;a--;}
+#pragma omp atomic capture
+  {--a;c = a;}
+#pragma omp atomic capture
+  {c = a; a += b;}
+#pragma omp atomic capture
+  {a %= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a *= b;}
+#pragma omp atomic capture
+  {a -= b;c = a;}
+#pragma omp atomic capture
+  {c = a; a /= b;}
+#pragma omp atomic capture
+  {a &= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a ^= b;}
+#pragma omp atomic capture
+  {a |= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a <<= b;}
+#pragma omp atomic capture
+  {a >>= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b + a;}
+#pragma omp atomic capture
+  {a = a * b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b - a;}
+#pragma omp atomic capture
+  {a = a / b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b & a;}
+#pragma omp atomic capture
+  {a = a ^ b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b | a;}
+#pragma omp atomic capture
+  {a = a << b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b >> a;}
+#pragma omp atomic capture
+  {c = a; a = foo();}
+  // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'capture' clause}}
 #pragma omp atomic capture capture
-  a = ++b;
+  b = a /= b;
 
   return T();
 }
 
 int capture() {
-  int a, b = 0;
+  int a = 0, b = 0, c = 0;
 // Test for atomic capture
 #pragma omp atomic capture
-  // expected-error@+1 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
-  ++a;
-#pragma omp atomic capture
-  // expected-error@+1 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected compound statement}}
   ;
-// expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'capture' clause}}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  foo();
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected built-in binary or unary operator}}
+  a = b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = b || a;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected one of '+', '*', '-', '/', '&', '^', '|', '<<', or '>>' built-in operations}}
+  b = a = a && b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = (float)a + b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = 2 * b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected assignment expression}}
+  a = b + *&a;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected exactly two expression statements}}
+  { a = b; }
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected exactly two expression statements}}
+  {}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of the first expression}}
+  {a = b;a = b;}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be a compound statement of form '{v = x; x binop= expr;}', '{x binop= expr; v = x;}', '{v = x; x = x binop expr;}', '{v = x; x = expr binop x;}', '{x = x binop expr; v = x;}', '{x = expr binop x; v = x;}' or '{v = x; x = expr;}', '{v = x; x++;}', '{v = x; ++x;}', '{++x; v = x;}', '{x++; v = x;}', '{v = x; x--;}', '{v = x; --x;}', '{--x; v = x;}', '{x--; v = x;}' where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected in right hand side of the first expression}}
+  {a = b; a = b || a;}
+#pragma omp atomic capture
+  {b = a; a = a && b;}
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = (float)a + b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = 2 * b;
+#pragma omp atomic capture
+  // expected-error@+2 {{the statement for 'atomic capture' must be an expression statement of form 'v = ++x;', 'v = --x;', 'v = x++;', 'v = x--;', 'v = x binop= expr;', 'v = x = x binop expr' or 'v = x = expr binop x', where x and v are both l-value expressions with scalar type}}
+  // expected-note@+1 {{expected in right hand side of expression}}
+  b = a = b + *&a;
+#pragma omp atomic capture
+  c = *&a = *&a +  2;
+#pragma omp atomic capture
+  c = a++;
+#pragma omp atomic capture
+  c = ++a;
+#pragma omp atomic capture
+  c = a--;
+#pragma omp atomic capture
+  c = --a;
+#pragma omp atomic capture
+  c = a += b;
+#pragma omp atomic capture
+  c = a %= b;
+#pragma omp atomic capture
+  c = a *= b;
+#pragma omp atomic capture
+  c = a -= b;
+#pragma omp atomic capture
+  c = a /= b;
+#pragma omp atomic capture
+  c = a &= b;
+#pragma omp atomic capture
+  c = a ^= b;
+#pragma omp atomic capture
+  c = a |= b;
+#pragma omp atomic capture
+  c = a <<= b;
+#pragma omp atomic capture
+  c = a >>= b;
+#pragma omp atomic capture
+  c = a = b + a;
+#pragma omp atomic capture
+  c = a = a * b;
+#pragma omp atomic capture
+  c = a = b - a;
+#pragma omp atomic capture
+  c = a = a / b;
+#pragma omp atomic capture
+  c = a = b & a;
+#pragma omp atomic capture
+  c = a = a ^ b;
+#pragma omp atomic capture
+  c = a = b | a;
+#pragma omp atomic capture
+  c = a = a << b;
+#pragma omp atomic capture
+  c = a = b >> a;
+#pragma omp atomic capture
+  { c = *&a; *&a = *&a +  2;}
+#pragma omp atomic capture
+  { *&a = *&a +  2; c = *&a;}
+#pragma omp atomic capture
+  {c = a; a++;}
+#pragma omp atomic capture
+  {++a;c = a;}
+#pragma omp atomic capture
+  {c = a;a--;}
+#pragma omp atomic capture
+  {--a;c = a;}
+#pragma omp atomic capture
+  {c = a; a += b;}
+#pragma omp atomic capture
+  {a %= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a *= b;}
+#pragma omp atomic capture
+  {a -= b;c = a;}
+#pragma omp atomic capture
+  {c = a; a /= b;}
+#pragma omp atomic capture
+  {a &= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a ^= b;}
+#pragma omp atomic capture
+  {a |= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a <<= b;}
+#pragma omp atomic capture
+  {a >>= b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b + a;}
+#pragma omp atomic capture
+  {a = a * b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b - a;}
+#pragma omp atomic capture
+  {a = a / b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b & a;}
+#pragma omp atomic capture
+  {a = a ^ b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b | a;}
+#pragma omp atomic capture
+  {a = a << b; c = a;}
+#pragma omp atomic capture
+  {c = a; a = b >> a;}
+#pragma omp atomic capture
+  {c = a; a = foo();}
+  // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'capture' clause}}
 #pragma omp atomic capture capture
-  a = ++b;
+  b = a /= b;
 
   return capture<int>();
 }
@@ -221,14 +680,16 @@ T seq_cst() {
   T a, b = 0;
 // Test for atomic seq_cst
 #pragma omp atomic seq_cst
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   ;
 // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'seq_cst' clause}}
 #pragma omp atomic seq_cst seq_cst
   a += b;
 
 #pragma omp atomic update seq_cst
-  // expected-error@+1 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   ;
 
   return T();
@@ -238,14 +699,16 @@ int seq_cst() {
   int a, b = 0;
 // Test for atomic seq_cst
 #pragma omp atomic seq_cst
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   ;
 // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'seq_cst' clause}}
 #pragma omp atomic seq_cst seq_cst
   a += b;
 
 #pragma omp atomic update seq_cst
-  // expected-error@+1 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic update' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   ;
 
  return seq_cst<int>();
diff --git a/test/OpenMP/atomic_read_codegen.c b/test/OpenMP/atomic_read_codegen.c
new file mode 100644
index 000000000000..efeec0302d34
--- /dev/null
+++ b/test/OpenMP/atomic_read_codegen.c
@@ -0,0 +1,333 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+_Bool bv, bx;
+char cv, cx;
+unsigned char ucv, ucx;
+short sv, sx;
+unsigned short usv, usx;
+int iv, ix;
+unsigned int uiv, uix;
+long lv, lx;
+unsigned long ulv, ulx;
+long long llv, llx;
+unsigned long long ullv, ullx;
+float fv, fx;
+double dv, dx;
+long double ldv, ldx;
+_Complex int civ, cix;
+_Complex float cfv, cfx;
+_Complex double cdv, cdx;
+
+typedef int int4 __attribute__((__vector_size__(16)));
+int4 int4x;
+
+struct BitFields {
+  int : 32;
+  int a : 31;
+} bfx;
+
+struct BitFields_packed {
+  int : 32;
+  int a : 31;
+} __attribute__ ((__packed__)) bfx_packed;
+
+struct BitFields2 {
+  int : 31;
+  int a : 1;
+} bfx2;
+
+struct BitFields2_packed {
+  int : 31;
+  int a : 1;
+} __attribute__ ((__packed__)) bfx2_packed;
+
+struct BitFields3 {
+  int : 11;
+  int a : 14;
+} bfx3;
+
+struct BitFields3_packed {
+  int : 11;
+  int a : 14;
+} __attribute__ ((__packed__)) bfx3_packed;
+
+struct BitFields4 {
+  short : 16;
+  int a: 1;
+  long b : 7;
+} bfx4;
+
+struct BitFields4_packed {
+  short : 16;
+  int a: 1;
+  long b : 7;
+} __attribute__ ((__packed__)) bfx4_packed;
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+float2 float2x;
+
+register int rix __asm__("0");
+
+int main() {
+// CHECK: load atomic i8, i8*
+// CHECK: store i8
+#pragma omp atomic read
+  bv = bx;
+// CHECK: load atomic i8, i8*
+// CHECK: store i8
+#pragma omp atomic read
+  cv = cx;
+// CHECK: load atomic i8, i8*
+// CHECK: store i8
+#pragma omp atomic read
+  ucv = ucx;
+// CHECK: load atomic i16, i16*
+// CHECK: store i16
+#pragma omp atomic read
+  sv = sx;
+// CHECK: load atomic i16, i16*
+// CHECK: store i16
+#pragma omp atomic read
+  usv = usx;
+// CHECK: load atomic i32, i32*
+// CHECK: store i32
+#pragma omp atomic read
+  iv = ix;
+// CHECK: load atomic i32, i32*
+// CHECK: store i32
+#pragma omp atomic read
+  uiv = uix;
+// CHECK: load atomic i64, i64*
+// CHECK: store i64
+#pragma omp atomic read
+  lv = lx;
+// CHECK: load atomic i64, i64*
+// CHECK: store i64
+#pragma omp atomic read
+  ulv = ulx;
+// CHECK: load atomic i64, i64*
+// CHECK: store i64
+#pragma omp atomic read
+  llv = llx;
+// CHECK: load atomic i64, i64*
+// CHECK: store i64
+#pragma omp atomic read
+  ullv = ullx;
+// CHECK: load atomic i32, i32* bitcast (float*
+// CHECK: bitcast i32 {{.*}} to float
+// CHECK: store float
+#pragma omp atomic read
+  fv = fx;
+// CHECK: load atomic i64, i64* bitcast (double*
+// CHECK: bitcast i64 {{.*}} to double
+// CHECK: store double
+#pragma omp atomic read
+  dv = dx;
+// CHECK: [[LD:%.+]] = load atomic i128, i128* bitcast (x86_fp80*
+// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[LDTEMP:%.*]] to i128*
+// CHECK: store i128 [[LD]], i128* [[BITCAST]]
+// CHECK: [[LD:%.+]] = load x86_fp80, x86_fp80* [[LDTEMP]]
+// CHECK: store x86_fp80 [[LD]]
+#pragma omp atomic read
+  ldv = ldx;
+// CHECK: call{{.*}} void @__atomic_load(i64 8,
+// CHECK: store i32
+// CHECK: store i32
+#pragma omp atomic read
+  civ = cix;
+// CHECK: call{{.*}} void @__atomic_load(i64 8,
+// CHECK: store float
+// CHECK: store float
+#pragma omp atomic read
+  cfv = cfx;
+// CHECK: call{{.*}} void @__atomic_load(i64 16,
+// CHECK: call{{.*}} @__kmpc_flush(
+// CHECK: store double
+// CHECK: store double
+#pragma omp atomic seq_cst read
+  cdv = cdx;
+// CHECK: load atomic i64, i64*
+// CHECK: store i8
+#pragma omp atomic read
+  bv = ulx;
+// CHECK: load atomic i8, i8*
+// CHECK: store i8
+#pragma omp atomic read
+  cv = bx;
+// CHECK: load atomic i8, i8*
+// CHECK: call{{.*}} @__kmpc_flush(
+// CHECK: store i8
+#pragma omp atomic read, seq_cst
+  ucv = cx;
+// CHECK: load atomic i64, i64*
+// CHECK: store i16
+#pragma omp atomic read
+  sv = ulx;
+// CHECK: load atomic i64, i64*
+// CHECK: store i16
+#pragma omp atomic read
+  usv = lx;
+// CHECK: load atomic i32, i32*
+// CHECK: call{{.*}} @__kmpc_flush(
+// CHECK: store i32
+#pragma omp atomic seq_cst, read
+  iv = uix;
+// CHECK: load atomic i32, i32*
+// CHECK: store i32
+#pragma omp atomic read
+  uiv = ix;
+// CHECK: call{{.*}} void @__atomic_load(i64 8,
+// CHECK: store i64
+#pragma omp atomic read
+  lv = cix;
+// CHECK: load atomic i32, i32*
+// CHECK: store i64
+#pragma omp atomic read
+  ulv = fx;
+// CHECK: load atomic i64, i64*
+// CHECK: store i64
+#pragma omp atomic read
+  llv = dx;
+// CHECK: load atomic i128, i128*
+// CHECK: store i64
+#pragma omp atomic read
+  ullv = ldx;
+// CHECK: call{{.*}} void @__atomic_load(i64 8,
+// CHECK: store float
+#pragma omp atomic read
+  fv = cix;
+// CHECK: load atomic i16, i16*
+// CHECK: store double
+#pragma omp atomic read
+  dv = sx;
+// CHECK: load atomic i8, i8*
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bx;
+// CHECK: load atomic i8, i8*
+// CHECK: store i32
+// CHECK: store i32
+#pragma omp atomic read
+  civ = bx;
+// CHECK: load atomic i16, i16*
+// CHECK: store float
+// CHECK: store float
+#pragma omp atomic read
+  cfv = usx;
+// CHECK: load atomic i64, i64*
+// CHECK: store double
+// CHECK: store double
+#pragma omp atomic read
+  cdv = llx;
+// CHECK: [[I128VAL:%.+]] = load atomic i128, i128* bitcast (<4 x i32>* @{{.+}} to i128*) monotonic
+// CHECK: [[I128PTR:%.+]] = bitcast <4 x i32>* [[LDTEMP:%.+]] to i128*
+// CHECK: store i128 [[I128VAL]], i128* [[I128PTR]]
+// CHECK: [[LD:%.+]] = load <4 x i32>, <4 x i32>* [[LDTEMP]]
+// CHECK: extractelement <4 x i32> [[LD]]
+// CHECK: store i8
+#pragma omp atomic read
+  bv = int4x[0];
+// CHECK: [[LD:%.+]] = load atomic i32, i32* bitcast (i8* getelementptr (i8, i8* bitcast (%{{.+}}* @{{.+}} to i8*), i64 4) to i32*) monotonic
+// CHECK: store i32 [[LD]], i32* [[LDTEMP:%.+]]
+// CHECK: [[LD:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: [[SHL:%.+]] = shl i32 [[LD]], 1
+// CHECK: ashr i32 [[SHL]], 1
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx.a;
+// CHECK: [[LDTEMP_VOID_PTR:%.+]] = bitcast i32* [[LDTEMP:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 4, i8* getelementptr (i8, i8* bitcast (%struct.BitFields_packed* @bfx_packed to i8*), i64 4), i8* [[LDTEMP_VOID_PTR]], i32 0)
+// CHECK: [[LD:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: [[SHL:%.+]] = shl i32 [[LD]], 1
+// CHECK: ashr i32 [[SHL]], 1
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx_packed.a;
+// CHECK: [[LD:%.+]] = load atomic i32, i32* getelementptr inbounds (%struct.BitFields2, %struct.BitFields2* @bfx2, i32 0, i32 0) monotonic
+// CHECK: store i32 [[LD]], i32* [[LDTEMP:%.+]]
+// CHECK: [[LD:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: ashr i32 [[LD]], 31
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx2.a;
+// CHECK: [[LD:%.+]] = load atomic i8, i8* getelementptr (i8, i8* bitcast (%struct.BitFields2_packed* @bfx2_packed to i8*), i64 3) monotonic
+// CHECK: store i8 [[LD]], i8* [[LDTEMP:%.+]]
+// CHECK: [[LD:%.+]] = load i8, i8* [[LDTEMP]]
+// CHECK: ashr i8 [[LD]], 7
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx2_packed.a;
+// CHECK: [[LD:%.+]] = load atomic i32, i32* getelementptr inbounds (%struct.BitFields3, %struct.BitFields3* @bfx3, i32 0, i32 0) monotonic
+// CHECK: store i32 [[LD]], i32* [[LDTEMP:%.+]]
+// CHECK: [[LD:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: [[SHL:%.+]] = shl i32 [[LD]], 7
+// CHECK: ashr i32 [[SHL]], 18
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx3.a;
+// CHECK: [[LDTEMP_VOID_PTR:%.+]] = bitcast i24* [[LDTEMP:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 3, i8* getelementptr (i8, i8* bitcast (%struct.BitFields3_packed* @bfx3_packed to i8*), i64 1), i8* [[LDTEMP_VOID_PTR]], i32 0)
+// CHECK: [[LD:%.+]] = load i24, i24* [[LDTEMP]]
+// CHECK: [[SHL:%.+]] = shl i24 [[LD]], 7
+// CHECK: [[ASHR:%.+]] = ashr i24 [[SHL]], 10
+// CHECK: sext i24 [[ASHR]] to i32
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx3_packed.a;
+// CHECK: [[LD:%.+]] = load atomic i64, i64* bitcast (%struct.BitFields4* @bfx4 to i64*) monotonic
+// CHECK: store i64 [[LD]], i64* [[LDTEMP:%.+]]
+// CHECK: [[LD:%.+]] = load i64, i64* [[LDTEMP]]
+// CHECK: [[SHL:%.+]] = shl i64 [[LD]], 47
+// CHECK: [[ASHR:%.+]] = ashr i64 [[SHL]], 63
+// CHECK: trunc i64 [[ASHR]] to i32
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx4.a;
+// CHECK: [[LD:%.+]] = load atomic i8, i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @bfx4_packed, i32 0, i32 0, i64 2) monotonic
+// CHECK: store i8 [[LD]], i8* [[LDTEMP:%.+]]
+// CHECK: [[LD:%.+]] = load i8, i8* [[LDTEMP]]
+// CHECK: [[SHL:%.+]] = shl i8 [[LD]], 7
+// CHECK: [[ASHR:%.+]] = ashr i8 [[SHL]], 7
+// CHECK: sext i8 [[ASHR]] to i32
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx4_packed.a;
+// CHECK: [[LD:%.+]] = load atomic i64, i64* bitcast (%struct.BitFields4* @bfx4 to i64*) monotonic
+// CHECK: store i64 [[LD]], i64* [[LDTEMP:%.+]]
+// CHECK: [[LD:%.+]] = load i64, i64* [[LDTEMP]]
+// CHECK: [[SHL:%.+]] = shl i64 [[LD]], 40
+// CHECK: [[ASHR:%.+]] = ashr i64 [[SHL]], 57
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx4.b;
+// CHECK: [[LD:%.+]] = load atomic i8, i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @bfx4_packed, i32 0, i32 0, i64 2) monotonic
+// CHECK: store i8 [[LD]], i8* [[LDTEMP:%.+]]
+// CHECK: [[LD:%.+]] = load i8, i8* [[LDTEMP]]
+// CHECK: [[ASHR:%.+]] = ashr i8 [[LD]], 1
+// CHECK: sext i8 [[ASHR]] to i64
+// CHECK: store x86_fp80
+#pragma omp atomic read
+  ldv = bfx4_packed.b;
+// CHECK: [[LD:%.+]] = load atomic i64, i64* bitcast (<2 x float>* @{{.+}} to i64*) monotonic
+// CHECK: [[BITCAST:%.+]] = bitcast <2 x float>* [[LDTEMP:%.+]] to i64*
+// CHECK: store i64 [[LD]], i64* [[BITCAST]]
+// CHECK: [[LD:%.+]] = load <2 x float>, <2 x float>* [[LDTEMP]]
+// CHECK: extractelement <2 x float> [[LD]]
+// CHECK: store i64
+#pragma omp atomic read
+  ulv = float2x.x;
+// CHECK: call{{.*}} i{{[0-9]+}} @llvm.read_register
+// CHECK: call{{.*}} @__kmpc_flush(
+// CHECK: store double
+#pragma omp atomic read seq_cst
+  dv = rix;
+  return 0;
+}
+
+#endif
diff --git a/test/OpenMP/atomic_update_codegen.cpp b/test/OpenMP/atomic_update_codegen.cpp
new file mode 100644
index 000000000000..b619a07200dc
--- /dev/null
+++ b/test/OpenMP/atomic_update_codegen.cpp
@@ -0,0 +1,941 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+_Bool bv, bx;
+char cv, cx;
+unsigned char ucv, ucx;
+short sv, sx;
+unsigned short usv, usx;
+int iv, ix;
+unsigned int uiv, uix;
+long lv, lx;
+unsigned long ulv, ulx;
+long long llv, llx;
+unsigned long long ullv, ullx;
+float fv, fx;
+double dv, dx;
+long double ldv, ldx;
+_Complex int civ, cix;
+_Complex float cfv, cfx;
+_Complex double cdv, cdx;
+
+typedef int int4 __attribute__((__vector_size__(16)));
+int4 int4x;
+
+struct BitFields {
+  int : 32;
+  int a : 31;
+} bfx;
+
+struct BitFields_packed {
+  int : 32;
+  int a : 31;
+} __attribute__ ((__packed__)) bfx_packed;
+
+struct BitFields2 {
+  int : 31;
+  int a : 1;
+} bfx2;
+
+struct BitFields2_packed {
+  int : 31;
+  int a : 1;
+} __attribute__ ((__packed__)) bfx2_packed;
+
+struct BitFields3 {
+  int : 11;
+  int a : 14;
+} bfx3;
+
+struct BitFields3_packed {
+  int : 11;
+  int a : 14;
+} __attribute__ ((__packed__)) bfx3_packed;
+
+struct BitFields4 {
+  short : 16;
+  int a: 1;
+  long b : 7;
+} bfx4;
+
+struct BitFields4_packed {
+  short : 16;
+  int a: 1;
+  long b : 7;
+} __attribute__ ((__packed__)) bfx4_packed;
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+float2 float2x;
+
+register int rix __asm__("0");
+
+int main() {
+// CHECK-NOT: atomicrmw
+#pragma omp atomic
+  ++dv;
+// CHECK: atomicrmw add i8* @{{.+}}, i8 1 monotonic
+#pragma omp atomic
+  bx++;
+// CHECK: atomicrmw add i8* @{{.+}}, i8 1 monotonic
+#pragma omp atomic update
+  ++cx;
+// CHECK: atomicrmw sub i8* @{{.+}}, i8 1 monotonic
+#pragma omp atomic
+  ucx--;
+// CHECK: atomicrmw sub i16* @{{.+}}, i16 1 monotonic
+#pragma omp atomic update
+  --sx;
+// CHECK: [[USV:%.+]] = load i16, i16* @{{.+}},
+// CHECK: [[EXPR:%.+]] = zext i16 [[USV]] to i32
+// CHECK: [[X:%.+]] = load atomic i16, i16* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i16 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[CONV:%.+]] = zext i16 [[EXPECTED]] to i32
+// CHECK: [[ADD:%.+]] = add nsw i32 [[CONV]], [[EXPR]]
+// CHECK: [[DESIRED:%.+]] = trunc i32 [[ADD]] to i16
+// CHECK: store i16 [[DESIRED]], i16* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i16, i16* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i16* [[X_ADDR]], i16 [[EXPECTED]], i16 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i16, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i16, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  usx += usv;
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i32, i32* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED:%.+]] = mul nsw i32 [[EXPECTED]], [[EXPR]]
+// CHECK: store i32 [[DESIRED]], i32* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[X_ADDR]], i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  ix *= iv;
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}},
+// CHECK: atomicrmw sub i32* @{{.+}}, i32 [[EXPR]] monotonic
+#pragma omp atomic
+  uix -= uiv;
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i32, i32* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED:%.+]] = shl i32 [[EXPECTED]], [[EXPR]]
+// CHECK: store i32 [[DESIRED]], i32* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[X_ADDR]], i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  ix <<= iv;
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i32, i32* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED:%.+]] = lshr i32 [[EXPECTED]], [[EXPR]]
+// CHECK: store i32 [[DESIRED]], i32* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[X_ADDR]], i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  uix >>= uiv;
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i64, i64* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED:%.+]] = sdiv i64 [[EXPECTED]], [[EXPR]]
+// CHECK: store i64 [[DESIRED]], i64* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* [[X_ADDR]], i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  lx /= lv;
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: atomicrmw and i64* @{{.+}}, i64 [[EXPR]] monotonic
+#pragma omp atomic
+  ulx &= ulv;
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: atomicrmw xor i64* @{{.+}}, i64 [[EXPR]] monotonic
+#pragma omp atomic update
+  llx ^= llv;
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: atomicrmw or i64* @{{.+}}, i64 [[EXPR]] monotonic
+#pragma omp atomic
+  ullx |= ullv;
+// CHECK: [[EXPR:%.+]] = load float, float* @{{.+}},
+// CHECK: [[OLD:%.+]] = load atomic i32, i32*  bitcast (float* [[X_ADDR:@.+]] to i32*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[OLD]], %{{.+}} ], [ [[PREV:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[TEMP:%.+]] to i32*
+// CHECK: [[OLD:%.+]] = bitcast i32 [[EXPECTED]] to float
+// CHECK: [[ADD:%.+]] = fadd float [[OLD]], [[EXPR]]
+// CHECK: store float [[ADD]], float* [[TEMP]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[BITCAST]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* bitcast (float* [[X_ADDR]] to i32*), i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[PREV:%.+]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  fx = fx + fv;
+// CHECK: [[EXPR:%.+]] = load double, double* @{{.+}},
+// CHECK: [[OLD:%.+]] = load atomic i64, i64*  bitcast (double* [[X_ADDR:@.+]] to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[OLD]], %{{.+}} ], [ [[PREV:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast double* [[TEMP:%.+]] to i64*
+// CHECK: [[OLD:%.+]] = bitcast i64 [[EXPECTED]] to double
+// CHECK: [[SUB:%.+]] = fsub double [[EXPR]], [[OLD]]
+// CHECK: store double [[SUB]], double* [[TEMP]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[BITCAST]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (double* [[X_ADDR]] to i64*), i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[PREV:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  dx = dv - dx;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}},
+// CHECK: [[OLD:%.+]] = load atomic i128, i128*  bitcast (x86_fp80* [[X_ADDR:@.+]] to i128*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i128 [ [[OLD]], %{{.+}} ], [ [[PREV:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP:%.+]] to i128*
+// CHECK: store i128 [[EXPECTED]], i128* [[BITCAST]],
+// CHECK: [[BITCAST1:%.+]] = bitcast x86_fp80* [[TEMP1:%.+]] to i128*
+// CHECK: store i128 [[EXPECTED]], i128* [[BITCAST1]],
+// CHECK: [[OLD:%.+]] = load x86_fp80, x86_fp80* [[TEMP1]]
+// CHECK: [[MUL:%.+]] = fmul x86_fp80 [[OLD]], [[EXPR]]
+// CHECK: store x86_fp80 [[MUL]], x86_fp80* [[TEMP]]
+// CHECK: [[DESIRED:%.+]] = load i128, i128* [[BITCAST]]
+// CHECK: [[RES:%.+]] = cmpxchg i128* bitcast (x86_fp80* [[X_ADDR]] to i128*), i128 [[EXPECTED]], i128 [[DESIRED]] monotonic monotonic
+// CHECK: [[PREV:%.+]] = extractvalue { i128, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i128, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  ldx = ldx * ldv;
+// CHECK: [[EXPR_RE:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load i32, i32* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load i32, i32* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store i32 %{{.+}}, i32* [[X_RE_ADDR]]
+// CHECK: store i32 %{{.+}}, i32* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { i32, i32 }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 0, i32 0)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  cix = civ / cix;
+// CHECK: [[EXPR_RE:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[BITCAST:%.+]] = bitcast { float, float }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 8, i8* bitcast ({ float, float }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load float, float* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load float, float* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store float %{{.+}}, float* [[X_RE_ADDR]]
+// CHECK: store float %{{.+}}, float* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { float, float }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { float, float }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 8, i8* bitcast ({ float, float }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 0, i32 0)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  cfx = cfv + cfx;
+// CHECK: [[EXPR_RE:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[BITCAST:%.+]] = bitcast { double, double }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 16, i8* bitcast ({ double, double }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 5)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load double, double* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load double, double* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store double %{{.+}}, double* [[X_RE_ADDR]]
+// CHECK: store double %{{.+}}, double* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { double, double }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { double, double }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 16, i8* bitcast ({ double, double }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 5, i32 5)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic seq_cst
+  cdx = cdx - cdv;
+// CHECK: [[BV:%.+]] = load i8, i8* @{{.+}}
+// CHECK: [[BOOL:%.+]] = trunc i8 [[BV]] to i1
+// CHECK: [[EXPR:%.+]] = zext i1 [[BOOL]] to i64
+// CHECK: atomicrmw and i64* @{{.+}}, i64 [[EXPR]] monotonic
+#pragma omp atomic update
+  ulx = ulx & bv;
+// CHECK: [[CV:%.+]]  = load i8, i8* @{{.+}}, align 1
+// CHECK: [[EXPR:%.+]] = sext i8 [[CV]] to i32
+// CHECK: [[BX:%.+]] = load atomic i8, i8* [[BX_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i8 [ [[BX]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[OLD:%.+]] = trunc i8 [[EXPECTED]] to i1
+// CHECK: [[X_RVAL:%.+]] = zext i1 [[OLD]] to i32
+// CHECK: [[AND:%.+]] = and i32 [[EXPR]], [[X_RVAL]]
+// CHECK: [[CAST:%.+]] = icmp ne i32 [[AND]], 0
+// CHECK: [[DESIRED:%.+]] = zext i1 [[CAST]] to i8
+// CHECK: store i8 [[DESIRED]], i8* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i8, i8* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i8* [[BX_ADDR]], i8 [[EXPECTED]], i8 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  bx = cv & bx;
+// CHECK: [[UCV:%.+]]  = load i8, i8* @{{.+}},
+// CHECK: [[EXPR:%.+]] = zext i8 [[UCV]] to i32
+// CHECK: [[X:%.+]] = load atomic i8, i8* [[CX_ADDR:@.+]] seq_cst
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i8 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[X_RVAL:%.+]] = sext i8 [[EXPECTED]] to i32
+// CHECK: [[ASHR:%.+]] = ashr i32 [[X_RVAL]], [[EXPR]]
+// CHECK: [[DESIRED:%.+]] = trunc i32 [[ASHR]] to i8
+// CHECK: store i8 [[DESIRED]], i8* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i8, i8* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i8* [[CX_ADDR]], i8 [[EXPECTED]], i8 [[DESIRED]] seq_cst seq_cst
+// CHECK: [[OLD_X:%.+]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic update, seq_cst
+  cx = cx >> ucv;
+// CHECK: [[SV:%.+]]  = load i16, i16* @{{.+}},
+// CHECK: [[EXPR:%.+]] = sext i16 [[SV]] to i32
+// CHECK: [[X:%.+]] = load atomic i64, i64* [[ULX_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[X_RVAL:%.+]] = trunc i64 [[EXPECTED]] to i32
+// CHECK: [[SHL:%.+]] = shl i32 [[EXPR]], [[X_RVAL]]
+// CHECK: [[DESIRED:%.+]] = sext i32 [[SHL]] to i64
+// CHECK: store i64 [[DESIRED]], i64* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* [[ULX_ADDR]], i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  ulx = sv << ulx;
+// CHECK: [[USV:%.+]]  = load i16, i16* @{{.+}},
+// CHECK: [[EXPR:%.+]] = zext i16 [[USV]] to i64
+// CHECK: [[X:%.+]] = load atomic i64, i64* [[LX_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[DESIRED:%.+]] = srem i64 [[EXPECTED]], [[EXPR]]
+// CHECK: store i64 [[DESIRED]], i64* [[TEMP:%.+]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* [[LX_ADDR]], i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  lx = lx % usv;
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}}
+// CHECK: atomicrmw or i32* @{{.+}}, i32 [[EXPR]] seq_cst
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic seq_cst, update
+  uix = iv | uix;
+// CHECK: [[EXPR:%.+]] = load i32, i32* @{{.+}}
+// CHECK: atomicrmw and i32* @{{.+}}, i32 [[EXPR]] monotonic
+#pragma omp atomic
+  ix = ix & uiv;
+// CHECK: [[EXPR:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load i32, i32* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load i32, i32* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store i32 %{{.+}}, i32* [[X_RE_ADDR]]
+// CHECK: store i32 %{{.+}}, i32* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { i32, i32 }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 0, i32 0)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  cix = lv + cix;
+// CHECK: [[ULV:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[EXPR:%.+]] = uitofp i64 [[ULV]] to float
+// CHECK: [[OLD:%.+]] = load atomic i32, i32*  bitcast (float* [[X_ADDR:@.+]] to i32*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[OLD]], %{{.+}} ], [ [[PREV:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[TEMP:%.+]] to i32*
+// CHECK: [[OLD:%.+]] = bitcast i32 [[EXPECTED]] to float
+// CHECK: [[MUL:%.+]] = fmul float [[OLD]], [[EXPR]]
+// CHECK: store float [[MUL]], float* [[TEMP]],
+// CHECK: [[DESIRED:%.+]] = load i32, i32* [[BITCAST]],
+// CHECK: [[RES:%.+]] = cmpxchg i32* bitcast (float* [[X_ADDR]] to i32*), i32 [[EXPECTED]], i32 [[DESIRED]] monotonic monotonic
+// CHECK: [[PREV:%.+]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  fx = fx * ulv;
+// CHECK: [[LLV:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[EXPR:%.+]] = sitofp i64 [[LLV]] to double
+// CHECK: [[OLD:%.+]] = load atomic i64, i64*  bitcast (double* [[X_ADDR:@.+]] to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[OLD]], %{{.+}} ], [ [[PREV:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast double* [[TEMP:%.+]] to i64*
+// CHECK: [[OLD:%.+]] = bitcast i64 [[EXPECTED]] to double
+// CHECK: [[DIV:%.+]] = fdiv double [[OLD]], [[EXPR]]
+// CHECK: store double [[DIV]], double* [[TEMP]],
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[BITCAST]],
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (double* [[X_ADDR]] to i64*), i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[PREV:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  dx /= llv;
+// CHECK: [[ULLV:%.+]] = load i64, i64* @{{.+}},
+// CHECK: [[EXPR:%.+]] = uitofp i64 [[ULLV]] to x86_fp80
+// CHECK: [[OLD:%.+]] = load atomic i128, i128*  bitcast (x86_fp80* [[X_ADDR:@.+]] to i128*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i128 [ [[OLD]], %{{.+}} ], [ [[PREV:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST1:%.+]] = bitcast x86_fp80* [[TEMP1:%.+]] to i128*
+// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP:%.+]] to i128*
+// CHECK: store i128 [[EXPECTED]], i128* [[BITCAST]]
+// CHECK: [[OLD:%.+]] = load x86_fp80, x86_fp80* [[TEMP]]
+// CHECK: [[SUB:%.+]] = fsub x86_fp80 [[OLD]], [[EXPR]]
+// CHECK: store x86_fp80 [[SUB]], x86_fp80* [[TEMP1]]
+// CHECK: [[DESIRED:%.+]] = load i128, i128* [[BITCAST1]]
+// CHECK: [[RES:%.+]] = cmpxchg i128* bitcast (x86_fp80* [[X_ADDR]] to i128*), i128 [[EXPECTED]], i128 [[DESIRED]] monotonic monotonic
+// CHECK: [[PREV:%.+]] = extractvalue { i128, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i128, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  ldx -= ullv;
+// CHECK: [[EXPR:%.+]] = load float, float* @{{.+}},
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR:@.+]] to i8*), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load i32, i32* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[EXPECTED_ADDR]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load i32, i32* [[X_IM_ADDR]]
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR:%.+]], i32 0, i32 0
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[DESIRED_ADDR]], i32 0, i32 1
+// CHECK: store i32 %{{.+}}, i32* [[X_RE_ADDR]]
+// CHECK: store i32 %{{.+}}, i32* [[X_IM_ADDR]]
+// CHECK: [[EXPECTED:%.+]] = bitcast { i32, i32 }* [[EXPECTED_ADDR]] to i8*
+// CHECK: [[DESIRED:%.+]] = bitcast { i32, i32 }* [[DESIRED_ADDR]] to i8*
+// CHECK: [[SUCCESS_FAIL:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 8, i8* bitcast ({ i32, i32 }* [[X_ADDR]] to i8*), i8* [[EXPECTED]], i8* [[DESIRED]], i32 0, i32 0)
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  cix = fv / cix;
+// CHECK: [[EXPR:%.+]] = load double, double* @{{.+}},
+// CHECK: [[X:%.+]] = load atomic i16, i16* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i16 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[CONV:%.+]] = sext i16 [[EXPECTED]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CONV]] to double
+// CHECK: [[ADD:%.+]] = fadd double [[X_RVAL]], [[EXPR]]
+// CHECK: [[DESIRED:%.+]] = fptosi double [[ADD]] to i16
+// CHECK: store i16 [[DESIRED]], i16* [[TEMP:%.+]]
+// CHECK: [[DESIRED:%.+]] = load i16, i16* [[TEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i16* [[X_ADDR]], i16 [[EXPECTED]], i16 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i16, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i16, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  sx = sx + dv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}},
+// CHECK: [[XI8:%.+]] = load atomic i8, i8* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i8 [ [[XI8]], %{{.+}} ], [ [[OLD_XI8:%.+]], %[[CONT]] ]
+// CHECK: [[BOOL_EXPECTED:%.+]] = trunc i8 [[EXPECTED]] to i1
+// CHECK: [[CONV:%.+]] = zext i1 [[BOOL_EXPECTED]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CONV]] to x86_fp80
+// CHECK: [[MUL:%.+]] = fmul x86_fp80 [[EXPR]], [[X_RVAL]]
+// CHECK: [[BOOL_DESIRED:%.+]] = fcmp une x86_fp80 [[MUL]], 0xK00000000000000000000
+// CHECK: [[DESIRED:%.+]] = zext i1 [[BOOL_DESIRED]] to i8
+// CHECK: store i8 [[DESIRED]], i8* [[TEMP:%.+]]
+// CHECK: [[DESIRED:%.+]] = load i8, i8* [[TEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* [[X_ADDR]], i8 [[EXPECTED]], i8 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_XI8:%.+]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  bx = ldv * bx;
+// CHECK: [[EXPR_RE:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* [[CIV_ADDR:@.+]], i32 0, i32 0),
+// CHECK: [[EXPR_IM:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* [[CIV_ADDR]], i32 0, i32 1),
+// CHECK: [[XI8:%.+]] = load atomic i8, i8* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i8 [ [[XI8]], %{{.+}} ], [ [[OLD_XI8:%.+]], %[[CONT]] ]
+// CHECK: [[BOOL_EXPECTED:%.+]] = trunc i8 [[EXPECTED]] to i1
+// CHECK: [[X_RVAL:%.+]] = zext i1 [[BOOL_EXPECTED]] to i32
+// CHECK: [[SUB_RE:%.+]] = sub i32 [[EXPR_RE:%.+]], [[X_RVAL]]
+// CHECK: [[SUB_IM:%.+]] = sub i32 [[EXPR_IM:%.+]], 0
+// CHECK: icmp ne i32 [[SUB_RE]], 0
+// CHECK: icmp ne i32 [[SUB_IM]], 0
+// CHECK: [[BOOL_DESIRED:%.+]] = or i1
+// CHECK: [[DESIRED:%.+]] = zext i1 [[BOOL_DESIRED]] to i8
+// CHECK: store i8 [[DESIRED]], i8* [[TEMP:%.+]]
+// CHECK: [[DESIRED:%.+]] = load i8, i8* [[TEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* [[X_ADDR]], i8 [[EXPECTED]], i8 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_XI8:%.+]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  bx = civ - bx;
+// CHECK: [[EXPR_RE:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[X:%.+]] = load atomic i16, i16* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i16 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[CONV:%.+]] = zext i16 [[EXPECTED]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CONV]] to float
+// <Skip checks for complex calculations>
+// CHECK: [[X_RE_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[TEMP:%.+]], i32 0, i32 0
+// CHECK: [[X_RE:%.+]] = load float, float* [[X_RE_ADDR]]
+// CHECK: [[X_IM_ADDR:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[TEMP]], i32 0, i32 1
+// CHECK: [[X_IM:%.+]] = load float, float* [[X_IM_ADDR]]
+// CHECK: [[DESIRED:%.+]] = fptoui float [[X_RE]] to i16
+// CHECK: store i16 [[DESIRED]], i16* [[TEMP:%.+]]
+// CHECK: [[DESIRED:%.+]] = load i16, i16* [[TEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i16* [[X_ADDR]], i16 [[EXPECTED]], i16 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i16, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i16, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  usx /= cfv;
+// CHECK: [[EXPR_RE:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 0)
+// CHECK: [[EXPR_IM:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 1)
+// CHECK: [[X:%.+]] = load atomic i64, i64* [[X_ADDR:@.+]] monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[EXPECTED:%.+]] = phi i64 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ]
+// CHECK: [[X_RVAL:%.+]] = sitofp i64 [[EXPECTED]] to double
+// CHECK: [[ADD_RE:%.+]] = fadd double [[X_RVAL]], [[EXPR_RE]]
+// CHECK: [[ADD_IM:%.+]] = fadd double 0.000000e+00, [[EXPR_IM]]
+// CHECK: [[DESIRED:%.+]] = fptosi double [[ADD_RE]] to i64
+// CHECK: store i64 [[DESIRED]], i64* [[TEMP:%.+]]
+// CHECK: [[DESIRED:%.+]] = load i64, i64* [[TEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* [[X_ADDR]], i64 [[EXPECTED]], i64 [[DESIRED]] monotonic monotonic
+// CHECK: [[OLD_X]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  llx += cdv;
+// CHECK: [[IDX:%.+]] = load i16, i16* @{{.+}}
+// CHECK: load i8, i8*
+// CHECK: [[VEC_ITEM_VAL:%.+]] = zext i1 %{{.+}} to i32
+// CHECK: [[I128VAL:%.+]] = load atomic i128, i128* bitcast (<4 x i32>* [[DEST:@.+]] to i128*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_I128:%.+]] = phi i128 [ [[I128VAL]], %{{.+}} ], [ [[FAILED_I128_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast <4 x i32>* [[TEMP:%.+]] to i128*
+// CHECK: store i128 [[OLD_I128]], i128* [[BITCAST]],
+// CHECK: [[OLD_VEC_VAL:%.+]] = bitcast i128 [[OLD_I128]] to <4 x i32>
+// CHECK: store <4 x i32> [[OLD_VEC_VAL]], <4 x i32>* [[LDTEMP:%.+]],
+// CHECK: [[VEC_VAL:%.+]] = load <4 x i32>, <4 x i32>* [[LDTEMP]]
+// CHECK: [[ITEM:%.+]] = extractelement <4 x i32> [[VEC_VAL]], i16 [[IDX]]
+// CHECK: [[OR:%.+]] = or i32 [[ITEM]], [[VEC_ITEM_VAL]]
+// CHECK: [[VEC_VAL:%.+]] = load <4 x i32>, <4 x i32>* [[TEMP]]
+// CHECK: [[NEW_VEC_VAL:%.+]] = insertelement <4 x i32> [[VEC_VAL]], i32 [[OR]], i16 [[IDX]]
+// CHECK: store <4 x i32> [[NEW_VEC_VAL]], <4 x i32>* [[TEMP]]
+// CHECK: [[NEW_I128:%.+]] = load i128, i128* [[BITCAST]]
+// CHECK: [[RES:%.+]] = cmpxchg i128* bitcast (<4 x i32>* [[DEST]] to i128*), i128 [[OLD_I128]], i128 [[NEW_I128]] monotonic monotonic
+// CHECK: [[FAILED_I128_OLD_VAL:%.+]] = extractvalue { i128, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  int4x[sv] |= bv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* bitcast (i8* getelementptr (i8, i8* bitcast (%struct.BitFields* @{{.+}} to i8*), i64 4) to i32*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP1:%.+]],
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i32 [[A_LD]], 1
+// CHECK: [[A_ASHR:%.+]] = ashr i32 [[A_SHL]], 1
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[A_ASHR]] to x86_fp80
+// CHECK: [[SUB:%.+]] = fsub x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[SUB]] to i32
+// CHECK: [[NEW_VAL:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[BF_VALUE:%.+]] = and i32 [[CONV]], 2147483647
+// CHECK: [[BF_CLEAR:%.+]] = and i32 [[NEW_VAL]], -2147483648
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i32* bitcast (i8* getelementptr (i8, i8* bitcast (%struct.BitFields* @{{.+}} to i8*), i64 4) to i32*), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  bfx.a = bfx.a - ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[LDTEMP:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 4, i8* getelementptr (i8, i8* bitcast (%struct.BitFields_packed* @{{.+}} to i8*), i64 4), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[PREV_VALUE:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: store i32 [[PREV_VALUE]], i32* [[TEMP1:%.+]],
+// CHECK: [[PREV_VALUE:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: store i32 [[PREV_VALUE]], i32* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i32 [[A_LD]], 1
+// CHECK: [[A_ASHR:%.+]] = ashr i32 [[A_SHL]], 1
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[A_ASHR]] to x86_fp80
+// CHECK: [[MUL:%.+]] = fmul x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[MUL]] to i32
+// CHECK: [[NEW_VAL:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[BF_VALUE:%.+]] = and i32 [[CONV]], 2147483647
+// CHECK: [[BF_CLEAR:%.+]] = and i32 [[NEW_VAL]], -2147483648
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[TEMP1]]
+// CHECK: [[BITCAST_TEMP_OLD_BF_ADDR:%.+]] = bitcast i32* [[LDTEMP]] to i8*
+// CHECK: [[BITCAST_TEMP_NEW_BF_ADDR:%.+]] = bitcast i32* [[TEMP1]] to i8*
+// CHECK: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 4, i8* getelementptr (i8, i8* bitcast (%struct.BitFields_packed* @{{.+}} to i8*), i64 4), i8* [[BITCAST_TEMP_OLD_BF_ADDR]], i8* [[BITCAST_TEMP_NEW_BF_ADDR]], i32 0, i32 0)
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  bfx_packed.a *= ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* getelementptr inbounds (%struct.BitFields2, %struct.BitFields2* @{{.+}}, i32 0, i32 0) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP1:%.+]],
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[A_ASHR:%.+]] = ashr i32 [[A_LD]], 31
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[A_ASHR]] to x86_fp80
+// CHECK: [[SUB:%.+]] = fsub x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[SUB]] to i32
+// CHECK: [[NEW_VAL:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i32 [[CONV]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i32 [[BF_AND]], 31
+// CHECK: [[BF_CLEAR:%.+]] = and i32 [[NEW_VAL]], 2147483647
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i32* getelementptr inbounds (%struct.BitFields2, %struct.BitFields2* @{{.+}}, i32 0, i32 0), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  bfx2.a -= ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr (i8, i8* bitcast (%struct.BitFields2_packed* @{{.+}} to i8*), i64 3) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST1:%.+]] = bitcast i32* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST1]],
+// CHECK: [[BITCAST:%.+]] = bitcast i32* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST]],
+// CHECK: [[A_LD:%.+]] = load i8, i8* [[BITCAST]],
+// CHECK: [[A_ASHR:%.+]] = ashr i8 [[A_LD]], 7
+// CHECK: [[CAST:%.+]] = sext i8 [[A_ASHR]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CAST]] to x86_fp80
+// CHECK: [[DIV:%.+]] = fdiv x86_fp80 [[EXPR]], [[X_RVAL]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[DIV]] to i32
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[NEW_VAL]] to i8
+// CHECK: [[BF_LD:%.+]] = load i8, i8* [[BITCAST1]],
+// CHECK: [[BF_AND:%.+]] = and i8 [[TRUNC]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i8 [[BF_AND]], 7
+// CHECK: [[BF_CLEAR:%.+]] = and i8 %{{.+}}, 127
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[BITCAST1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[BITCAST1]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr (i8, i8* bitcast (%struct.BitFields2_packed* @{{.+}} to i8*), i64 3), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  bfx2_packed.a = ldv / bfx2_packed.a;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* getelementptr inbounds (%struct.BitFields3, %struct.BitFields3* @{{.+}}, i32 0, i32 0) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP1:%.+]],
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i32, i32* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i32 [[A_LD]], 7
+// CHECK: [[A_ASHR:%.+]] = ashr i32 [[A_SHL]], 18
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[A_ASHR]] to x86_fp80
+// CHECK: [[DIV:%.+]] = fdiv x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[DIV]] to i32
+// CHECK: [[BF_LD:%.+]] = load i32, i32* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i32 [[NEW_VAL]], 16383
+// CHECK: [[BF_VALUE:%.+]] = shl i32 [[BF_AND]], 11
+// CHECK: [[BF_CLEAR:%.+]] = and i32 %{{.+}}, -33552385
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i32* getelementptr inbounds (%struct.BitFields3, %struct.BitFields3* @{{.+}}, i32 0, i32 0), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  bfx3.a /= ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[LDTEMP:%.+]] = bitcast i32* %{{.+}} to i24*
+// CHECK: [[BITCAST:%.+]] = bitcast i24* %{{.+}} to i8*
+// CHECK: call void @__atomic_load(i64 3, i8* getelementptr (i8, i8* bitcast (%struct.BitFields3_packed* @{{.+}} to i8*), i64 1), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[PREV_VALUE:%.+]] = load i24, i24* [[LDTEMP]]
+// CHECK: store i24 [[PREV_VALUE]], i24* [[TEMP1:%.+]],
+// CHECK: [[PREV_VALUE:%.+]] = load i24, i24* [[LDTEMP]]
+// CHECK: store i24 [[PREV_VALUE]], i24* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i24, i24* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i24 [[A_LD]], 7
+// CHECK: [[A_ASHR:%.+]] = ashr i24 [[A_SHL]], 10
+// CHECK: [[CAST:%.+]] = sext i24 [[A_ASHR]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CAST]] to x86_fp80
+// CHECK: [[ADD:%.+]] = fadd x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[ADD]] to i32
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[NEW_VAL]] to i24
+// CHECK: [[BF_LD:%.+]] = load i24, i24* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i24 [[TRUNC]], 16383
+// CHECK: [[BF_VALUE:%.+]] = shl i24 [[BF_AND]], 3
+// CHECK: [[BF_CLEAR:%.+]] = and i24 [[BF_LD]], -131065
+// CHECK: or i24 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i24 %{{.+}}, i24* [[TEMP1]]
+// CHECK: [[BITCAST_TEMP_OLD_BF_ADDR:%.+]] = bitcast i24* [[LDTEMP]] to i8*
+// CHECK: [[BITCAST_TEMP_NEW_BF_ADDR:%.+]] = bitcast i24* [[TEMP1]] to i8*
+// CHECK: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 3, i8* getelementptr (i8, i8* bitcast (%struct.BitFields3_packed* @{{.+}} to i8*), i64 1), i8* [[BITCAST_TEMP_OLD_BF_ADDR]], i8* [[BITCAST_TEMP_NEW_BF_ADDR]], i32 0, i32 0)
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  bfx3_packed.a += ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i64, i64* bitcast (%struct.BitFields4* @{{.+}} to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i64 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i64 [[OLD_BF_VALUE]], i64* [[TEMP1:%.+]],
+// CHECK: store i64 [[OLD_BF_VALUE]], i64* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i64 [[A_LD]], 47
+// CHECK: [[A_ASHR:%.+]] = ashr i64 [[A_SHL:%.+]], 63
+// CHECK: [[A_CAST:%.+]] = trunc i64 [[A_ASHR:%.+]] to i32
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[CAST:%.+]] to x86_fp80
+// CHECK: [[MUL:%.+]] = fmul x86_fp80 [[X_RVAL]], [[EXPR]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[MUL]] to i32
+// CHECK: [[ZEXT:%.+]] = zext i32 [[NEW_VAL]] to i64
+// CHECK: [[BF_LD:%.+]] = load i64, i64* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i64 [[ZEXT]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i64 [[BF_AND]], 16
+// CHECK: [[BF_CLEAR:%.+]] = and i64 [[BF_LD]], -65537
+// CHECK: or i64 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i64 %{{.+}}, i64* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i64, i64* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (%struct.BitFields4* @{{.+}} to i64*), i64 [[OLD_BF_VALUE]], i64 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  bfx4.a = bfx4.a * ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %{{.+}} ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST1:%.+]] = bitcast i32* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST1]],
+// CHECK: [[BITCAST:%.+]] = bitcast i32* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST]],
+// CHECK: [[A_LD:%.+]] = load i8, i8* [[BITCAST]],
+// CHECK: [[A_SHL:%.+]] = shl i8 [[A_LD]], 7
+// CHECK: [[A_ASHR:%.+]] = ashr i8 [[A_SHL:%.+]], 7
+// CHECK: [[CAST:%.+]] = sext i8 [[A_ASHR:%.+]] to i32
+// CHECK: [[CONV:%.+]] = sitofp i32 [[CAST]] to x86_fp80
+// CHECK: [[SUB: %.+]] = fsub x86_fp80 [[CONV]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[SUB:%.+]] to i32
+// CHECK: [[NEW_VAL:%.+]] = trunc i32 [[CONV]] to i8
+// CHECK: [[BF_LD:%.+]] = load i8, i8* [[BITCAST1]],
+// CHECK: [[BF_VALUE:%.+]] = and i8 [[NEW_VAL]], 1
+// CHECK: [[BF_CLEAR:%.+]] = and i8 [[BF_LD]], -2
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[BITCAST1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[BITCAST1]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  bfx4_packed.a -= ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i64, i64* bitcast (%struct.BitFields4* @{{.+}} to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i64 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: store i64 [[OLD_BF_VALUE]], i64* [[TEMP1:%.+]],
+// CHECK: store i64 [[OLD_BF_VALUE]], i64* [[TEMP:%.+]],
+// CHECK: [[A_LD:%.+]] = load i64, i64* [[TEMP]],
+// CHECK: [[A_SHL:%.+]] = shl i64 [[A_LD]], 40
+// CHECK: [[A_ASHR:%.+]] = ashr i64 [[A_SHL:%.+]], 57
+// CHECK: [[CONV:%.+]] = sitofp i64 [[A_ASHR]] to x86_fp80
+// CHECK: [[DIV:%.+]] = fdiv x86_fp80 [[CONV]], [[EXPR]]
+// CHECK: [[CONV:%.+]] = fptosi x86_fp80 [[DIV]] to i64
+// CHECK: [[BF_LD:%.+]] = load i64, i64* [[TEMP1]],
+// CHECK: [[BF_AND:%.+]] = and i64 [[CONV]], 127
+// CHECK: [[BF_VALUE:%.+]] = shl i64 [[BF_AND:%.+]], 17
+// CHECK: [[BF_CLEAR:%.+]] = and i64 [[BF_LD]], -16646145
+// CHECK: [[VAL:%.+]] = or i64 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i64 [[VAL]], i64* [[TEMP1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i64, i64* [[TEMP1]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (%struct.BitFields4* @{{.+}} to i64*), i64 [[OLD_BF_VALUE]], i64 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  bfx4.b /= ldv;
+// CHECK: [[EXPR:%.+]] = load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST1:%.+]] = bitcast i64* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST1]],
+// CHECK: [[BITCAST:%.+]] = bitcast i64* %{{.+}} to i8*
+// CHECK: store i8 [[OLD_BF_VALUE]], i8* [[BITCAST]],
+// CHECK: [[A_LD:%.+]] = load i8, i8* [[BITCAST]],
+// CHECK: [[A_ASHR:%.+]] = ashr i8 [[A_LD]], 1
+// CHECK: [[CAST:%.+]] = sext i8 [[A_ASHR]] to i64
+// CHECK: [[CONV:%.+]] = sitofp i64 [[CAST]] to x86_fp80
+// CHECK: [[ADD:%.+]] = fadd x86_fp80 [[CONV]], [[EXPR]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 [[ADD]] to i64
+// CHECK: [[TRUNC:%.+]] = trunc i64 [[NEW_VAL]] to i8
+// CHECK: [[BF_LD:%.+]] = load i8, i8* [[BITCAST1]],
+// CHECK: [[BF_AND:%.+]] = and i8 [[TRUNC]], 127
+// CHECK: [[BF_VALUE:%.+]] = shl i8 [[BF_AND]], 1
+// CHECK: [[BF_CLEAR:%.+]] = and i8 [[BF_LD]], 1
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[BITCAST1]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[BITCAST1]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic update
+  bfx4_packed.b += ldv;
+// CHECK: load i64, i64*
+// CHECK: [[EXPR:%.+]] = uitofp i64 %{{.+}} to float
+// CHECK: [[I64VAL:%.+]] = load atomic i64, i64* bitcast (<2 x float>* [[DEST:@.+]] to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_I64:%.+]] = phi i64 [ [[I64VAL]], %{{.+}} ], [ [[FAILED_I64_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast <2 x float>* [[TEMP:%.+]] to i64*
+// CHECK: store i64 [[OLD_I64]], i64* [[BITCAST]],
+// CHECK: [[OLD_VEC_VAL:%.+]] = bitcast i64 [[OLD_I64]] to <2 x float>
+// CHECK: store <2 x float> [[OLD_VEC_VAL]], <2 x float>* [[LDTEMP:%.+]],
+// CHECK: [[VEC_VAL:%.+]] = load <2 x float>, <2 x float>* [[LDTEMP]]
+// CHECK: [[X:%.+]] = extractelement <2 x float> [[VEC_VAL]], i64 0
+// CHECK: [[VEC_ITEM_VAL:%.+]] = fsub float [[EXPR]], [[X]]
+// CHECK: [[VEC_VAL:%.+]] = load <2 x float>, <2 x float>* [[TEMP]],
+// CHECK: [[NEW_VEC_VAL:%.+]] = insertelement <2 x float> [[VEC_VAL]], float [[VEC_ITEM_VAL]], i64 0
+// CHECK: store <2 x float> [[NEW_VEC_VAL]], <2 x float>* [[TEMP]]
+// CHECK: [[NEW_I64:%.+]] = load i64, i64* [[BITCAST]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (<2 x float>* [[DEST]] to i64*), i64 [[OLD_I64]], i64 [[NEW_I64]] monotonic monotonic
+// CHECK: [[FAILED_I64_OLD_VAL:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic
+  float2x.x = ulv - float2x.x;
+// CHECK: [[EXPR:%.+]] = load double, double* @{{.+}},
+// CHECK: [[OLD_VAL:%.+]] = call i32 @llvm.read_register.i32([[REG:metadata ![0-9]+]])
+// CHECK: [[X_RVAL:%.+]] = sitofp i32 [[OLD_VAL]] to double
+// CHECK: [[DIV:%.+]] = fdiv double [[EXPR]], [[X_RVAL]]
+// CHECK: [[NEW_VAL:%.+]] = fptosi double [[DIV]] to i32
+// CHECK: call void @llvm.write_register.i32([[REG]], i32 [[NEW_VAL]])
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic seq_cst
+  rix = dv / rix;
+  return 0;
+}
+
+#endif
diff --git a/test/OpenMP/atomic_write_codegen.c b/test/OpenMP/atomic_write_codegen.c
new file mode 100644
index 000000000000..0016dc86c36a
--- /dev/null
+++ b/test/OpenMP/atomic_write_codegen.c
@@ -0,0 +1,509 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+_Bool bv, bx;
+char cv, cx;
+unsigned char ucv, ucx;
+short sv, sx;
+unsigned short usv, usx;
+int iv, ix;
+unsigned int uiv, uix;
+long lv, lx;
+unsigned long ulv, ulx;
+long long llv, llx;
+unsigned long long ullv, ullx;
+float fv, fx;
+double dv, dx;
+long double ldv, ldx;
+_Complex int civ, cix;
+_Complex float cfv, cfx;
+_Complex double cdv, cdx;
+
+typedef int int4 __attribute__((__vector_size__(16)));
+int4 int4x;
+
+struct BitFields {
+  int : 32;
+  int a : 31;
+} bfx;
+
+struct BitFields_packed {
+  int : 32;
+  int a : 31;
+} __attribute__ ((__packed__)) bfx_packed;
+
+struct BitFields2 {
+  int : 31;
+  int a : 1;
+} bfx2;
+
+struct BitFields2_packed {
+  int : 31;
+  int a : 1;
+} __attribute__ ((__packed__)) bfx2_packed;
+
+struct BitFields3 {
+  int : 11;
+  int a : 14;
+} bfx3;
+
+struct BitFields3_packed {
+  int : 11;
+  int a : 14;
+} __attribute__ ((__packed__)) bfx3_packed;
+
+struct BitFields4 {
+  short : 16;
+  int a: 1;
+  long b : 7;
+} bfx4;
+
+struct BitFields4_packed {
+  short : 16;
+  int a: 1;
+  long b : 7;
+} __attribute__ ((__packed__)) bfx4_packed;
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+float2 float2x;
+
+register int rix __asm__("0");
+
+int main() {
+// CHECK: load i8, i8*
+// CHECK: store atomic i8
+#pragma omp atomic write
+  bx = bv;
+// CHECK: load i8, i8*
+// CHECK: store atomic i8
+#pragma omp atomic write
+  cx = cv;
+// CHECK: load i8, i8*
+// CHECK: store atomic i8
+#pragma omp atomic write
+  ucx = ucv;
+// CHECK: load i16, i16*
+// CHECK: store atomic i16
+#pragma omp atomic write
+  sx = sv;
+// CHECK: load i16, i16*
+// CHECK: store atomic i16
+#pragma omp atomic write
+  usx = usv;
+// CHECK: load i32, i32*
+// CHECK: store atomic i32
+#pragma omp atomic write
+  ix = iv;
+// CHECK: load i32, i32*
+// CHECK: store atomic i32
+#pragma omp atomic write
+  uix = uiv;
+// CHECK: load i64, i64*
+// CHECK: store atomic i64
+#pragma omp atomic write
+  lx = lv;
+// CHECK: load i64, i64*
+// CHECK: store atomic i64
+#pragma omp atomic write
+  ulx = ulv;
+// CHECK: load i64, i64*
+// CHECK: store atomic i64
+#pragma omp atomic write
+  llx = llv;
+// CHECK: load i64, i64*
+// CHECK: store atomic i64
+#pragma omp atomic write
+  ullx = ullv;
+// CHECK: load float, float*
+// CHECK: bitcast float {{.*}} to i32
+// CHECK: store atomic i32 {{.*}}, i32* bitcast (float*
+#pragma omp atomic write
+  fx = fv;
+// CHECK: load double, double*
+// CHECK: bitcast double {{.*}} to i64
+// CHECK: store atomic i64 {{.*}}, i64* bitcast (double*
+#pragma omp atomic write
+  dx = dv;
+// CHECK: [[LD:%.+]] = load x86_fp80, x86_fp80*
+// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[LDTEMP:%.*]] to i8*
+// CHECK: call void @llvm.memset.p0i8.i64(i8* [[BITCAST]], i8 0, i64 16, i32 16, i1 false)
+// CHECK: store x86_fp80 [[LD]], x86_fp80* [[LDTEMP]]
+// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[LDTEMP:%.*]] to i128*
+// CHECK: [[LD:%.+]] = load i128, i128* [[BITCAST]]
+// CHECK: store atomic i128 [[LD]], i128* bitcast (x86_fp80*
+#pragma omp atomic write
+  ldx = ldv;
+// CHECK: [[REAL_VAL:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.*}}, i32 0, i32 0)
+// CHECK: [[IMG_VAL:%.+]] = load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.*}}, i32 0, i32 1)
+// CHECK: [[TEMP_REAL_REF:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[TEMP:%.+]], i32 0, i32 0
+// CHECK: [[TEMP_IMG_REF:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[TEMP]], i32 0, i32 1
+// CHECK: store i32 [[REAL_VAL]], i32* [[TEMP_REAL_REF]]
+// CHECK: store i32 [[IMG_VAL]], i32* [[TEMP_IMG_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[TEMP]] to i8*
+// CHECK: call void @__atomic_store(i64 8, i8* bitcast ({ i32, i32 }* @{{.*}} to i8*), i8* [[BITCAST]], i32 0)
+#pragma omp atomic write
+  cix = civ;
+// CHECK: [[REAL_VAL:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.*}}, i32 0, i32 0)
+// CHECK: [[IMG_VAL:%.+]] = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.*}}, i32 0, i32 1)
+// CHECK: [[TEMP_REAL_REF:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[TEMP:%.+]], i32 0, i32 0
+// CHECK: [[TEMP_IMG_REF:%.+]] = getelementptr inbounds { float, float }, { float, float }* [[TEMP]], i32 0, i32 1
+// CHECK: store float [[REAL_VAL]], float* [[TEMP_REAL_REF]]
+// CHECK: store float [[IMG_VAL]], float* [[TEMP_IMG_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast { float, float }* [[TEMP]] to i8*
+// CHECK: call void @__atomic_store(i64 8, i8* bitcast ({ float, float }* @{{.*}} to i8*), i8* [[BITCAST]], i32 0)
+#pragma omp atomic write
+  cfx = cfv;
+// CHECK: [[REAL_VAL:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.*}}, i32 0, i32 0)
+// CHECK: [[IMG_VAL:%.+]] = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.*}}, i32 0, i32 1)
+// CHECK: [[TEMP_REAL_REF:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[TEMP:%.+]], i32 0, i32 0
+// CHECK: [[TEMP_IMG_REF:%.+]] = getelementptr inbounds { double, double }, { double, double }* [[TEMP]], i32 0, i32 1
+// CHECK: store double [[REAL_VAL]], double* [[TEMP_REAL_REF]]
+// CHECK: store double [[IMG_VAL]], double* [[TEMP_IMG_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast { double, double }* [[TEMP]] to i8*
+// CHECK: call void @__atomic_store(i64 16, i8* bitcast ({ double, double }* @{{.*}} to i8*), i8* [[BITCAST]], i32 5)
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic seq_cst write
+  cdx = cdv;
+// CHECK: load i8, i8*
+// CHECK: store atomic i64
+#pragma omp atomic write
+  ulx = bv;
+// CHECK: load i8, i8*
+// CHECK: store atomic i8
+#pragma omp atomic write
+  bx = cv;
+// CHECK: load i8, i8*
+// CHECK: store atomic i8
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic write, seq_cst
+  cx = ucv;
+// CHECK: load i16, i16*
+// CHECK: store atomic i64
+#pragma omp atomic write
+  ulx = sv;
+// CHECK: load i16, i16*
+// CHECK: store atomic i64
+#pragma omp atomic write
+  lx = usv;
+// CHECK: load i32, i32*
+// CHECK: store atomic i32
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic seq_cst, write
+  uix = iv;
+// CHECK: load i32, i32*
+// CHECK: store atomic i32
+#pragma omp atomic write
+  ix = uiv;
+// CHECK: load i64, i64*
+// CHECK: [[VAL:%.+]] = trunc i64 %{{.*}} to i32
+// CHECK: [[TEMP_REAL_REF:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[TEMP:%.+]], i32 0, i32 0
+// CHECK: [[TEMP_IMG_REF:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[TEMP]], i32 0, i32 1
+// CHECK: store i32 [[VAL]], i32* [[TEMP_REAL_REF]]
+// CHECK: store i32 0, i32* [[TEMP_IMG_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[TEMP]] to i8*
+// CHECK: call void @__atomic_store(i64 8, i8* bitcast ({ i32, i32 }* @{{.+}} to i8*), i8* [[BITCAST]], i32 0)
+#pragma omp atomic write
+  cix = lv;
+// CHECK: load i64, i64*
+// CHECK: store atomic i32 %{{.+}}, i32* bitcast (float*
+#pragma omp atomic write
+  fx = ulv;
+// CHECK: load i64, i64*
+// CHECK: store atomic i64 %{{.+}}, i64* bitcast (double*
+#pragma omp atomic write
+  dx = llv;
+// CHECK: load i64, i64*
+// CHECK: [[VAL:%.+]] = uitofp i64 %{{.+}} to x86_fp80
+// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP:%.+]] to i8*
+// CHECK: call void @llvm.memset.p0i8.i64(i8* [[BITCAST]], i8 0, i64 16, i32 16, i1 false)
+// CHECK: store x86_fp80 [[VAL]], x86_fp80* [[TEMP]]
+// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP]] to i128*
+// CHECK: [[VAL:%.+]] = load i128, i128* [[BITCAST]]
+// CHECK: store atomic i128 [[VAL]], i128* bitcast (x86_fp80*
+#pragma omp atomic write
+  ldx = ullv;
+// CHECK: load float, float*
+// CHECK: [[VAL:%.+]] = fptosi float %{{.*}} to i32
+// CHECK: [[TEMP_REAL_REF:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[TEMP:%.+]], i32 0, i32 0
+// CHECK: [[TEMP_IMG_REF:%.+]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[TEMP]], i32 0, i32 1
+// CHECK: store i32 [[VAL]], i32* [[TEMP_REAL_REF]]
+// CHECK: store i32 0, i32* [[TEMP_IMG_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast { i32, i32 }* [[TEMP]] to i8*
+// CHECK: call void @__atomic_store(i64 8, i8* bitcast ({ i32, i32 }* @{{.+}} to i8*), i8* [[BITCAST]], i32 0)
+#pragma omp atomic write
+  cix = fv;
+// CHECK: load double, double*
+// CHECK: store atomic i16
+#pragma omp atomic write
+  sx = dv;
+// CHECK: load x86_fp80, x86_fp80*
+// CHECK: store atomic i8
+#pragma omp atomic write
+  bx = ldv;
+// CHECK: load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 0)
+// CHECK: load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @{{.+}}, i32 0, i32 1)
+// CHECK: icmp ne i32 %{{.+}}, 0
+// CHECK: icmp ne i32 %{{.+}}, 0
+// CHECK: or i1
+// CHECK: store atomic i8
+#pragma omp atomic write
+  bx = civ;
+// CHECK: load float, float* getelementptr inbounds ({ float, float }, { float, float }* @{{.*}}, i32 0, i32 0)
+// CHECK: store atomic i16
+#pragma omp atomic write
+  usx = cfv;
+// CHECK: load double, double* getelementptr inbounds ({ double, double }, { double, double }* @{{.+}}, i32 0, i32 0)
+// CHECK: store atomic i64
+#pragma omp atomic write
+  llx = cdv;
+// CHECK-DAG: [[IDX:%.+]] = load i16, i16* @{{.+}}
+// CHECK-DAG: load i8, i8*
+// CHECK-DAG: [[VEC_ITEM_VAL:%.+]] = zext i1 %{{.+}} to i32
+// CHECK: [[I128VAL:%.+]] = load atomic i128, i128* bitcast (<4 x i32>* [[DEST:@.+]] to i128*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_I128:%.+]] = phi i128 [ [[I128VAL]], %{{.+}} ], [ [[FAILED_I128_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast <4 x i32>* [[LDTEMP:%.+]] to i128*
+// CHECK: store i128 [[OLD_I128]], i128* [[BITCAST]],
+// CHECK: [[VEC_VAL:%.+]] = load <4 x i32>, <4 x i32>* [[LDTEMP]]
+// CHECK: [[NEW_VEC_VAL:%.+]] = insertelement <4 x i32> [[VEC_VAL]], i32 [[VEC_ITEM_VAL]], i16 [[IDX]]
+// CHECK: store <4 x i32> [[NEW_VEC_VAL]], <4 x i32>* [[LDTEMP]]
+// CHECK: [[NEW_I128:%.+]] = load i128, i128* [[BITCAST]]
+// CHECK: [[RES:%.+]] = cmpxchg i128* bitcast (<4 x i32>* [[DEST]] to i128*), i128 [[OLD_I128]], i128 [[NEW_I128]] monotonic monotonic
+// CHECK: [[FAILED_I128_OLD_VAL:%.+]] = extractvalue { i128, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  int4x[sv] = bv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i32
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* bitcast (i8* getelementptr (i8, i8* bitcast (%struct.BitFields* @{{.+}} to i8*), i64 4) to i32*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BF_VALUE:%.+]] = and i32 [[NEW_VAL]], 2147483647
+// CHECK: [[BF_CLEAR:%.+]] = and i32 %{{.+}}, -2147483648
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[LDTEMP:%.+]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i32* bitcast (i8* getelementptr (i8, i8* bitcast (%struct.BitFields* @{{.+}} to i8*), i64 4) to i32*), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx.a = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i32
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[LDTEMP:%.+]] to i8*
+// CHECK: call void @__atomic_load(i64 4, i8* getelementptr (i8, i8* bitcast (%struct.BitFields_packed* @{{.+}} to i8*), i64 4), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = load i32, i32* [[LDTEMP]],
+// CHECK: store i32 [[OLD_BF_VALUE]], i32* [[LDTEMP1:%.+]],
+// CHECK: [[OLD_BF_VALUE:%.+]] = load i32, i32* [[LDTEMP1]],
+// CHECK: [[BF_VALUE:%.+]] = and i32 [[NEW_VAL]], 2147483647
+// CHECK: [[BF_CLEAR:%.+]] = and i32 [[OLD_BF_VALUE]], -2147483648
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[LDTEMP1]]
+// CHECK: [[BITCAST_TEMP_OLD_BF_ADDR:%.+]] = bitcast i32* [[LDTEMP]] to i8*
+// CHECK: [[BITCAST_TEMP_NEW_BF_ADDR:%.+]] = bitcast i32* [[LDTEMP1]] to i8*
+// CHECK: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 4, i8* getelementptr (i8, i8* bitcast (%struct.BitFields_packed* @{{.+}} to i8*), i64 4), i8* [[BITCAST_TEMP_OLD_BF_ADDR]], i8* [[BITCAST_TEMP_NEW_BF_ADDR]], i32 0, i32 0)
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx_packed.a = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i32
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* getelementptr inbounds (%struct.BitFields2, %struct.BitFields2* @{{.+}}, i32 0, i32 0) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BF_AND:%.+]] = and i32 [[NEW_VAL]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i32 [[BF_AND]], 31
+// CHECK: [[BF_CLEAR:%.+]] = and i32 %{{.+}}, 2147483647
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[LDTEMP:%.+]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i32* getelementptr inbounds (%struct.BitFields2, %struct.BitFields2* @{{.+}}, i32 0, i32 0), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx2.a = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i32
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr (i8, i8* bitcast (%struct.BitFields2_packed* @{{.+}} to i8*), i64 3) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[NEW_VAL]] to i8
+// CHECK: [[BF_AND:%.+]] = and i8 [[TRUNC]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i8 [[BF_AND]], 7
+// CHECK: [[BF_CLEAR:%.+]] = and i8 %{{.+}}, 127
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[LDTEMP:%.+]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[LDTEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr (i8, i8* bitcast (%struct.BitFields2_packed* @{{.+}} to i8*), i64 3), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx2_packed.a = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i32
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i32, i32* getelementptr inbounds (%struct.BitFields3, %struct.BitFields3* @{{.+}}, i32 0, i32 0) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i32 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BF_AND:%.+]] = and i32 [[NEW_VAL]], 16383
+// CHECK: [[BF_VALUE:%.+]] = shl i32 [[BF_AND]], 11
+// CHECK: [[BF_CLEAR:%.+]] = and i32 %{{.+}}, -33552385
+// CHECK: or i32 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i32 %{{.+}}, i32* [[LDTEMP:%.+]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i32, i32* [[LDTEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i32* getelementptr inbounds (%struct.BitFields3, %struct.BitFields3* @{{.+}}, i32 0, i32 0), i32 [[OLD_BF_VALUE]], i32 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx3.a = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i32
+// CHECK: [[LDTEMP:%.+]] = bitcast i32* %{{.+}} to i24*
+// CHECK: [[BITCAST:%.+]] = bitcast i24* %{{.+}} to i8*
+// CHECK: call void @__atomic_load(i64 3, i8* getelementptr (i8, i8* bitcast (%struct.BitFields3_packed* @{{.+}} to i8*), i64 1), i8* [[BITCAST]], i32 0)
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_VAL:%.+]] = load i24, i24* %{{.+}},
+// CHECK: store i24 [[OLD_VAL]], i24* [[TEMP:%.+]],
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[NEW_VAL]] to i24
+// CHECK: [[BF_AND:%.+]] = and i24 [[TRUNC]], 16383
+// CHECK: [[BF_VALUE:%.+]] = shl i24 [[BF_AND]], 3
+// CHECK: [[BF_CLEAR:%.+]] = and i24 %{{.+}}, -131065
+// CHECK: or i24 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i24 %{{.+}}, i24* [[TEMP]]
+// CHECK: [[BITCAST_TEMP_OLD_BF_ADDR:%.+]] = bitcast i24* [[LDTEMP]] to i8*
+// CHECK: [[BITCAST_TEMP_NEW_BF_ADDR:%.+]] = bitcast i24* [[TEMP]] to i8*
+// CHECK: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i64 3, i8* getelementptr (i8, i8* bitcast (%struct.BitFields3_packed* @{{.+}} to i8*), i64 1), i8* [[BITCAST_TEMP_OLD_BF_ADDR]], i8* [[BITCAST_TEMP_NEW_BF_ADDR]], i32 0, i32 0)
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx3_packed.a = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i32
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i64, i64* bitcast (%struct.BitFields4* @{{.+}} to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i64 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[ZEXT:%.+]] = zext i32 [[NEW_VAL]] to i64
+// CHECK: [[BF_AND:%.+]] = and i64 [[ZEXT]], 1
+// CHECK: [[BF_VALUE:%.+]] = shl i64 [[BF_AND]], 16
+// CHECK: [[BF_CLEAR:%.+]] = and i64 %{{.+}}, -65537
+// CHECK: or i64 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i64 %{{.+}}, i64* [[LDTEMP:%.+]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i64, i64* [[LDTEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (%struct.BitFields4* @{{.+}} to i64*), i64 [[OLD_BF_VALUE]], i64 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx4.a = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i32
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[NEW_VAL]] to i8
+// CHECK: [[BF_VALUE:%.+]] = and i8 [[TRUNC]], 1
+// CHECK: [[BF_CLEAR:%.+]] = and i8 %{{.+}}, -2
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[LDTEMP:%.+]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[LDTEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx4_packed.a = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i64
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i64, i64* bitcast (%struct.BitFields4* @{{.+}} to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i64 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BF_AND:%.+]] = and i64 [[NEW_VAL]], 127
+// CHECK: [[BF_VALUE:%.+]] = shl i64 [[BF_AND]], 17
+// CHECK: [[BF_CLEAR:%.+]] = and i64 %{{.+}}, -16646145
+// CHECK: or i64 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i64 %{{.+}}, i64* [[LDTEMP:%.+]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i64, i64* [[LDTEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (%struct.BitFields4* @{{.+}} to i64*), i64 [[OLD_BF_VALUE]], i64 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx4.b = ldv;
+// CHECK: load x86_fp80, x86_fp80* @{{.+}}
+// CHECK: [[NEW_VAL:%.+]] = fptosi x86_fp80 %{{.+}} to i64
+// CHECK: [[PREV_VALUE:%.+]] = load atomic i8, i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_BF_VALUE:%.+]] = phi i8 [ [[PREV_VALUE]], %[[EXIT]] ], [ [[FAILED_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[TRUNC:%.+]] = trunc i64 [[NEW_VAL]] to i8
+// CHECK: [[BF_AND:%.+]] = and i8 [[TRUNC]], 127
+// CHECK: [[BF_VALUE:%.+]] = shl i8 [[BF_AND]], 1
+// CHECK: [[BF_CLEAR:%.+]] = and i8 %{{.+}}, 1
+// CHECK: or i8 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK: store i8 %{{.+}}, i8* [[LDTEMP:%.+]]
+// CHECK: [[NEW_BF_VALUE:%.+]] = load i8, i8* [[LDTEMP]]
+// CHECK: [[RES:%.+]] = cmpxchg i8* getelementptr inbounds (%struct.BitFields4_packed, %struct.BitFields4_packed* @{{.+}}, i32 0, i32 0, i64 2), i8 [[OLD_BF_VALUE]], i8 [[NEW_BF_VALUE]] monotonic monotonic
+// CHECK: [[FAILED_OLD_VAL]] = extractvalue { i8, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i8, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  bfx4_packed.b = ldv;
+// CHECK: load i64, i64*
+// CHECK: [[VEC_ITEM_VAL:%.+]] = uitofp i64 %{{.+}} to float
+// CHECK: [[I64VAL:%.+]] = load atomic i64, i64* bitcast (<2 x float>* [[DEST:@.+]] to i64*) monotonic
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[OLD_I64:%.+]] = phi i64 [ [[I64VAL]], %{{.+}} ], [ [[FAILED_I64_OLD_VAL:%.+]], %[[CONT]] ]
+// CHECK: [[BITCAST:%.+]] = bitcast <2 x float>* [[LDTEMP:%.+]] to i64*
+// CHECK: store i64 [[OLD_I64]], i64* [[BITCAST]],
+// CHECK: [[VEC_VAL:%.+]] = load <2 x float>, <2 x float>* [[LDTEMP]]
+// CHECK: [[NEW_VEC_VAL:%.+]] = insertelement <2 x float> [[VEC_VAL]], float [[VEC_ITEM_VAL]], i64 0
+// CHECK: store <2 x float> [[NEW_VEC_VAL]], <2 x float>* [[LDTEMP]]
+// CHECK: [[NEW_I64:%.+]] = load i64, i64* [[BITCAST]]
+// CHECK: [[RES:%.+]] = cmpxchg i64* bitcast (<2 x float>* [[DEST]] to i64*), i64 [[OLD_I64]], i64 [[NEW_I64]] monotonic monotonic
+// CHECK: [[FAILED_I64_OLD_VAL:%.+]] = extractvalue { i64, i1 } [[RES]], 0
+// CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i64, i1 } [[RES]], 1
+// CHECK: br i1 [[FAIL_SUCCESS]], label %[[EXIT:.+]], label %[[CONT]]
+// CHECK: [[EXIT]]
+#pragma omp atomic write
+  float2x.x = ulv;
+// CHECK: call i32 @llvm.read_register.i32(
+// CHECK: sitofp i32 %{{.+}} to double
+// CHECK: bitcast double %{{.+}} to i64
+// CHECK: store atomic i64 %{{.+}}, i64* bitcast (double* @{{.+}} to i64*) seq_cst
+// CHECK: call{{.*}} @__kmpc_flush(
+#pragma omp atomic write seq_cst
+  dv = rix;
+  return 0;
+}
+
+#endif
diff --git a/test/OpenMP/barrier_ast_print.cpp b/test/OpenMP/barrier_ast_print.cpp
index 0ce344f6d992..fbb478bfb6b7 100644
--- a/test/OpenMP/barrier_ast_print.cpp
+++ b/test/OpenMP/barrier_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/barrier_codegen.cpp b/test/OpenMP/barrier_codegen.cpp
index 2e817c138c3c..12d4c449ff71 100644
--- a/test/OpenMP/barrier_codegen.cpp
+++ b/test/OpenMP/barrier_codegen.cpp
@@ -1,14 +1,14 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
 #define HEADER
 
 // CHECK: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
-// CHECK-DAG: [[EXPLICIT_BARRIER_LOC:@.+]] = {{.+}} [[IDENT_T]] { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8]* @{{.+}}, i32 0, i32 0) }
-// CHECK-DAG: [[LOC:@.+]] = {{.+}} [[IDENT_T]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8]* @{{.+}}, i32 0, i32 0) }
+// CHECK-DAG: [[EXPLICIT_BARRIER_LOC:@.+]] = {{.+}} [[IDENT_T]] { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @{{.+}}, i32 0, i32 0) }
+// CHECK-DAG: [[LOC:@.+]] = {{.+}} [[IDENT_T]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @{{.+}}, i32 0, i32 0) }
 
 void foo() {}
 
diff --git a/test/OpenMP/barrier_messages.cpp b/test/OpenMP/barrier_messages.cpp
index 81ff84eab746..4dc6480a57e2 100644
--- a/test/OpenMP/barrier_messages.cpp
+++ b/test/OpenMP/barrier_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 template <class T>
 T tmain(T argc) {
diff --git a/test/OpenMP/critical_ast_print.cpp b/test/OpenMP/critical_ast_print.cpp
index 98ece88e0f71..87a1fe0bcee7 100644
--- a/test/OpenMP/critical_ast_print.cpp
+++ b/test/OpenMP/critical_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/critical_codegen.cpp b/test/OpenMP/critical_codegen.cpp
index dda532ca369b..d350d6e3aaea 100644
--- a/test/OpenMP/critical_codegen.cpp
+++ b/test/OpenMP/critical_codegen.cpp
@@ -1,6 +1,7 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -15,6 +16,7 @@
 void foo() {}
 
 // CHECK-LABEL: @main
+// TERM_DEBUG-LABEL: @main
 int main() {
 // CHECK:       [[A_ADDR:%.+]] = alloca i8
   char a;
@@ -26,8 +28,8 @@ int main() {
 #pragma omp critical
   a = 2;
 // CHECK:       call void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
-// CHECK-NEXT:  call void [[FOO]]()
-// CHECK-NEXT:  call void @__kmpc_end_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
+// CHECK-NEXT:  invoke void [[FOO]]()
+// CHECK:       call void @__kmpc_end_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
 #pragma omp critical(the_name)
   foo();
 // CHECK-NOT:   call void @__kmpc_critical
@@ -35,4 +37,22 @@ int main() {
   return a;
 }
 
+// CHECK-LABEL:      parallel_critical
+// TERM_DEBUG-LABEL: parallel_critical
+void parallel_critical() {
+#pragma omp parallel
+#pragma omp critical
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_critical({{.+}}), !dbg [[DBG_LOC_START:![0-9]+]]
+  // TERM_DEBUG:     invoke void {{.*}}foo{{.*}}()
+  // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_end_critical({{.+}}), !dbg [[DBG_LOC_END:![0-9]+]]
+  // TERM_DEBUG:     [[TERM_LPAD]]
+  // TERM_DEBUG:     call void @__clang_call_terminate
+  // TERM_DEBUG:     unreachable
+  foo();
+}
+// TERM_DEBUG-DAG: [[DBG_LOC_START]] = !DILocation(line: [[@LINE-12]],
+// TERM_DEBUG-DAG: [[DBG_LOC_END]] = !DILocation(line: [[@LINE-3]],
 #endif
diff --git a/test/OpenMP/critical_messages.cpp b/test/OpenMP/critical_messages.cpp
index 08df9e0666e4..e3f2ad70fe98 100644
--- a/test/OpenMP/critical_messages.cpp
+++ b/test/OpenMP/critical_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 int foo();
 
diff --git a/test/OpenMP/flush_ast_print.cpp b/test/OpenMP/flush_ast_print.cpp
index 7eb18f0c17f2..081c5345033d 100644
--- a/test/OpenMP/flush_ast_print.cpp
+++ b/test/OpenMP/flush_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/flush_codegen.cpp b/test/OpenMP/flush_codegen.cpp
index eb9c721e3508..2c41b3a2ed6d 100644
--- a/test/OpenMP/flush_codegen.cpp
+++ b/test/OpenMP/flush_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -19,16 +19,16 @@ int main() {
   static int a;
 #pragma omp flush
 #pragma omp flush(a)
-  // CHECK: call void (%{{.+}}*, ...)* @__kmpc_flush(%{{.+}}* {{(@|%).+}}, i32 0)
-  // CHECK: call void (%{{.+}}*, ...)* @__kmpc_flush(%{{.+}}* {{(@|%).+}}, i32 0)
+  // CHECK: call void @__kmpc_flush(%{{.+}}* {{(@|%).+}})
+  // CHECK: call void @__kmpc_flush(%{{.+}}* {{(@|%).+}})
   return tmain(a);
   // CHECK: call {{.*}} [[TMAIN:@.+]](
   // CHECK: ret
 }
 
 // CHECK: [[TMAIN]]
-// CHECK: call void (%{{.+}}*, ...)* @__kmpc_flush(%{{.+}}* {{(@|%).+}}, i32 0)
-// CHECK: call void (%{{.+}}*, ...)* @__kmpc_flush(%{{.+}}* {{(@|%).+}}, i32 0)
+// CHECK: call void @__kmpc_flush(%{{.+}}* {{(@|%).+}})
+// CHECK: call void @__kmpc_flush(%{{.+}}* {{(@|%).+}})
 // CHECK: ret
 
 #endif
diff --git a/test/OpenMP/flush_messages.cpp b/test/OpenMP/flush_messages.cpp
index 8c61680929ab..2f87a2938476 100644
--- a/test/OpenMP/flush_messages.cpp
+++ b/test/OpenMP/flush_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 struct S1 { // expected-note 2 {{declared here}}
   int a;
diff --git a/test/OpenMP/for_ast_print.cpp b/test/OpenMP/for_ast_print.cpp
index 8802237b2574..c482ec56b9ca 100644
--- a/test/OpenMP/for_ast_print.cpp
+++ b/test/OpenMP/for_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/for_codegen.cpp b/test/OpenMP/for_codegen.cpp
index 757de65df46a..5dcacb254d69 100644
--- a/test/OpenMP/for_codegen.cpp
+++ b/test/OpenMP/for_codegen.cpp
@@ -1,49 +1,56 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
 //
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
 
 // CHECK: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[I:@.+]] = global i8 1,
+// CHECK-DAG: [[J:@.+]] = global i8 2,
+// CHECK-DAG: [[K:@.+]] = global i8 3,
+
 // CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void without_schedule_clause(float *a, float *b, float *c, float *d) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
-  #pragma omp for
+  #pragma omp for nowait
 // CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
 // UB = min(UB, GlobalUB)
-// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
 // CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423
 // CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
 // CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
 // CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
-// CHECK-NEXT: [[LB:%.+]] = load i32* [[OMP_LB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
 // CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
 // Loop header
-// CHECK: [[IV:%.+]] = load i32* [[OMP_IV]]
-// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
 // CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
 // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
   for (int i = 33; i < 32000000; i += 7) {
 // CHECK: [[LOOP1_BODY]]
 // Start of body: calculate i from IV:
-// CHECK: [[IV1_1:%.+]] = load i32* [[OMP_IV]]
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
 // CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
 // CHECK-NEXT: [[CALC_I_2:%.+]] = add nsw i32 33, [[CALC_I_1]]
 // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
 // ... loop body ...
 // End of body: store into a[i]:
 // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
     a[i] = b[i] * c[i] * d[i];
-// CHECK: [[IV1_2:%.+]] = load i32* [[OMP_IV]]{{.*}}
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
 // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
 // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
 // CHECK-NEXT: br label %{{.+}}
   }
 // CHECK: [[LOOP1_END]]
 // CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
-// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK-NOT: __kmpc_cancel_barrier
 // CHECK: ret void
 }
 
@@ -53,39 +60,355 @@ void static_not_chunked(float *a, float *b, float *c, float *d) {
   #pragma omp for schedule(static)
 // CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
 // UB = min(UB, GlobalUB)
-// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
 // CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423
 // CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
 // CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
 // CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
-// CHECK-NEXT: [[LB:%.+]] = load i32* [[OMP_LB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
 // CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
 // Loop header
-// CHECK: [[IV:%.+]] = load i32* [[OMP_IV]]
-// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
 // CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
 // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
   for (int i = 32000000; i > 33; i += -7) {
 // CHECK: [[LOOP1_BODY]]
 // Start of body: calculate i from IV:
-// CHECK: [[IV1_1:%.+]] = load i32* [[OMP_IV]]
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
 // CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
 // CHECK-NEXT: [[CALC_I_2:%.+]] = sub nsw i32 32000000, [[CALC_I_1]]
 // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
 // ... loop body ...
 // End of body: store into a[i]:
 // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
     a[i] = b[i] * c[i] * d[i];
-// CHECK: [[IV1_2:%.+]] = load i32* [[OMP_IV]]{{.*}}
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
 // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
 // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
 // CHECK-NEXT: br label %{{.+}}
   }
 // CHECK: [[LOOP1_END]]
 // CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
-// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_chunked(float *a, float *b, float *c, float *d) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(static, 5)
+// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
+// UB = min(UB, GlobalUB)
+// CHECK: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288
+// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
+// CHECK: [[UBRESULT:%.+]] = phi i32 [ 16908288, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
+// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+
+// Outer loop header
+// CHECK: [[O_IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[O_UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ule i32 [[O_IV]], [[O_UB]]
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned i = 131071; i <= 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i32 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// Update the counters, adding stride
+// CHECK:  [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]]
+// CHECK-NEXT: [[ADD_LB:%.+]] = add i32 [[LB]], [[ST]]
+// CHECK-NEXT: store i32 [[ADD_LB]], i32* [[OMP_LB]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]]
+// CHECK-NEXT: [[ADD_UB:%.+]] = add i32 [[UB]], [[ST]]
+// CHECK-NEXT: store i32 [[ADD_UB]], i32* [[OMP_UB]]
+
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void dynamic1(float *a, float *b, float *c, float *d) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(dynamic)
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 35, i64 0, i64 16908287, i64 1, i64 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}!llvm.mem.parallel_loop_access
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}guided7{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void guided7(float *a, float *b, float *c, float *d) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(guided, 7)
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 36, i64 0, i64 16908287, i64 1, i64 7)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}!llvm.mem.parallel_loop_access
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}test_auto{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void test_auto(float *a, float *b, float *c, float *d) {
+  unsigned int x = 0;
+  unsigned int y = 0;
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(auto) collapse(2)
+// CHECK: call void @__kmpc_dispatch_init_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 38, i64 0, i64 [[LAST_ITER:%[^,]+]], i64 1, i64 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+// FIXME: When the iteration count of some nested loop is not a known constant,
+// we should pre-calculate it, like we do for the total number of iterations!
+  for (char i = static_cast<char>(y); i <= '9'; ++i)
+    for (x = 11; x > 0; --x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}runtime{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void runtime(float *a, float *b, float *c, float *d) {
+  int x = 0;
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for collapse(2) schedule(runtime)
+// CHECK: call void @__kmpc_dispatch_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 37, i32 0, i32 199, i32 1, i32 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned char i = '0' ; i <= '9'; ++i)
+    for (x = -10; x < 10; ++x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
 // CHECK: ret void
 }
 
+// CHECK-LABEL: test_precond
+void test_precond() {
+  // CHECK: [[A_ADDR:%.+]] = alloca i8,
+  // CHECK: [[I_ADDR:%.+]] = alloca i8,
+  char a = 0;
+  // CHECK: store i32 0, i32* [[IV_ADDR:%.+]],
+  // CHECK: [[A:%.+]] = load i8, i8* [[A_ADDR]],
+  // CHECK: [[CONV:%.+]] = sext i8 [[A]] to i32
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_ADDR]],
+  // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[CONV]], [[MUL]]
+  // CHECK: [[CONV:%.+]] = trunc i32 [[ADD]] to i8
+  // CHECK: store i8 [[CONV]], i8* [[I_ADDR]],
+  // CHECK: [[A:%.+]] = load i8, i8* [[A_ADDR]],
+  // CHECK: [[CONV:%.+]] = sext i8 [[A]] to i32
+  // CHECK: [[CMP:%.+]] = icmp slt i32 [[CONV]], 10
+  // CHECK: br i1 [[CMP]], label %[[PRECOND_THEN:[^,]+]], label %[[PRECOND_END:[^,]+]]
+  // CHECK: [[PRECOND_THEN]]
+  // CHECK: call void @__kmpc_for_static_init_4
+#pragma omp for
+  for(char i = a; i < 10; ++i);
+  // CHECK: call void @__kmpc_for_static_fini
+  // CHECK: [[PRECOND_END]]
+}
+
+// TERM_DEBUG-LABEL: foo
+int foo() {return 0;};
+
+// TERM_DEBUG-LABEL: parallel_for
+void parallel_for(float *a) {
+#pragma omp parallel
+#pragma omp for schedule(static, 5)
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_for_static_init_4u({{.+}}), !dbg [[DBG_LOC_START:![0-9]+]]
+  // TERM_DEBUG:     invoke i32 {{.*}}foo{{.*}}()
+  // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_for_static_fini({{.+}}), !dbg [[DBG_LOC_END:![0-9]+]]
+  // TERM_DEBUG:     call {{.+}} @__kmpc_cancel_barrier({{.+}}), !dbg [[DBG_LOC_CANCEL:![0-9]+]]
+  // TERM_DEBUG:     [[TERM_LPAD]]
+  // TERM_DEBUG:     call void @__clang_call_terminate
+  // TERM_DEBUG:     unreachable
+  for (unsigned i = 131071; i <= 2147483647; i += 127)
+    a[i] += foo();
+}
+// Check source line corresponds to "#pragma omp for schedule(static, 5)" above:
+// TERM_DEBUG-DAG: [[DBG_LOC_START]] = !DILocation(line: [[@LINE-15]],
+// TERM_DEBUG-DAG: [[DBG_LOC_END]] = !DILocation(line: [[@LINE-16]],
+// TERM_DEBUG-DAG: [[DBG_LOC_CANCEL]] = !DILocation(line: [[@LINE-17]],
+
+char i = 1, j = 2, k = 3;
+// CHECK-LABEL: for_with_global_lcv
+void for_with_global_lcv() {
+// CHECK: [[I_ADDR:%.+]] = alloca i8,
+// CHECK: [[J_ADDR:%.+]] = alloca i8,
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-NOT: [[I]]
+// CHECK: store i8 %{{.+}}, i8* [[I_ADDR]]
+// CHECK-NOT: [[I]]
+// CHECK: [[I_VAL:%.+]] = load i8, i8* [[I_ADDR]],
+// CHECK-NOT: [[I]]
+// CHECK: store i8 [[I_VAL]], i8* [[K]]
+// CHECK-NOT: [[I]]
+// CHECK: call void @__kmpc_for_static_fini(
+#pragma omp for
+  for (i = 0; i < 2; ++i) {
+    k = i;
+  }
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-NOT: [[J]]
+// CHECK: store i8 %{{.+}}, i8* [[J_ADDR]]
+// CHECK-NOT: [[J]]
+// CHECK: [[J_VAL:%.+]] = load i8, i8* [[J_ADDR]],
+// CHECK-NOT: [[J]]
+// CHECK: store i8 [[J_VAL]], i8* [[K]]
+// CHECK-NOT: [[J]]
+// CHECK: call void @__kmpc_for_static_fini(
+#pragma omp for collapse(2)
+  for (int i = 0; i < 2; ++i)
+  for (j = 0; j < 2; ++j) {
+    k = i;
+    k = j;
+  }
+}
+
 #endif // HEADER
 
diff --git a/test/OpenMP/for_collapse_messages.cpp b/test/OpenMP/for_collapse_messages.cpp
index 9e14234e0723..0a7443319e8c 100644
--- a/test/OpenMP/for_collapse_messages.cpp
+++ b/test/OpenMP/for_collapse_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/for_firstprivate_codegen.cpp b/test/OpenMP/for_firstprivate_codegen.cpp
new file mode 100644
index 000000000000..2402b501283c
--- /dev/null
+++ b/test/OpenMP/for_firstprivate_codegen.cpp
@@ -0,0 +1,280 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {}
+  ~St() {}
+};
+
+volatile int g = 1212;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp parallel
+#pragma omp for firstprivate(t_var, vec, s_arr, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+// CHECK: [[TEST:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> test;
+// CHECK-DAG: [[T_VAR:@.+]] = global i{{[0-9]+}} 333,
+int t_var = 333;
+// CHECK-DAG: [[VEC:@.+]] = global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+int vec[] = {1, 2};
+// CHECK-DAG: [[S_ARR:@.+]] = global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+S<float> s_arr[] = {1, 2};
+// CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> var(3);
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: ([[S_FLOAT_TY]]*)* [[S_FLOAT_TY_DESTR:@[^ ]+]] {{[^,]+}}, {{.+}}([[S_FLOAT_TY]]* [[TEST]]
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+// LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+// LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp for firstprivate(g)
+  for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // Skip temp vars for loop
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G]]
+    // LAMBDA: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // LAMBDA: call i32 @__kmpc_cancel_barrier(
+    g = 1;
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: call i32 @__kmpc_cancel_barrier(
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+// BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+// BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp for firstprivate(g)
+  for (int i = 0; i < 2; ++i) {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // Skip temp vars for loop
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // BLOCKS: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G]]
+    // BLOCKS: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS: call i32 @__kmpc_cancel_barrier(
+    g = 1;
+    // BLOCKS: call void @__kmpc_for_static_init_4(
+    // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_for_static_fini(
+    // BLOCKS: call i32 @__kmpc_cancel_barrier(
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp for firstprivate(t_var, vec, s_arr, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: alloca i{{[0-9]+}},
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
+
+// firstprivate t_var(t_var)
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// firstprivate vec(vec)
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+
+// firstprivate s_arr(s_arr)
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: getelementptr inbounds ([2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+
+// firstprivate var(var)
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+
+// Synchronization for initialization.
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// ~(firstprivate var), ~(firstprivate s_arr)
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// firstprivate t_var(t_var)
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// firstprivate vec(vec)
+// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+
+// firstprivate s_arr(s_arr)
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+
+// firstprivate var(var)
+// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_REF_PTR]],
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+
+// Synchronization for initialization.
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// ~(firstprivate var), ~(firstprivate s_arr)
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+#endif
+
diff --git a/test/OpenMP/for_firstprivate_messages.cpp b/test/OpenMP/for_firstprivate_messages.cpp
index 6aa977b65d92..2ec81104e771 100644
--- a/test/OpenMP/for_firstprivate_messages.cpp
+++ b/test/OpenMP/for_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -26,8 +26,8 @@ class S3 {
   S3 &operator=(const S3 &s3);
 
 public:
-  S3() : a(0) {} // expected-note {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
-  S3(S3 &s3) : a(s3.a) {} // expected-note {{candidate constructor not viable: 1st argument ('const S3') would lose const qualifier}}
+  S3() : a(0) {} // expected-note 2 {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
+  S3(S3 &s3) : a(s3.a) {} // expected-note 2 {{candidate constructor not viable: 1st argument ('const S3') would lose const qualifier}}
 };
 const S3 c;
 const S3 ca[5];
@@ -152,6 +152,21 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp for firstprivate(a)
+  for (int i = 0; i < 2; ++i)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -194,7 +209,7 @@ int main(int argc, char **argv) {
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{no matching constructor for initialization of 'const S3'}}
+#pragma omp for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{no matching constructor for initialization of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
@@ -210,7 +225,7 @@ int main(int argc, char **argv) {
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for firstprivate(ca) // OK
+#pragma omp for firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
@@ -288,6 +303,10 @@ int main(int argc, char **argv) {
 #pragma omp for firstprivate(i)       // expected-error {{firstprivate variable must be shared}}
   for (i = 0; i < argc; ++i)
     foo();
+#pragma omp parallel
+#pragma omp for firstprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
 
   return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
 }
diff --git a/test/OpenMP/for_lastprivate_codegen.cpp b/test/OpenMP/for_lastprivate_codegen.cpp
new file mode 100644
index 000000000000..90c40dde2e9a
--- /dev/null
+++ b/test/OpenMP/for_lastprivate_codegen.cpp
@@ -0,0 +1,468 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S<T> &operator=(const S<T> &);
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile int g = 1212;
+float f;
+char cnt;
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[CAP_MAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]* }
+// CHECK: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[X:@.+]] = global double 0.0
+// CHECK-DAG: [[F:@.+]] = global float 0.0
+// CHECK-DAG: [[CNT:@.+]] = global i8 0
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp parallel
+#pragma omp for lastprivate(t_var, vec, s_arr, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+namespace A {
+double x;
+}
+namespace B {
+using A::x;
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp for lastprivate(g)
+  for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    // LAMBDA: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %{{.+}}
+    // LAMBDA: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+    // LAMBDA: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+    // LAMBDA: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+    g = 1;
+    // Check for final copying of private values back to original vars.
+    // LAMBDA: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+    // LAMBDA: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+    // LAMBDA: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+    // LAMBDA: [[LAST_THEN]]
+    // Actual copying.
+
+    // original g=private_g;
+    // LAMBDA: [[G_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // LAMBDA: store volatile i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G]],
+    // LAMBDA: br label %[[LAST_DONE]]
+    // LAMBDA: [[LAST_DONE]]
+    // LAMBDA: call i32 @__kmpc_cancel_barrier(%{{.+}}* @{{.+}}, i{{[0-9]+}} [[GTID]])
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp for lastprivate(g)
+  for (int i = 0; i < 2; ++i) {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // BLOCKS: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    // BLOCKS: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %{{.+}}
+    // BLOCKS: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+    // BLOCKS: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+    // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+    g = 1;
+    // Check for final copying of private values back to original vars.
+    // BLOCKS: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+    // BLOCKS: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+    // BLOCKS: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+    // BLOCKS: [[LAST_THEN]]
+    // Actual copying.
+
+    // original g=private_g;
+    // BLOCKS: [[G_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // BLOCKS: store volatile i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G]],
+    // BLOCKS: br label %[[LAST_DONE]]
+    // BLOCKS: [[LAST_DONE]]
+    // BLOCKS: call i32 @__kmpc_cancel_barrier(%{{.+}}* @{{.+}}, i{{[0-9]+}} [[GTID]])
+    g = 1;
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+#pragma omp parallel
+#pragma omp for lastprivate(t_var, vec, s_arr, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+#pragma omp parallel
+#pragma omp for lastprivate(A::x, B::x) firstprivate(f) lastprivate(f)
+  for (int i = 0; i < 2; ++i) {
+    A::x++;
+  }
+#pragma omp parallel
+#pragma omp for firstprivate(f) lastprivate(f)
+  for (int i = 0; i < 2; ++i) {
+    A::x++;
+  }
+#pragma omp parallel
+#pragma omp for lastprivate(cnt)
+  for (cnt = 0; cnt < 2; ++cnt) {
+    A::x++;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, %{{.+}}*)* [[MAIN_MICROTASK1:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, %{{.+}}*)* [[MAIN_MICROTASK2:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, %{{.+}}*)* [[MAIN_MICROTASK3:@.+]] to void
+// CHECK: = call {{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_MAIN_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+
+// Check for default initialization.
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: [[S_ARR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_REF_PTR]],
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_REF_PTR]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 %{{.+}}, i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// <Skip loop body>
+// CHECK: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 %{{.+}})
+
+// Check for final copying of private values back to original vars.
+// CHECK: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+// CHECK: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+// CHECK: [[LAST_THEN]]
+// Actual copying.
+
+// original t_var=private_t_var;
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_REF]],
+
+// original vec[]=private_vec[];
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+
+// original s_arr[]=private_s_arr[];
+// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], [[S_ARR_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}})
+// CHECK: br i1 {{.+}}, label %[[S_ARR_BODY_DONE]], label %[[S_ARR_BODY]]
+// CHECK: [[S_ARR_BODY_DONE]]
+
+// original var=private_var;
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN:@.+]]([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* {{.*}} [[VAR_PRIV]])
+// CHECK: br label %[[LAST_DONE]]
+// CHECK: [[LAST_DONE]]
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+
+//
+// CHECK: define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: [[F_PRIV:%.+]] = alloca float,
+// CHECK-NOT: alloca float
+// CHECK: [[X_PRIV:%.+]] = alloca double,
+// CHECK-NOT: alloca float
+// CHECK-NOT: alloca double
+
+// Check for default initialization.
+// CHECK-NOT: [[X_PRIV]]
+// CHECK: [[F_VAL:%.+]] = load float, float* [[F]],
+// CHECK: store float [[F_VAL]], float* [[F_PRIV]],
+// CHECK-NOT: [[X_PRIV]]
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// <Skip loop body>
+// CHECK: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+
+// Check for final copying of private values back to original vars.
+// CHECK: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+// CHECK: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+// CHECK: [[LAST_THEN]]
+// Actual copying.
+
+// original x=private_x;
+// CHECK: [[X_VAL:%.+]] = load double, double* [[X_PRIV]],
+// CHECK: store double [[X_VAL]], double* [[X]],
+
+// original f=private_f;
+// CHECK: [[F_VAL:%.+]] = load float, float* [[F_PRIV]],
+// CHECK: store float [[F_VAL]], float* [[F]],
+
+// CHECK-NEXT: br label %[[LAST_DONE]]
+// CHECK: [[LAST_DONE]]
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK2]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: [[F_PRIV:%.+]] = alloca float,
+// CHECK-NOT: alloca float
+
+// Check for default initialization.
+// CHECK: [[F_VAL:%.+]] = load float, float* [[F]],
+// CHECK: store float [[F_VAL]], float* [[F_PRIV]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// <Skip loop body>
+// CHECK: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+
+// Check for final copying of private values back to original vars.
+// CHECK: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+// CHECK: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+// CHECK: [[LAST_THEN]]
+// Actual copying.
+
+// original f=private_f;
+// CHECK: [[F_VAL:%.+]] = load float, float* [[F_PRIV]],
+// CHECK: store float [[F_VAL]], float* [[F]],
+
+// CHECK-NEXT: br label %[[LAST_DONE]]
+// CHECK: [[LAST_DONE]]
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK3]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: [[CNT_PRIV:%.+]] = alloca i8,
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
+// UB = min(UB, GlobalUB)
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 1
+// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
+// CHECK: [[UBRESULT:%.+]] = phi i32 [ 1, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
+// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// <Skip loop body>
+// CHECK: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+
+// Check for final copying of private values back to original vars.
+// CHECK: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+// CHECK: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+// CHECK: [[LAST_THEN]]
+
+// Calculate last iter count
+// CHECK: store i32 1, i32* [[OMP_IV]]
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add nsw i32 [[IV1_1]], 1
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[OMP_IV]]
+// Actual copying.
+
+// original cnt=private_cnt;
+// Calculate private cnt value.
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[MUL:%.+]] = mul nsw i32 [[IV1_1]], 1
+// CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]]
+// CHECK: [[CONV:%.+]] = trunc i32 [[ADD]] to i8
+// CHECK: store i8 [[CONV]], i8* [[CNT_PRIV]]
+// CHECK: [[CNT_VAL:%.+]] = load i8, i8* [[CNT_PRIV]],
+// CHECK: store i8 [[CNT_VAL]], i8* [[CNT]],
+
+// CHECK-NEXT: br label %[[LAST_DONE]]
+// CHECK: [[LAST_DONE]]
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+
+// Check for default initialization.
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: [[S_ARR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_REF_PTR]],
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_REF_PTR]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 %{{.+}}, i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// <Skip loop body>
+// CHECK: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 %{{.+}})
+
+// Check for final copying of private values back to original vars.
+// CHECK: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+// CHECK: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+// CHECK: [[LAST_THEN]]
+// Actual copying.
+
+// original t_var=private_t_var;
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_REF]],
+
+// original vec[]=private_vec[];
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+
+// original s_arr[]=private_s_arr[];
+// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
+// CHECK: [[S_ARR_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_BEGIN]], [[S_ARR_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}})
+// CHECK: br i1 {{.+}}, label %[[S_ARR_BODY_DONE]], label %[[S_ARR_BODY]]
+// CHECK: [[S_ARR_BODY_DONE]]
+
+// original var=private_var;
+// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN:@.+]]([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* {{.*}} [[VAR_PRIV]])
+// CHECK: br label %[[LAST_DONE]]
+// CHECK: [[LAST_DONE]]
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+#endif
+
diff --git a/test/OpenMP/for_lastprivate_messages.cpp b/test/OpenMP/for_lastprivate_messages.cpp
index a3a1c4b34ed8..2fa14b63bd11 100644
--- a/test/OpenMP/for_lastprivate_messages.cpp
+++ b/test/OpenMP/for_lastprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -15,15 +15,17 @@ class S2 {
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
+  const S2 &operator =(const S2&) const;
+  S2 &operator =(const S2&);
   static float S2s; // expected-note {{static data member is predetermined as shared}}
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
 const S2 b;
 const S2 ba[5];
-class S3 { // expected-note 2 {{'S3' declared here}}
+class S3 {
   int a;
-  S3 &operator=(const S3 &s3);
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
 
 public:
   S3() : a(0) {}
@@ -32,17 +34,17 @@ public:
 const S3 c;         // expected-note {{global variable is predetermined as shared}}
 const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
 extern const int f; // expected-note {{global variable is predetermined as shared}}
-class S4 {          // expected-note 3 {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4();             // expected-note 3 {{implicitly declared private here}}
   S4(const S4 &s4);
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
 
 public:
   S5(const S5 &s5) : a(s5.a) {}
@@ -62,8 +64,8 @@ S3 h;
 
 template <class I, class C>
 int foomain(int argc, char **argv) {
-  I e(4); // expected-note {{'e' defined here}}
-  I g(5); // expected-note {{'g' defined here}}
+  I e(4);
+  I g(5);
   int i;
   int &j = i; // expected-note {{'j' defined here}}
 #pragma omp parallel
@@ -107,7 +109,7 @@ int foomain(int argc, char **argv) {
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp parallel
-#pragma omp for lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp parallel
@@ -140,12 +142,27 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp for lastprivate(a)
+  for (int i = 0; i < 2; ++i)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
-  S4 e(4);               // expected-note {{'e' defined here}}
-  S5 g(5);               // expected-note {{'g' defined here}}
-  S3 m;                  // expected-note 2 {{'m' defined here}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
   S6 n(2);
   int i;
   int &j = i; // expected-note {{'j' defined here}}
@@ -223,11 +240,11 @@ int main(int argc, char **argv) {
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
@@ -235,6 +252,10 @@ int main(int argc, char **argv) {
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
+#pragma omp for lastprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
 #pragma omp for private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
   for (i = 0; i < argc; ++i)
     foo();
@@ -255,7 +276,7 @@ int main(int argc, char **argv) {
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for firstprivate(m) lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp for firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
diff --git a/test/OpenMP/for_loop_messages.cpp b/test/OpenMP/for_loop_messages.cpp
index cb32484e43fb..f425defdf3e2 100644
--- a/test/OpenMP/for_loop_messages.cpp
+++ b/test/OpenMP/for_loop_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
 
 class S {
   int a;
@@ -10,15 +10,18 @@ public:
 };
 
 static int sii;
-#pragma omp threadprivate(sii) // expected-note {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
 static int globalii;
 
+register int reg0 __asm__("0");
+
 int test_iteration_spaces() {
   const int N = 100;
   float a[N], b[N], c[N];
   int ii, jj, kk;
   float fii;
   double dii;
+  register int reg; // expected-warning {{'register' storage class specifier is deprecated}}
 #pragma omp parallel
 #pragma omp for
   for (int i = 0; i < 10; i += 1) {
@@ -305,7 +308,6 @@ int test_iteration_spaces() {
 
 #pragma omp parallel
   {
-// expected-error@+2 {{loop iteration variable in the associated loop of 'omp for' directive may not be threadprivate or thread local, predetermined as private}}
 #pragma omp for
     for (sii = 0; sii < 10; sii += 1)
       c[sii] = a[sii];
@@ -313,7 +315,20 @@ int test_iteration_spaces() {
 
 #pragma omp parallel
   {
-// expected-error@+2 {{loop iteration variable in the associated loop of 'omp for' directive may not be a variable with global storage without being explicitly marked as private}}
+#pragma omp for
+    for (reg0 = 0; reg0 < 10; reg0 += 1)
+      c[reg0] = a[reg0];
+  }
+
+#pragma omp parallel
+  {
+#pragma omp for
+    for (reg = 0; reg < 10; reg += 1)
+      c[reg] = a[reg];
+  }
+
+#pragma omp parallel
+  {
 #pragma omp for
     for (globalii = 0; globalii < 10; globalii += 1)
       c[globalii] = a[globalii];
@@ -321,7 +336,6 @@ int test_iteration_spaces() {
 
 #pragma omp parallel
   {
-// expected-error@+3 {{loop iteration variable in the associated loop of 'omp for' directive may not be a variable with global storage without being explicitly marked as private}}
 #pragma omp for collapse(2)
     for (ii = 0; ii < 10; ii += 1)
     for (globalii = 0; globalii < 10; globalii += 1)
diff --git a/test/OpenMP/for_misc_messages.c b/test/OpenMP/for_misc_messages.c
index 8a721807d8dc..f5b87514947c 100644
--- a/test/OpenMP/for_misc_messages.c
+++ b/test/OpenMP/for_misc_messages.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -triple x86_64-unknown-unknown -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -triple x86_64-unknown-unknown -verify %s
 
 // expected-error@+1 {{unexpected OpenMP directive '#pragma omp for'}}
 #pragma omp for
diff --git a/test/OpenMP/for_private_codegen.cpp b/test/OpenMP/for_private_codegen.cpp
new file mode 100644
index 000000000000..8fc5fe7a18b0
--- /dev/null
+++ b/test/OpenMP/for_private_codegen.cpp
@@ -0,0 +1,188 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[CAP_MAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]* }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp parallel
+#pragma omp for private(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp for private(g)
+  for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+    // LAMBDA: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    g = 1;
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store double* [[G_PRIVATE_ADDR]], double** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call{{( x86_thiscallcc)?}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+      // LAMBDA: store volatile double 2.0{{.+}}, double* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* {{.+}})
+#pragma omp parallel
+#pragma omp for private(g)
+  for (int i = 0; i < 2; ++i) {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+    // BLOCKS: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    g = 1;
+    // BLOCKS: call void @__kmpc_for_static_init_4(
+    // BLOCKS: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: double* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_for_static_fini(
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile double 2.0{{.+}}, double*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+#pragma omp parallel
+#pragma omp for private(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  int i;
+#pragma omp parallel
+#pragma omp for private(i)
+  for (i = 0; i < 2; ++i) {
+    ;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_MAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK-NOT: alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret void
+#endif
+
diff --git a/test/OpenMP/for_private_messages.cpp b/test/OpenMP/for_private_messages.cpp
index 45c8683cfa8e..225a1beb99ea 100644
--- a/test/OpenMP/for_private_messages.cpp
+++ b/test/OpenMP/for_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -108,6 +108,21 @@ int foomain(I argc, C **argv) {
   return 0;
 }
 
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp for private(a)
+  for (int i = 0; i < 2; ++i)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
@@ -149,6 +164,9 @@ int main(int argc, char **argv) {
 #pragma omp for private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
   for (int k = 0; k < argc; ++k)
     ++k;
+#pragma omp for private(B::x) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
 #pragma omp for shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp for'}}
   for (int k = 0; k < argc; ++k)
     ++k;
diff --git a/test/OpenMP/for_reduction_codegen.cpp b/test/OpenMP/for_reduction_codegen.cpp
new file mode 100644
index 000000000000..6763686dd0fc
--- /dev/null
+++ b/test/OpenMP/for_reduction_codegen.cpp
@@ -0,0 +1,702 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+volatile double g;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  operator T() { return T(); }
+  S &operator&(const S &) { return *this; }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { float*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float*, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]* }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [[S_INT_TY]]*, [[S_INT_TY]]*, i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]* }
+// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
+
+template <typename T>
+T tmain() {
+  T t;
+  S<T> test;
+  T t_var = T(), t_var1;
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3), var1;
+#pragma omp parallel
+#pragma omp for reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1) nowait
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+#pragma omp parallel
+#pragma omp for reduction(&& : t_var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp for reduction(+:g)
+    for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* %{{.+}})
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+
+    // Reduction list for runtime.
+    // LAMBDA: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+    // LAMBDA: store double 0.0{{.+}}, double* [[G_PRIVATE_ADDR]]
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    g = 1;
+    // LAMBDA: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store double* [[G_PRIVATE_ADDR]], double** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_for_static_fini(
+
+    // LAMBDA: [[G_PRIV_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i32 0, i32 0
+    // LAMBDA: [[BITCAST:%.+]] = bitcast double* [[G_PRIVATE_ADDR]] to i8*
+    // LAMBDA: store i8* [[BITCAST]], i8** [[G_PRIV_REF]],
+    // LAMBDA: call i32 @__kmpc_reduce(
+    // LAMBDA: switch i32 %{{.+}}, label %[[REDUCTION_DONE:.+]] [
+    // LAMBDA: i32 1, label %[[CASE1:.+]]
+    // LAMBDA: i32 2, label %[[CASE2:.+]]
+    // LAMBDA: [[CASE1]]
+    // LAMBDA: [[G_VAL:%.+]] = load double, double* [[G]]
+    // LAMBDA: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE_ADDR]]
+    // LAMBDA: [[ADD:%.+]] = fadd double [[G_VAL]], [[G_PRIV_VAL]]
+    // LAMBDA: store double [[ADD]], double* [[G]]
+    // LAMBDA: call void @__kmpc_end_reduce(
+    // LAMBDA: br label %[[REDUCTION_DONE]]
+    // LAMBDA: [[CASE2]]
+    // LAMBDA: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE_ADDR]]
+    // LAMBDA: fadd double
+    // LAMBDA: cmpxchg i64*
+    // LAMBDA: call void @__kmpc_end_reduce(
+    // LAMBDA: br label %[[REDUCTION_DONE]]
+    // LAMBDA: [[REDUCTION_DONE]]
+    // LAMBDA: ret void
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+      // LAMBDA: store volatile double 2.0{{.+}}, double* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp for reduction(-:g)
+    for (int i = 0; i < 2; ++i)  {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* %{{.+}})
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+
+    // Reduction list for runtime.
+    // BLOCKS: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+    // BLOCKS: store double 0.0{{.+}}, double* [[G_PRIVATE_ADDR]]
+    g = 1;
+    // BLOCKS: call void @__kmpc_for_static_init_4(
+    // BLOCKS: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: double* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_for_static_fini(
+
+    // BLOCKS: [[G_PRIV_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i32 0, i32 0
+    // BLOCKS: [[BITCAST:%.+]] = bitcast double* [[G_PRIVATE_ADDR]] to i8*
+    // BLOCKS: store i8* [[BITCAST]], i8** [[G_PRIV_REF]],
+    // BLOCKS: call i32 @__kmpc_reduce(
+    // BLOCKS: switch i32 %{{.+}}, label %[[REDUCTION_DONE:.+]] [
+    // BLOCKS: i32 1, label %[[CASE1:.+]]
+    // BLOCKS: i32 2, label %[[CASE2:.+]]
+    // BLOCKS: [[CASE1]]
+    // BLOCKS: [[G_VAL:%.+]] = load double, double* [[G]]
+    // BLOCKS: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE_ADDR]]
+    // BLOCKS: [[ADD:%.+]] = fadd double [[G_VAL]], [[G_PRIV_VAL]]
+    // BLOCKS: store double [[ADD]], double* [[G]]
+    // BLOCKS: call void @__kmpc_end_reduce(
+    // BLOCKS: br label %[[REDUCTION_DONE]]
+    // BLOCKS: [[CASE2]]
+    // BLOCKS: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE_ADDR]]
+    // BLOCKS: fadd double
+    // BLOCKS: cmpxchg i64*
+    // BLOCKS: call void @__kmpc_end_reduce(
+    // BLOCKS: br label %[[REDUCTION_DONE]]
+    // BLOCKS: [[REDUCTION_DONE]]
+    // BLOCKS: ret void
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile double 2.0{{.+}}, double*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  float t_var = 0, t_var1;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3), var1;
+#pragma omp parallel
+#pragma omp for reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_MAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca float,
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca float,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR_REF:%.+]] = load float*, float** [[T_VAR_PTR_REF]],
+// For + reduction operation initial value of private variable is 0.
+// CHECK: store float 0.0{{.+}}, float* [[T_VAR_PRIV]],
+
+// CHECK: [[VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_PTR_REF:%.+]],
+// For & reduction operation initial value of private variable is ones in all bits.
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+
+// CHECK: [[VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR1_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_PTR_REF:%.+]],
+// For && reduction operation initial value of private variable is 1.0.
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[VAR1_PRIV]])
+
+// CHECK: [[T_VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR1_REF:%.+]] = load float*, float** [[T_VAR1_PTR_REF]],
+// For min reduction operation initial value of private variable is largest repesentable value.
+// CHECK: store float 0x47EFFFFFE0000000, float* [[T_VAR1_PRIV]],
+
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 0
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 3
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: [[T_VAR_VAL:%.+]] = load float, float* [[T_VAR_REF]],
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load float, float* [[T_VAR_PRIV]],
+// CHECK: [[UP:%.+]] = fadd float [[T_VAR_VAL]], [[T_VAR_PRIV_VAL]]
+// CHECK: store float [[UP]], float* [[T_VAR_REF]],
+
+// var = var.operator &(var_reduction);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = uitofp i1 [[COND_LVALUE]] to float
+// CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: [[T_VAR1_VAL:%.+]] = load float, float* [[T_VAR1_REF]],
+// CHECK: [[T_VAR1_PRIV_VAL:%.+]] = load float, float* [[T_VAR1_PRIV]],
+// CHECK: [[CMP:%.+]] = fcmp olt float [[T_VAR1_VAL]], [[T_VAR1_PRIV_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi float
+// CHECK: store float [[UP]], float* [[T_VAR1_REF]],
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: load float, float* [[T_VAR_PRIV]]
+// CHECK: [[T_VAR_REF_INT:%.+]] = bitcast float* [[T_VAR_REF]] to i32*
+// CHECK: [[OLD1:%.+]] = load atomic i32, i32* [[T_VAR_REF_INT]] monotonic,
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[ORIG_OLD_INT:%.+]] = phi i32 [ [[OLD1]], %{{.+}} ], [ [[OLD2:%.+]], %[[CONT]] ]
+// CHECK: fadd float
+// CHECK: [[UP_INT:%.+]] = load i32, i32*
+// CHECK: [[T_VAR_REF_INT:%.+]] = bitcast float* [[T_VAR_REF]] to i32*
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[T_VAR_REF_INT]], i32 [[ORIG_OLD_INT]], i32 [[UP_INT]] monotonic monotonic
+// CHECK: [[OLD2:%.+]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[ATOMIC_DONE:.+]], label %[[CONT]]
+// CHECK: [[ATOMIC_DONE]]
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = uitofp i1 [[COND_LVALUE]] to float
+// CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: load float, float* [[T_VAR1_PRIV]]
+// CHECK: [[T_VAR1_REF_INT:%.+]] = bitcast float* [[T_VAR1_REF]] to i32*
+// CHECK: [[OLD1:%.+]] = load atomic i32, i32* [[T_VAR1_REF_INT]] monotonic,
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[ORIG_OLD_INT:%.+]] = phi i32 [ [[OLD1]], %{{.+}} ], [ [[OLD2:%.+]], %{{.+}} ]
+// CHECK: [[CMP:%.+]] = fcmp olt float
+// CHECK: br i1 [[CMP]]
+// CHECK: phi float
+// CHECK: [[UP_INT:%.+]] = load i32
+// CHECK: [[T_VAR1_REF_INT:%.+]] = bitcast float* [[T_VAR1_REF]] to i32*
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[T_VAR1_REF_INT]], i32 [[ORIG_OLD_INT]], i32 [[UP_INT]] monotonic monotonic
+// CHECK: [[OLD2:%.+]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[ATOMIC_DONE:.+]], label %[[CONT]]
+// CHECK: [[ATOMIC_DONE]]
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_lhs = (float*)lhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to float*
+// t_var_rhs = (float*)rhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to float*
+
+// var_lhs = (S<float>*)lhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_FLOAT_TY]]*
+// var_rhs = (S<float>*)rhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// var1_lhs = (S<float>*)lhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_FLOAT_TY]]*
+// var1_rhs = (S<float>*)rhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// t_var1_lhs = (float*)lhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to float*
+// t_var1_rhs = (float*)rhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to float*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: [[T_VAR_LHS_VAL:%.+]] = load float, float* [[T_VAR_LHS]],
+// CHECK: [[T_VAR_RHS_VAL:%.+]] = load float, float* [[T_VAR_RHS]],
+// CHECK: [[UP:%.+]] = fadd float [[T_VAR_LHS_VAL]], [[T_VAR_RHS_VAL]]
+// CHECK: store float [[UP]], float* [[T_VAR_LHS]],
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_LHS]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_RHS]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_LHS]])
+// CHECK: [[VAR1_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_RHS]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = uitofp i1 [[COND_LVALUE]] to float
+// CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load float, float* [[T_VAR1_LHS]],
+// CHECK: [[T_VAR1_RHS_VAL:%.+]] = load float, float* [[T_VAR1_RHS]],
+// CHECK: [[CMP:%.+]] = fcmp olt float [[T_VAR1_LHS_VAL]], [[T_VAR1_RHS_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi float
+// CHECK: store float [[UP]], float* [[T_VAR1_LHS]],
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_TMAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca i{{[0-9]+}},
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// For + reduction operation initial value of private variable is 0.
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// CHECK: [[VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_PTR_REF:%.+]],
+// For & reduction operation initial value of private variable is ones in all bits.
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[VAR_PRIV]])
+
+// CHECK: [[VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_PTR_REF:%.+]],
+// For && reduction operation initial value of private variable is 1.0.
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[VAR1_PRIV]])
+
+// CHECK: [[T_VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR1_PTR_REF]],
+// For min reduction operation initial value of private variable is largest repesentable value.
+// CHECK: store i{{[0-9]+}} 2147483647, i{{[0-9]+}}* [[T_VAR1_PRIV]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 0
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 3
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce_nowait(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: [[UP:%.+]] = add nsw i{{[0-9]+}} [[T_VAR_VAL]], [[T_VAR_PRIV_VAL]]
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR_REF]],
+
+// var = var.operator &(var_reduction);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
+// CHECK: [[T_VAR1_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_PRIV]],
+// CHECK: [[CMP:%.+]] = icmp slt i{{[0-9]+}} [[T_VAR1_VAL]], [[T_VAR1_PRIV_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi i32
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR1_REF]],
+
+// __kmpc_end_reduce_nowait(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]]
+// CHECK: atomicrmw add i32* [[T_VAR_REF]], i32 [[T_VAR_PRIV_VAL]] monotonic
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: [[T_VAR1_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_PRIV]]
+// CHECK: atomicrmw min i32* [[T_VAR1_REF]], i32 [[T_VAR1_PRIV_VAL]] monotonic
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_lhs = (i{{[0-9]+}}*)lhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to i{{[0-9]+}}*
+// t_var_rhs = (i{{[0-9]+}}*)rhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to i{{[0-9]+}}*
+
+// var_lhs = (S<i{{[0-9]+}}>*)lhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_INT_TY]]*
+// var_rhs = (S<i{{[0-9]+}}>*)rhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_INT_TY]]*
+
+// var1_lhs = (S<i{{[0-9]+}}>*)lhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_INT_TY]]*
+// var1_rhs = (S<i{{[0-9]+}}>*)rhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_INT_TY]]*
+
+// t_var1_lhs = (i{{[0-9]+}}*)lhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to i{{[0-9]+}}*
+// t_var1_rhs = (i{{[0-9]+}}*)rhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to i{{[0-9]+}}*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: [[T_VAR_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_LHS]],
+// CHECK: [[T_VAR_RHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_RHS]],
+// CHECK: [[UP:%.+]] = add nsw i{{[0-9]+}} [[T_VAR_LHS_VAL]], [[T_VAR_RHS_VAL]]
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR_LHS]],
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_RHS]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
+// CHECK: [[T_VAR1_RHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_RHS]],
+// CHECK: [[CMP:%.+]] = icmp slt i{{[0-9]+}} [[T_VAR1_LHS_VAL]], [[T_VAR1_RHS_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi i32
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR1_LHS]],
+// CHECK: ret void
+
+#endif
+
diff --git a/test/OpenMP/for_reduction_messages.cpp b/test/OpenMP/for_reduction_messages.cpp
index 62fee52e4903..9c15e37989b2 100644
--- a/test/OpenMP/for_reduction_messages.cpp
+++ b/test/OpenMP/for_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
 
 void foo() {
 }
@@ -11,7 +11,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;               // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i]; // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];       // expected-note 2 {{'q' defined here}}
-  T fl;                    // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp parallel
 #pragma omp for reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
@@ -104,11 +104,11 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp for reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -128,7 +128,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp for reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -152,7 +152,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -168,7 +168,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp for reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -204,18 +204,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;           // expected-note 2 {{'j' defined here}}
   S3 &p = k;            // expected-note 2 {{'p' defined here}}
   const int &r = da[i]; // expected-note {{'r' defined here}}
   int &q = qa[i];       // expected-note {{'q' defined here}}
-  float fl;             // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp parallel
 #pragma omp for reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
@@ -269,7 +277,7 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp for reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -293,7 +301,7 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -305,7 +313,7 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -313,7 +321,11 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp for reduction(+ : B::x, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel
+#pragma omp for reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
diff --git a/test/OpenMP/for_schedule_messages.cpp b/test/OpenMP/for_schedule_messages.cpp
index be4ff4f70349..c4490e88f02b 100644
--- a/test/OpenMP/for_schedule_messages.cpp
+++ b/test/OpenMP/for_schedule_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/for_simd_aligned_messages.cpp b/test/OpenMP/for_simd_aligned_messages.cpp
index 70131d5e5a35..d41ecc1f28ee 100644
--- a/test/OpenMP/for_simd_aligned_messages.cpp
+++ b/test/OpenMP/for_simd_aligned_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
 
 struct B {
   static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
diff --git a/test/OpenMP/for_simd_ast_print.cpp b/test/OpenMP/for_simd_ast_print.cpp
index 759706424692..21c15a2fd677 100644
--- a/test/OpenMP/for_simd_ast_print.cpp
+++ b/test/OpenMP/for_simd_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/for_simd_collapse_messages.cpp b/test/OpenMP/for_simd_collapse_messages.cpp
index 3b43f1f5d29e..7bb9b04cda7a 100644
--- a/test/OpenMP/for_simd_collapse_messages.cpp
+++ b/test/OpenMP/for_simd_collapse_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/for_simd_firstprivate_messages.cpp b/test/OpenMP/for_simd_firstprivate_messages.cpp
index 1345bfc9886a..6b49b11eaff6 100644
--- a/test/OpenMP/for_simd_firstprivate_messages.cpp
+++ b/test/OpenMP/for_simd_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -152,6 +152,14 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -247,6 +255,10 @@ int main(int argc, char **argv) {
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
+#pragma omp for simd firstprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
 #pragma omp for simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
   for (i = 0; i < argc; ++i)
     foo();
diff --git a/test/OpenMP/for_simd_lastprivate_messages.cpp b/test/OpenMP/for_simd_lastprivate_messages.cpp
index 38651e5c8e27..2aa4dd12045c 100644
--- a/test/OpenMP/for_simd_lastprivate_messages.cpp
+++ b/test/OpenMP/for_simd_lastprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -15,15 +15,17 @@ class S2 {
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
+  S2 &operator =(const S2&);
+  const S2 &operator =(const S2&) const;
   static float S2s; // expected-note {{static data member is predetermined as shared}}
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
 const S2 b;
 const S2 ba[5];
-class S3 { // expected-note 2 {{'S3' declared here}}
+class S3 {
   int a;
-  S3 &operator=(const S3 &s3);
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
 
 public:
   S3() : a(0) {}
@@ -32,17 +34,17 @@ public:
 const S3 c;         // expected-note {{global variable is predetermined as shared}}
 const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
 extern const int f; // expected-note {{global variable is predetermined as shared}}
-class S4 {          // expected-note 3 {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4();             // expected-note 3 {{implicitly declared private here}}
   S4(const S4 &s4);
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
 
 public:
   S5(const S5 &s5) : a(s5.a) {}
@@ -62,8 +64,8 @@ S3 h;
 
 template <class I, class C>
 int foomain(int argc, char **argv) {
-  I e(4); // expected-note {{'e' defined here}}
-  I g(5); // expected-note {{'g' defined here}}
+  I e(4);
+  I g(5);
   int i;
   int &j = i; // expected-note {{'j' defined here}}
 #pragma omp parallel
@@ -107,7 +109,7 @@ int foomain(int argc, char **argv) {
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp parallel
-#pragma omp for simd lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp for simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp parallel
@@ -140,12 +142,20 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
-  S4 e(4);               // expected-note {{'e' defined here}}
-  S5 g(5);               // expected-note {{'g' defined here}}
-  S3 m;                  // expected-note 2 {{'m' defined here}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
   S6 n(2);
   int i;
   int &j = i; // expected-note {{'j' defined here}}
@@ -223,15 +233,15 @@ int main(int argc, char **argv) {
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp for simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp for simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+#pragma omp for simd lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
@@ -255,7 +265,7 @@ int main(int argc, char **argv) {
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd firstprivate(m) lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp for simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel
diff --git a/test/OpenMP/for_simd_linear_messages.cpp b/test/OpenMP/for_simd_linear_messages.cpp
index 9a935c3fdf28..189c8b0be88e 100644
--- a/test/OpenMP/for_simd_linear_messages.cpp
+++ b/test/OpenMP/for_simd_linear_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 namespace X {
   int x;
@@ -148,6 +148,14 @@ template<class I, class C> int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   double darr[100];
   // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
@@ -185,7 +193,7 @@ int main(int argc, char **argv) {
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
   #pragma omp for simd linear(e, g)
   for (int k = 0; k < argc; ++k) ++k;
-  #pragma omp for simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  #pragma omp for simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
   for (int k = 0; k < argc; ++k) ++k;
   #pragma omp parallel
   {
diff --git a/test/OpenMP/for_simd_loop_messages.cpp b/test/OpenMP/for_simd_loop_messages.cpp
index 403709f3453e..6e68eb8a082d 100644
--- a/test/OpenMP/for_simd_loop_messages.cpp
+++ b/test/OpenMP/for_simd_loop_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
 
 class S {
   int a;
@@ -10,7 +10,7 @@ public:
 };
 
 static int sii;
-#pragma omp threadprivate(sii) // expected-note {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
 static int globalii;
 
 int test_iteration_spaces() {
@@ -306,7 +306,6 @@ int test_iteration_spaces() {
 
 #pragma omp parallel
   {
-// expected-error@+2 {{loop iteration variable in the associated loop of 'omp for simd' directive may not be threadprivate or thread local, predetermined as linear}}
 #pragma omp for simd
     for (sii = 0; sii < 10; sii += 1)
       c[sii] = a[sii];
@@ -314,7 +313,6 @@ int test_iteration_spaces() {
 
 #pragma omp parallel
   {
-// expected-error@+2 {{loop iteration variable in the associated loop of 'omp for simd' directive may not be a variable with global storage without being explicitly marked as linear}}
 #pragma omp for simd
     for (globalii = 0; globalii < 10; globalii += 1)
       c[globalii] = a[globalii];
@@ -322,7 +320,6 @@ int test_iteration_spaces() {
 
 #pragma omp parallel
   {
-// expected-error@+3 {{loop iteration variable in the associated loop of 'omp for simd' directive may not be a variable with global storage without being explicitly marked as lastprivate}}
 #pragma omp for simd collapse(2)
     for (ii = 0; ii < 10; ii += 1)
     for (globalii = 0; globalii < 10; globalii += 1)
diff --git a/test/OpenMP/for_simd_misc_messages.c b/test/OpenMP/for_simd_misc_messages.c
index 870c37df20aa..e87a78976e2c 100644
--- a/test/OpenMP/for_simd_misc_messages.c
+++ b/test/OpenMP/for_simd_misc_messages.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
 
 // expected-error@+1 {{unexpected OpenMP directive '#pragma omp for simd'}}
 #pragma omp for simd
diff --git a/test/OpenMP/for_simd_private_messages.cpp b/test/OpenMP/for_simd_private_messages.cpp
index 016a5ec6b581..96885c859e99 100644
--- a/test/OpenMP/for_simd_private_messages.cpp
+++ b/test/OpenMP/for_simd_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -108,6 +108,14 @@ int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
@@ -146,7 +154,7 @@ int main(int argc, char **argv) {
 #pragma omp for simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (int k = 0; k < argc; ++k)
     ++k;
-#pragma omp for simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp for simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp for simd shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp for simd'}}
diff --git a/test/OpenMP/for_simd_reduction_messages.cpp b/test/OpenMP/for_simd_reduction_messages.cpp
index 708973be51ba..74952776d508 100644
--- a/test/OpenMP/for_simd_reduction_messages.cpp
+++ b/test/OpenMP/for_simd_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -11,7 +11,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;               // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i]; // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];       // expected-note 2 {{'q' defined here}}
-  T fl;                    // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp parallel
 #pragma omp for simd reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
@@ -104,11 +104,11 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp for simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -128,7 +128,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp for simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -152,7 +152,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -168,7 +168,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -204,18 +204,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;           // expected-note 2 {{'j' defined here}}
   S3 &p = k;            // expected-note 2 {{'p' defined here}}
   const int &r = da[i]; // expected-note {{'r' defined here}}
   int &q = qa[i];       // expected-note {{'q' defined here}}
-  float fl;             // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp parallel
 #pragma omp for simd reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
@@ -269,7 +277,7 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp for simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -293,7 +301,7 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -305,15 +313,15 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp for simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+#pragma omp for simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
diff --git a/test/OpenMP/for_simd_safelen_messages.cpp b/test/OpenMP/for_simd_safelen_messages.cpp
index 1a72964ce4e9..27a87b559bc9 100644
--- a/test/OpenMP/for_simd_safelen_messages.cpp
+++ b/test/OpenMP/for_simd_safelen_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/for_simd_schedule_messages.cpp b/test/OpenMP/for_simd_schedule_messages.cpp
index 8624359eb420..1367676ad58c 100644
--- a/test/OpenMP/for_simd_schedule_messages.cpp
+++ b/test/OpenMP/for_simd_schedule_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/linking.c b/test/OpenMP/linking.c
index 979ba1f4e41e..778216d723b8 100644
--- a/test/OpenMP/linking.c
+++ b/test/OpenMP/linking.c
@@ -1,18 +1,20 @@
 // Test the that the driver produces reasonable linker invocations with
-// -fopenmp or -fopenmp=libiomp5|libgomp.
+// -fopenmp or -fopenmp|libgomp.
+//
+// FIXME: Replace DEFAULT_OPENMP_LIB below with the value chosen at configure time.
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -fopenmp -target i386-unknown-linux \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-32 %s
 // CHECK-LD-32: "{{.*}}ld{{(.exe)?}}"
-// CHECK-LD-32: "-lgomp" "-lrt" "-lgcc"
+// CHECK-LD-32: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc"
 // CHECK-LD-32: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -fopenmp -target x86_64-unknown-linux \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-64 %s
 // CHECK-LD-64: "{{.*}}ld{{(.exe)?}}"
-// CHECK-LD-64: "-lgomp" "-lrt" "-lgcc"
+// CHECK-LD-64: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc"
 // CHECK-LD-64: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
@@ -30,17 +32,17 @@
 // CHECK-GOMP-LD-64: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp=libiomp5 -target i386-unknown-linux \
+// RUN:     -fopenmp -target i386-unknown-linux \
 // RUN:   | FileCheck --check-prefix=CHECK-IOMP5-LD-32 %s
 // CHECK-IOMP5-LD-32: "{{.*}}ld{{(.exe)?}}"
-// CHECK-IOMP5-LD-32: "-liomp5" "-lgcc"
+// CHECK-IOMP5-LD-32: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc"
 // CHECK-IOMP5-LD-32: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp=libiomp5 -target x86_64-unknown-linux \
+// RUN:     -fopenmp -target x86_64-unknown-linux \
 // RUN:   | FileCheck --check-prefix=CHECK-IOMP5-LD-64 %s
 // CHECK-IOMP5-LD-64: "{{.*}}ld{{(.exe)?}}"
-// CHECK-IOMP5-LD-64: "-liomp5" "-lgcc"
+// CHECK-IOMP5-LD-64: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc"
 // CHECK-IOMP5-LD-64: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
@@ -54,18 +56,16 @@
 // CHECK-LIB-LD-64: error: unsupported argument 'lib' to option 'fopenmp='
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp -fopenmp=libiomp5 -target i386-unknown-linux \
-// RUN:   | FileCheck --check-prefix=CHECK-LD-WARN-32 %s
-// CHECK-LD-WARN-32: warning: argument unused during compilation: '-fopenmp=libiomp5'
-// CHECK-LD-WARN-32: "{{.*}}ld{{(.exe)?}}"
-// CHECK-LD-WARN-32: "-lgomp" "-lrt" "-lgcc"
-// CHECK-LD-WARN-32: "-lpthread" "-lc"
+// RUN:     -fopenmp -fopenmp=libgomp -target i386-unknown-linux \
+// RUN:   | FileCheck --check-prefix=CHECK-LD-OVERRIDE-32 %s
+// CHECK-LD-OVERRIDE-32: "{{.*}}ld{{(.exe)?}}"
+// CHECK-LD-OVERRIDE-32: "-lgomp" "-lrt" "-lgcc"
+// CHECK-LD-OVERRIDE-32: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp -fopenmp=libiomp5 -target x86_64-unknown-linux \
-// RUN:   | FileCheck --check-prefix=CHECK-LD-WARN-64 %s
-// CHECK-LD-WARN-64: warning: argument unused during compilation: '-fopenmp=libiomp5'
-// CHECK-LD-WARN-64: "{{.*}}ld{{(.exe)?}}"
-// CHECK-LD-WARN-64: "-lgomp" "-lrt" "-lgcc"
-// CHECK-LD-WARN-64: "-lpthread" "-lc"
+// RUN:     -fopenmp -fopenmp=libgomp -target x86_64-unknown-linux \
+// RUN:   | FileCheck --check-prefix=CHECK-LD-OVERRIDE-64 %s
+// CHECK-LD-OVERRIDE-64: "{{.*}}ld{{(.exe)?}}"
+// CHECK-LD-OVERRIDE-64: "-lgomp" "-lrt" "-lgcc"
+// CHECK-LD-OVERRIDE-64: "-lpthread" "-lc"
 //
diff --git a/test/OpenMP/master_ast_print.cpp b/test/OpenMP/master_ast_print.cpp
index 7ce4c10020c0..a36dd02d557b 100644
--- a/test/OpenMP/master_ast_print.cpp
+++ b/test/OpenMP/master_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/master_codegen.cpp b/test/OpenMP/master_codegen.cpp
index d354bae2d7ee..1eb47e4e736d 100644
--- a/test/OpenMP/master_codegen.cpp
+++ b/test/OpenMP/master_codegen.cpp
@@ -1,6 +1,7 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -13,6 +14,7 @@
 void foo() {}
 
 // CHECK-LABEL: @main
+// TERM_DEBUG-LABEL: @main
 int main() {
   // CHECK:       [[A_ADDR:%.+]] = alloca i8
   char a;
@@ -32,8 +34,8 @@ int main() {
 // CHECK-NEXT:  [[IS_MASTER:%.+]] = icmp ne i32 [[RES]], 0
 // CHECK-NEXT:  br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]]
 // CHECK:       [[THEN]]
-// CHECK-NEXT:  call void [[FOO]]()
-// CHECK-NEXT:  call void @__kmpc_end_master([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT:  invoke void [[FOO]]()
+// CHECK:       call void @__kmpc_end_master([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
 // CHECK-NEXT:  br label {{%?}}[[EXIT]]
 // CHECK:       [[EXIT]]
 #pragma omp master
@@ -43,4 +45,23 @@ int main() {
   return a;
 }
 
+// CHECK-LABEL:      parallel_master
+// TERM_DEBUG-LABEL: parallel_master
+void parallel_master() {
+#pragma omp parallel
+#pragma omp master
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call i32 @__kmpc_master({{.+}}), !dbg [[DBG_LOC_START:![0-9]+]]
+  // TERM_DEBUG:     invoke void {{.*}}foo{{.*}}()
+  // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_end_master({{.+}}), !dbg [[DBG_LOC_END:![0-9]+]]
+  // TERM_DEBUG:     [[TERM_LPAD]]
+  // TERM_DEBUG:     call void @__clang_call_terminate
+  // TERM_DEBUG:     unreachable
+  foo();
+}
+// TERM_DEBUG-DAG: [[DBG_LOC_START]] = !DILocation(line: [[@LINE-12]],
+// TERM_DEBUG-DAG: [[DBG_LOC_END]] = !DILocation(line: [[@LINE-3]],
+
 #endif
diff --git a/test/OpenMP/master_messages.cpp b/test/OpenMP/master_messages.cpp
index fbe35ac68e4a..397f1394a3e9 100644
--- a/test/OpenMP/master_messages.cpp
+++ b/test/OpenMP/master_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 int foo();
 
diff --git a/test/OpenMP/nesting_of_regions.cpp b/test/OpenMP/nesting_of_regions.cpp
index a948ca3e3bcb..8a3bacf0f115 100644
--- a/test/OpenMP/nesting_of_regions.cpp
+++ b/test/OpenMP/nesting_of_regions.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
 
 void bar();
 
@@ -1979,35 +1979,40 @@ void foo() {
 
 // ATOMIC DIRECTIVE
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp sections // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -2015,7 +2020,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp section // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -2023,7 +2029,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp single // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -2031,7 +2038,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp master // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -2039,7 +2047,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp critical // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -2047,21 +2056,24 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -2069,7 +2081,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp task // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -2077,49 +2090,57 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp flush // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp target // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
@@ -4134,35 +4155,40 @@ void foo() {
 
 // ATOMIC DIRECTIVE
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp sections // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -4170,7 +4196,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp section // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -4178,7 +4205,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp single // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -4186,7 +4214,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp master // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -4194,7 +4223,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp critical // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -4202,21 +4232,24 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     for (int i = 0; i < 10; ++i)
       ;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -4224,7 +4257,8 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp task // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     {
@@ -4232,49 +4266,57 @@ void foo() {
     }
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp flush // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     bar();
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp target // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
   }
 #pragma omp atomic
-  // expected-error@+1 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
   {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
diff --git a/test/OpenMP/openmp_common.c b/test/OpenMP/openmp_common.c
index 3131b3045b6f..3765f4c5dc14 100644
--- a/test/OpenMP/openmp_common.c
+++ b/test/OpenMP/openmp_common.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 #pragma omp // expected-error {{expected an OpenMP directive}}
 #pragma omp unknown_directive // expected-error {{expected an OpenMP directive}}
diff --git a/test/OpenMP/ordered_ast_print.cpp b/test/OpenMP/ordered_ast_print.cpp
index a44350070356..0006080b2082 100644
--- a/test/OpenMP/ordered_ast_print.cpp
+++ b/test/OpenMP/ordered_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/ordered_codegen.cpp b/test/OpenMP/ordered_codegen.cpp
new file mode 100644
index 000000000000..adc6ff6f862d
--- /dev/null
+++ b/test/OpenMP/ordered_codegen.cpp
@@ -0,0 +1,217 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+//
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_not_chunked(float *a, float *b, float *c, float *d) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(static) ordered
+// CHECK: call void @__kmpc_dispatch_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (int i = 32000000; i > 33; i += -7) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
+// CHECK-NEXT: [[CALC_I_2:%.+]] = sub nsw i32 32000000, [[CALC_I_1]]
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+
+// ... start of ordered region ...
+// CHECK-NEXT: call void @__kmpc_ordered([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK-NEXT: call void @__kmpc_end_ordered([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// ... end of ordered region ...
+    #pragma omp ordered
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: call void @__kmpc_dispatch_fini_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void dynamic1(float *a, float *b, float *c, float *d) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(dynamic) ordered
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 67, i64 0, i64 16908287, i64 1, i64 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+
+// ... start of ordered region ...
+// CHECK-NEXT: call void @__kmpc_ordered([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK-NEXT: call void @__kmpc_end_ordered([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// ... end of ordered region ...
+    #pragma omp ordered
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+
+// ... end iteration for ordered loop ...
+// CHECK-NEXT: call void @__kmpc_dispatch_fini_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}test_auto{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void test_auto(float *a, float *b, float *c, float *d) {
+  unsigned int x = 0;
+  unsigned int y = 0;
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(auto) collapse(2) ordered
+// CHECK: call void @__kmpc_dispatch_init_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 70, i64 0, i64 [[LAST_ITER:%[^,]+]], i64 1, i64 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+// FIXME: When the iteration count of some nested loop is not a known constant,
+// we should pre-calculate it, like we do for the total number of iterations!
+  for (char i = static_cast<char>(y); i <= '9'; ++i)
+    for (x = 11; x > 0; --x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+
+// ... start of ordered region ...
+// CHECK: call void @__kmpc_ordered([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK-NEXT: call void @__kmpc_end_ordered([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// ... end of ordered region ...
+    #pragma omp ordered
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+
+// ... end iteration for ordered loop ...
+// CHECK-NEXT: call void @__kmpc_dispatch_fini_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}runtime{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void runtime(float *a, float *b, float *c, float *d) {
+  int x = 0;
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for collapse(2) schedule(runtime) ordered
+// CHECK: call void @__kmpc_dispatch_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 69, i32 0, i32 199, i32 1, i32 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned char i = '0' ; i <= '9'; ++i)
+    for (x = -10; x < 10; ++x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+
+// ... start of ordered region ...
+// CHECK: call void @__kmpc_ordered([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK-NEXT: call void @__kmpc_end_ordered([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// ... end of ordered region ...
+    #pragma omp ordered
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+
+// ... end iteration for ordered loop ...
+// CHECK-NEXT: call void @__kmpc_dispatch_fini_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+#endif // HEADER
+
diff --git a/test/OpenMP/ordered_messages.cpp b/test/OpenMP/ordered_messages.cpp
index 3f79df649895..72e59b7c7646 100644
--- a/test/OpenMP/ordered_messages.cpp
+++ b/test/OpenMP/ordered_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 int foo();
 
diff --git a/test/OpenMP/parallel_ast_print.cpp b/test/OpenMP/parallel_ast_print.cpp
index 0415c0ca555c..c3bd416a6d5f 100644
--- a/test/OpenMP/parallel_ast_print.cpp
+++ b/test/OpenMP/parallel_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/parallel_codegen.cpp b/test/OpenMP/parallel_codegen.cpp
index e50ab43c2808..16c9c59d8df9 100644
--- a/test/OpenMP/parallel_codegen.cpp
+++ b/test/OpenMP/parallel_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK-DEBUG %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK-DEBUG %s
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -9,12 +9,12 @@
 // CHECK-DAG: %struct.anon = type { i32* }
 // CHECK-DAG: %struct.anon.0 = type { i8*** }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 // CHECK-DEBUG-DAG: %ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DEBUG-DAG: %struct.anon = type { i32* }
 // CHECK-DEBUG-DAG: %struct.anon.0 = type { i8*** }
 // CHECK-DEBUG-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DEBUG-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DEBUG-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 // CHECK-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}parallel_codegen.cpp;main;[[@LINE+14]];9;;\00"
 // CHECK-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}parallel_codegen.cpp;tmain;[[@LINE+7]];9;;\00"
 
@@ -36,11 +36,11 @@ int main (int argc, char **argv) {
 
 // CHECK-LABEL: define {{[a-z]*[ ]?i32}} @main({{i32[ ]?[a-z]*}} %argc, i8** %argv)
 // CHECK:       [[AGG_CAPTURED:%.+]] = alloca %struct.anon
-// CHECK:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon, %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
 // CHECK-NEXT:  store i32* {{%[a-z0-9.]+}}, i32** [[ARGC_REF]]
 // CHECK-NEXT:  [[BITCAST:%.+]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
-// CHECK-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* @.omp_outlined. to void (i32*, i32*, ...)*), i8* [[BITCAST]])
-// CHECK-NEXT:  [[ARGV:%.+]] = load i8*** {{%[a-z0-9.]+}}
+// CHECK-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]])
+// CHECK-NEXT:  [[ARGV:%.+]] = load i8**, i8*** {{%[a-z0-9.]+}}
 // CHECK-NEXT:  [[RET:%.+]] = call {{[a-z]*[ ]?i32}} [[TMAIN:@.+tmain.+]](i8** [[ARGV]])
 // CHECK-NEXT:  ret i32 [[RET]]
 // CHECK-NEXT:  }
@@ -49,40 +49,44 @@ int main (int argc, char **argv) {
 // CHECK-DEBUG-DAG:   [[LOC_2_ADDR:%.+]] = alloca %ident_t
 // CHECK-DEBUG:       [[KMPC_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[LOC_2_ADDR]] to i8*
 // CHECK-DEBUG-NEXT:  [[KMPC_DEFAULT_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[DEF_LOC_2]] to i8*
-// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[KMPC_LOC_VOIDPTR]], i8* [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 ptrtoint (%ident_t* getelementptr (%ident_t* null, i32 1) to i64), i32 8, i1 false)
-// CHECK-DEBUG:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[KMPC_LOC_VOIDPTR]], i8* [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 ptrtoint (%ident_t* getelementptr (%ident_t, %ident_t* null, i32 1) to i64), i32 8, i1 false)
+// CHECK-DEBUG:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon, %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
 // CHECK-DEBUG-NEXT:  store i32* {{%[a-z0-9.]+}}, i32** [[ARGC_REF]]
-// CHECK-DEBUG-NEXT:  [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
-// CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8]* [[LOC1]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
+// CHECK-DEBUG-NEXT:  [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
+// CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC1]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
 // CHECK-DEBUG-NEXT:  [[BITCAST:%.+]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
-// CHECK-DEBUG-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* @.omp_outlined. to void (i32*, i32*, ...)*), i8* [[BITCAST]])
-// CHECK-DEBUG-NEXT:  [[ARGV:%.+]] = load i8*** {{%[a-z0-9.]+}}
+// CHECK-DEBUG-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]])
+// CHECK-DEBUG-NEXT:  [[ARGV:%.+]] = load i8**, i8*** {{%[a-z0-9.]+}}
 // CHECK-DEBUG-NEXT:  [[RET:%.+]] = call i32 [[TMAIN:@.+tmain.+]](i8** [[ARGV]])
 // CHECK-DEBUG-NEXT:  ret i32 [[RET]]
 // CHECK-DEBUG-NEXT:  }
 
-// CHECK-LABEL: define internal void @.omp_outlined.(i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context)
+// CHECK:       define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context)
+// CHECK:       #[[FN_ATTRS:[0-9]+]]
 // CHECK:       [[CONTEXT_ADDR:%.+]] = alloca %struct.anon*
 // CHECK:       store %struct.anon* %__context, %struct.anon** [[CONTEXT_ADDR]]
-// CHECK:       [[CONTEXT_PTR:%.+]] = load %struct.anon** [[CONTEXT_ADDR]]
-// CHECK-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon* [[CONTEXT_PTR]], i32 0, i32 0
-// CHECK-NEXT:  [[ARGC_REF:%.+]] = load i32** [[ARGC_PTR_REF]]
-// CHECK-NEXT:  [[ARGC:%.+]] = load i32* [[ARGC_REF]]
+// CHECK:       [[CONTEXT_PTR:%.+]] = load %struct.anon*, %struct.anon** [[CONTEXT_ADDR]]
+// CHECK-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon, %struct.anon* [[CONTEXT_PTR]], i32 0, i32 0
+// CHECK-NEXT:  [[ARGC_REF:%.+]] = load i32*, i32** [[ARGC_PTR_REF]]
+// CHECK-NEXT:  [[ARGC:%.+]] = load i32, i32* [[ARGC_REF]]
 // CHECK-NEXT:  invoke void [[FOO:@.+foo.+]](i32{{[ ]?[a-z]*}} [[ARGC]])
+// CHECK:       call {{.+}} @__kmpc_cancel_barrier(
 // CHECK:       ret void
-// CHECK:       call void @{{.+terminate.*}}(
+// CHECK:       call void @{{.+terminate.*|abort}}(
 // CHECK-NEXT:  unreachable
 // CHECK-NEXT:  }
-// CHECK-DEBUG-LABEL: define internal void @.omp_outlined.(i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context)
+// CHECK-DEBUG:       define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context)
+// CHECK-DEBUG:       #[[FN_ATTRS:[0-9]+]]
 // CHECK-DEBUG:       [[CONTEXT_ADDR:%.+]] = alloca %struct.anon*
 // CHECK-DEBUG:       store %struct.anon* %__context, %struct.anon** [[CONTEXT_ADDR]]
-// CHECK-DEBUG:       [[CONTEXT_PTR:%.+]] = load %struct.anon** [[CONTEXT_ADDR]]
-// CHECK-DEBUG-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon* [[CONTEXT_PTR]], i32 0, i32 0
-// CHECK-DEBUG-NEXT:  [[ARGC_REF:%.+]] = load i32** [[ARGC_PTR_REF]]
-// CHECK-DEBUG-NEXT:  [[ARGC:%.+]] = load i32* [[ARGC_REF]]
+// CHECK-DEBUG:       [[CONTEXT_PTR:%.+]] = load %struct.anon*, %struct.anon** [[CONTEXT_ADDR]]
+// CHECK-DEBUG-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon, %struct.anon* [[CONTEXT_PTR]], i32 0, i32 0
+// CHECK-DEBUG-NEXT:  [[ARGC_REF:%.+]] = load i32*, i32** [[ARGC_PTR_REF]]
+// CHECK-DEBUG-NEXT:  [[ARGC:%.+]] = load i32, i32* [[ARGC_REF]]
 // CHECK-DEBUG-NEXT:  invoke void [[FOO:@.+foo.+]](i32 [[ARGC]])
+// CHECK-DEBUG:       call {{.+}} @__kmpc_cancel_barrier(
 // CHECK-DEBUG:       ret void
-// CHECK-DEBUG:       call void @{{.+terminate.*}}(
+// CHECK-DEBUG:       call void @{{.+terminate.*|abort}}(
 // CHECK-DEBUG-NEXT:  unreachable
 // CHECK-DEBUG-NEXT:  }
 
@@ -93,10 +97,10 @@ int main (int argc, char **argv) {
 
 // CHECK:       define linkonce_odr {{[a-z]*[ ]?i32}} [[TMAIN]](i8** %argc)
 // CHECK:       [[AGG_CAPTURED:%.+]] = alloca %struct.anon.0
-// CHECK:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon.0* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* [[AGG_CAPTURED]], i32 0, i32 0
 // CHECK-NEXT:  store i8*** {{%[a-z0-9.]+}}, i8**** [[ARGC_REF]]
 // CHECK-NEXT:  [[BITCAST:%.+]] = bitcast %struct.anon.0* [[AGG_CAPTURED]] to i8*
-// CHECK-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* @.omp_outlined.1 to void (i32*, i32*, ...)*), i8* [[BITCAST]])
+// CHECK-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]])
 // CHECK-NEXT:  ret i32 0
 // CHECK-NEXT:  }
 // CHECK-DEBUG:       define linkonce_odr i32 [[TMAIN]](i8** %argc)
@@ -104,42 +108,47 @@ int main (int argc, char **argv) {
 // CHECK-DEBUG-DAG:   [[LOC_2_ADDR:%.+]] = alloca %ident_t
 // CHECK-DEBUG:       [[KMPC_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[LOC_2_ADDR]] to i8*
 // CHECK-DEBUG-NEXT:  [[KMPC_DEFAULT_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[DEF_LOC_2]] to i8*
-// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[KMPC_LOC_VOIDPTR]], i8* [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 ptrtoint (%ident_t* getelementptr (%ident_t* null, i32 1) to i64), i32 8, i1 false)
-// CHECK-DEBUG:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon.0* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[KMPC_LOC_VOIDPTR]], i8* [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 ptrtoint (%ident_t* getelementptr (%ident_t, %ident_t* null, i32 1) to i64), i32 8, i1 false)
+// CHECK-DEBUG:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* [[AGG_CAPTURED]], i32 0, i32 0
 // CHECK-DEBUG-NEXT:  store i8*** {{%[a-z0-9.]+}}, i8**** [[ARGC_REF]]
-// CHECK-DEBUG-NEXT:  [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
-// CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8]* [[LOC2]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
+// CHECK-DEBUG-NEXT:  [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
+// CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC2]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
 // CHECK-DEBUG-NEXT:  [[BITCAST:%.+]] = bitcast %struct.anon.0* [[AGG_CAPTURED]] to i8*
-// CHECK-DEBUG-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* @.omp_outlined.1 to void (i32*, i32*, ...)*), i8* [[BITCAST]])
+// CHECK-DEBUG-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]])
 // CHECK-DEBUG-NEXT:  ret i32 0
 // CHECK-DEBUG-NEXT:  }
 
-// CHECK-LABEL: define internal void @.omp_outlined.1(i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context)
+// CHECK:       define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context)
 // CHECK:       [[CONTEXT_ADDR:%.+]] = alloca %struct.anon.0*
 // CHECK:       store %struct.anon.0* %__context, %struct.anon.0** [[CONTEXT_ADDR]]
-// CHECK:       [[CONTEXT_PTR:%.+]] = load %struct.anon.0** [[CONTEXT_ADDR]]
-// CHECK-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon.0* [[CONTEXT_PTR]], i32 0, i32 0
-// CHECK-NEXT:  [[ARGC_REF:%.+]] = load i8**** [[ARGC_PTR_REF]]
-// CHECK-NEXT:  [[ARGC:%.+]] = load i8*** [[ARGC_REF]]
+// CHECK:       [[CONTEXT_PTR:%.+]] = load %struct.anon.0*, %struct.anon.0** [[CONTEXT_ADDR]]
+// CHECK-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* [[CONTEXT_PTR]], i32 0, i32 0
+// CHECK-NEXT:  [[ARGC_REF:%.+]] = load i8***, i8**** [[ARGC_PTR_REF]]
+// CHECK-NEXT:  [[ARGC:%.+]] = load i8**, i8*** [[ARGC_REF]]
 // CHECK-NEXT:  invoke void [[FOO1:@.+foo.+]](i8** [[ARGC]])
+// CHECK:       call {{.+}} @__kmpc_cancel_barrier(
 // CHECK:       ret void
-// CHECK:       call void @{{.+terminate.*}}(
+// CHECK:       call void @{{.+terminate.*|abort}}(
 // CHECK-NEXT:  unreachable
 // CHECK-NEXT:  }
-// CHECK-DEBUG-LABEL: define internal void @.omp_outlined.1(i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context)
+// CHECK-DEBUG:       define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context)
 // CHECK-DEBUG:       [[CONTEXT_ADDR:%.+]] = alloca %struct.anon.0*
 // CHECK-DEBUG:       store %struct.anon.0* %__context, %struct.anon.0** [[CONTEXT_ADDR]]
-// CHECK-DEBUG:       [[CONTEXT_PTR:%.+]] = load %struct.anon.0** [[CONTEXT_ADDR]]
-// CHECK-DEBUG-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon.0* [[CONTEXT_PTR]], i32 0, i32 0
-// CHECK-DEBUG-NEXT:  [[ARGC_REF:%.+]] = load i8**** [[ARGC_PTR_REF]]
-// CHECK-DEBUG-NEXT:  [[ARGC:%.+]] = load i8*** [[ARGC_REF]]
+// CHECK-DEBUG:       [[CONTEXT_PTR:%.+]] = load %struct.anon.0*, %struct.anon.0** [[CONTEXT_ADDR]]
+// CHECK-DEBUG-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* [[CONTEXT_PTR]], i32 0, i32 0
+// CHECK-DEBUG-NEXT:  [[ARGC_REF:%.+]] = load i8***, i8**** [[ARGC_PTR_REF]]
+// CHECK-DEBUG-NEXT:  [[ARGC:%.+]] = load i8**, i8*** [[ARGC_REF]]
 // CHECK-DEBUG-NEXT:  invoke void [[FOO1:@.+foo.+]](i8** [[ARGC]])
+// CHECK-DEBUG:       call {{.+}} @__kmpc_cancel_barrier(
 // CHECK-DEBUG:       ret void
-// CHECK-DEBUG:       call void @{{.+terminate.*}}(
+// CHECK-DEBUG:       call void @{{.+terminate.*|abort}}(
 // CHECK-DEBUG-NEXT:  unreachable
 // CHECK-DEBUG-NEXT:  }
 
 // CHECK: define linkonce_odr void [[FOO1]](i8** %argc)
 // CHECK-DEBUG: define linkonce_odr void [[FOO1]](i8** %argc)
 
+// CHECK: attributes #[[FN_ATTRS]] = {{.+}} nounwind
+// CHECK-DEBUG: attributes #[[FN_ATTRS]] = {{.+}} nounwind
+
 #endif
diff --git a/test/OpenMP/parallel_copyin_codegen.cpp b/test/OpenMP/parallel_copyin_codegen.cpp
new file mode 100644
index 000000000000..e69ace7820b6
--- /dev/null
+++ b/test/OpenMP/parallel_copyin_codegen.cpp
@@ -0,0 +1,296 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+volatile int g = 1212;
+#pragma omp threadprivate(g)
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S &operator=(const S &) { return *this; };
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+
+
+// CHECK-DAG: [[T_VAR:@.+]] = internal global i{{[0-9]+}} 1122,
+// CHECK-DAG: [[VEC:@.+]] = internal global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+// CHECK-DAG: [[S_ARR:@.+]] = internal global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+// CHECK-DAG: [[VAR:@.+]] = internal global [[S_FLOAT_TY]] zeroinitializer,
+// CHECK-DAG: [[TMAIN_T_VAR:@.+]] = linkonce_odr global i{{[0-9]+}} 333,
+// CHECK-DAG: [[TMAIN_VEC:@.+]] = linkonce_odr global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 3, i{{[0-9]+}} 3],
+// CHECK-DAG: [[TMAIN_S_ARR:@.+]] = linkonce_odr global [2 x [[S_INT_TY]]] zeroinitializer,
+// CHECK-DAG: [[TMAIN_VAR:@.+]] = linkonce_odr global [[S_INT_TY]] zeroinitializer,
+template <typename T>
+T tmain() {
+  S<T> test;
+  test = S<T>();
+  static T t_var = 333;
+  static T vec[] = {3, 3};
+  static S<T> s_arr[] = {1, 2};
+  static S<T> var(3);
+#pragma omp threadprivate(t_var, vec, s_arr, var)
+#pragma omp parallel copyin(t_var, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+#pragma omp parallel copyin(t_var)
+  {}
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8*
+#pragma omp parallel copyin(g)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+
+    // threadprivate_g = g;
+    // LAMBDA: call i8* @__kmpc_threadprivate_cached({{.+}} [[G]]
+    // LAMBDA: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+    // LAMBDA: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[G]] to i{{[0-9]+}}), %{{.+}}
+    // LAMBDA: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+    // LAMBDA: [[NOT_MASTER]]
+    // LAMBDA: load i{{[0-9]+}}, i{{[0-9]+}}* [[G]],
+    // LAMBDA: store volatile i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: [[DONE]]
+
+    // LAMBDA: call i32 @__kmpc_cancel_barrier(
+    g = 1;
+    // LAMBDA: call{{( x86_thiscallcc)?}} void [[INNER_LAMBDA:@.+]](%{{.+}}*
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8*
+#pragma omp parallel copyin(g)
+  {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+
+    // threadprivate_g = g;
+    // BLOCKS: call i8* @__kmpc_threadprivate_cached({{.+}} [[G]]
+    // BLOCKS: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+    // BLOCKS: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[G]] to i{{[0-9]+}}), %{{.+}}
+    // BLOCKS: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+    // BLOCKS: [[NOT_MASTER]]
+    // BLOCKS: load i{{[0-9]+}}, i{{[0-9]+}}* [[G]],
+    // BLOCKS: store volatile i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS: [[DONE]]
+
+    // BLOCKS: call i32 @__kmpc_cancel_barrier(
+    g = 1;
+    // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: call i8* @__kmpc_threadprivate_cached({{.+}} [[G]]
+      // BLOCKS: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  test = S<float>();
+  static int t_var = 1122;
+  static int vec[] = {1, 2};
+  static S<float> s_arr[] = {1, 2};
+  static S<float> var(3);
+#pragma omp threadprivate(t_var, vec, s_arr, var)
+#pragma omp parallel copyin(t_var, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+#pragma omp parallel copyin(t_var)
+  {}
+  return tmain<int>();
+#endif
+}
+
+// CHECK-LABEL: @main
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN:@.+]]([[S_FLOAT_TY]]* [[TEST]], [[S_FLOAT_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, {{%.+}}*)* [[MAIN_MICROTASK:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, {{%.+}}*)* [[MAIN_MICROTASK1:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, {{%.+}}* %{{.+}})
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]],
+
+// threadprivate_t_var = t_var;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[T_VAR]]
+// CHECK: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+// CHECK: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[T_VAR]] to i{{[0-9]+}}), %{{.+}}
+// CHECK: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+// CHECK: [[NOT_MASTER]]
+// CHECK: load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
+// CHECK: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+
+// threadprivate_vec = vec;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[VEC]]
+// CHECK: call void @llvm.memcpy{{.*}}(i8* %{{.+}}, i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+
+// threadprivate_s_arr = s_arr;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[S_ARR]]
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* {{%.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}})
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+
+// threadprivate_var = var;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[VAR]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_ASSIGN]]([[S_FLOAT_TY]]* {{%.+}}, [[S_FLOAT_TY]]* {{.*}}[[VAR]])
+// CHECK: [[DONE]]
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, {{%.+}}* %{{.+}})
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]],
+
+// threadprivate_t_var = t_var;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[T_VAR]]
+// CHECK: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+// CHECK: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[T_VAR]] to i{{[0-9]+}}), %{{.+}}
+// CHECK: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+// CHECK: [[NOT_MASTER]]
+// CHECK: load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
+// CHECK: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+// CHECK: [[DONE]]
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN:@.+]]([[S_INT_TY]]* [[TEST]], [[S_INT_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, {{%.+}}*)* [[TMAIN_MICROTASK:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, {{%.+}}*)* [[TMAIN_MICROTASK1:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, {{%.+}}* %{{.+}})
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]],
+
+// threadprivate_t_var = t_var;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_T_VAR]]
+// CHECK: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+// CHECK: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[TMAIN_T_VAR]] to i{{[0-9]+}}), %{{.+}}
+// CHECK: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+// CHECK: [[NOT_MASTER]]
+// CHECK: load i{{[0-9]+}}, i{{[0-9]+}}* [[TMAIN_T_VAR]],
+// CHECK: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+
+// threadprivate_vec = vec;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_VEC]]
+// CHECK: call void @llvm.memcpy{{.*}}(i8* %{{.+}}, i8* bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*),
+
+// threadprivate_s_arr = s_arr;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_S_ARR]]
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* {{%.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}})
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+
+// threadprivate_var = var;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_VAR]]
+// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN]]([[S_INT_TY]]* {{%.+}}, [[S_INT_TY]]* {{.*}}[[TMAIN_VAR]])
+// CHECK: [[DONE]]
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+
+// CHECK: define internal void [[TMAIN_MICROTASK1]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, {{%.+}}* %{{.+}})
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]],
+
+// threadprivate_t_var = t_var;
+// CHECK: call i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_T_VAR]]
+// CHECK: ptrtoint i{{[0-9]+}}* %{{.+}} to i{{[0-9]+}}
+// CHECK: icmp ne i{{[0-9]+}} ptrtoint (i{{[0-9]+}}* [[TMAIN_T_VAR]] to i{{[0-9]+}}), %{{.+}}
+// CHECK: br i1 %{{.+}}, label %[[NOT_MASTER:.+]], label %[[DONE:.+]]
+// CHECK: [[NOT_MASTER]]
+// CHECK: load i{{[0-9]+}}, i{{[0-9]+}}* [[TMAIN_T_VAR]],
+// CHECK: store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}* %{{.+}},
+// CHECK: [[DONE]]
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i32 [[GTID]])
+// CHECK: ret void
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St &operator=(const St &) { return *this; };
+  ~St() {}
+};
+
+void array_func() {
+  static int a[2];
+  static St s[2];
+// ARRAY: @__kmpc_fork_call(
+// ARRAY: call i8* @__kmpc_threadprivate_cached(
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* bitcast ([2 x i32]* @{{.+}} to i8*), i64 8, i32 4, i1 false)
+// ARRAY: call dereferenceable(8) %struct.St* @{{.+}}(%struct.St* %{{.+}}, %struct.St* dereferenceable(8) %{{.+}})
+#pragma omp threadprivate(a, s)
+#pragma omp parallel copyin(a, s)
+  ;
+}
+#endif
+
+
diff --git a/test/OpenMP/parallel_copyin_messages.cpp b/test/OpenMP/parallel_copyin_messages.cpp
index c1ce363b47bd..2b54b43b87c6 100644
--- a/test/OpenMP/parallel_copyin_messages.cpp
+++ b/test/OpenMP/parallel_copyin_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo() {
 }
@@ -20,17 +20,17 @@ public:
   S3():a(0) { }
   S3 &operator =(S3 &s3) { return *this; }
 };
-class S4 { // expected-note {{'S4' declared here}}
+class S4 {
   int a;
   S4();
-  S4 &operator =(const S4 &s4);
+  S4 &operator =(const S4 &s4); // expected-note {{implicitly declared private here}}
 public:
   S4(int v):a(v) { }
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
   S5():a(0) {}
-  S5 &operator =(const S5 &s5) { return *this; }
+  S5 &operator =(const S5 &s5) { return *this; } // expected-note {{implicitly declared private here}}
 public:
   S5(int v):a(v) { }
 };
@@ -40,11 +40,18 @@ public:
   static T s;
 };
 
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x;
+}
 
 S2 k;
 S3 h;
-S4 l(3); // expected-note {{'l' defined here}}
-S5 m(4); // expected-note {{'m' defined here}}
+S4 l(3);
+S5 m(4);
 #pragma omp threadprivate(h, k, l, m)
 
 int main(int argc, char **argv) {
@@ -55,12 +62,13 @@ int main(int argc, char **argv) {
   #pragma omp parallel copyin (k // expected-error {{expected ')'}} expected-note {{to match this '('}}
   #pragma omp parallel copyin (h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
   #pragma omp parallel copyin (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
-  #pragma omp parallel copyin (l) // expected-error {{copyin variable must have an accessible, unambiguous copy assignment operator}}
+  #pragma omp parallel copyin (l) // expected-error {{'operator=' is a private member of 'S4'}}
   #pragma omp parallel copyin (S1) // expected-error {{'S1' does not refer to a value}}
   #pragma omp parallel copyin (argv[1]) // expected-error {{expected variable name}}
   #pragma omp parallel copyin(i) // expected-error {{copyin variable must be threadprivate}}
-  #pragma omp parallel copyin(m) // expected-error {{copyin variable must have an accessible, unambiguous copy assignment operator}}
+  #pragma omp parallel copyin(m) // expected-error {{'operator=' is a private member of 'S5'}}
   #pragma omp parallel copyin(ST<int>::s) // expected-error {{copyin variable must be threadprivate}}
+  #pragma omp parallel copyin(B::x)
   foo();
 
   return 0;
diff --git a/test/OpenMP/parallel_default_messages.cpp b/test/OpenMP/parallel_default_messages.cpp
index 6014cdfd4bcc..cbc6a73fe35d 100644
--- a/test/OpenMP/parallel_default_messages.cpp
+++ b/test/OpenMP/parallel_default_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_firstprivate_codegen.cpp b/test/OpenMP/parallel_firstprivate_codegen.cpp
index 811f2df9df35..3f61362a3d97 100644
--- a/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ b/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -1,9 +1,11 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
 // expected-no-diagnostics
+#ifndef ARRAY
 #ifndef HEADER
 #define HEADER
 
@@ -45,6 +47,8 @@ T tmain() {
     vec[0] = t_var;
     s_arr[0] = var;
   }
+#pragma omp parallel firstprivate(t_var)
+  {}
   return T();
 }
 
@@ -52,36 +56,36 @@ int main() {
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // LAMBDA: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // LAMBDA: store i{{[0-9]+}}* [[G]], i{{[0-9]+}}** [[G_LOCAL_REF]]
   // LAMBDA: [[ARG:%.+]] = bitcast %{{.+}}* [[AGG_CAPTURED]] to i8*
-  // LAMBDA: call void {{.+}}* @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
 #pragma omp parallel firstprivate(g)
   {
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
     // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
-    // LAMBDA: [[ARG:%.+]] = load %{{.+}}** [[ARG_REF]]
-    // LAMBDA: [[G_REF_ADDR:%.+]] = getelementptr inbounds %{{.+}}* [[ARG]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-    // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}** [[G_REF_ADDR]]
-    // LAMBDA: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}* [[G_REF]]
-    // LAMBDA: store volatile i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // LAMBDA: [[ARG:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_REF]]
+    // LAMBDA: [[G_REF_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_REF_ADDR]]
+    // LAMBDA: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G_REF]]
+    // LAMBDA: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
     // LAMBDA: call i32 @__kmpc_cancel_barrier(
     g = 1;
     // LAMBDA: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
-    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
     // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
-    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call{{( x86_thiscallcc)?}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
     [&]() {
       // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
       // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
       g = 2;
-      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}** [[ARG_PTR_REF]]
-      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
       // LAMBDA: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
     }();
   }
@@ -90,30 +94,30 @@ int main() {
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
-  // BLOCKS: call void {{%.+}}(i8*
+  // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // BLOCKS: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // BLOCKS: store i{{[0-9]+}}* [[G]], i{{[0-9]+}}** [[G_LOCAL_REF]]
   // BLOCKS: [[ARG:%.+]] = bitcast %{{.+}}* [[AGG_CAPTURED]] to i8*
-  // BLOCKS: call void {{.+}}* @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
 #pragma omp parallel firstprivate(g)
   {
     // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
     // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
     // BLOCKS: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
-    // BLOCKS: [[ARG:%.+]] = load %{{.+}}** [[ARG_REF]]
-    // BLOCKS: [[G_REF_ADDR:%.+]] = getelementptr inbounds %{{.+}}* [[ARG]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-    // BLOCKS: [[G_REF:%.+]] = load i{{[0-9]+}}** [[G_REF_ADDR]]
-    // BLOCKS: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}* [[G_REF]]
-    // BLOCKS: store volatile i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS: [[ARG:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_REF]]
+    // BLOCKS: [[G_REF_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // BLOCKS: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_REF_ADDR]]
+    // BLOCKS: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G_REF]]
+    // BLOCKS: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
     // BLOCKS: call i32 @__kmpc_cancel_barrier(
     g = 1;
     // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
     // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
     // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
     // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
-    // BLOCKS: call void {{%.+}}(i8*
+    // BLOCKS: call void {{%.+}}(i8
     ^{
       // BLOCKS: define {{.+}} void {{@.+}}(i8*
       g = 2;
@@ -136,6 +140,8 @@ int main() {
     vec[0] = t_var;
     s_arr[0] = var;
   }
+#pragma omp parallel firstprivate(t_var)
+  {}
   return tmain<int>();
 #endif
 }
@@ -144,7 +150,7 @@ int main() {
 // CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...)* @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 // CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // CHECK: ret
@@ -155,27 +161,20 @@ int main() {
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
-// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}** [[T_VAR_PTR_REF]],
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
-// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
-// CHECK: br label %[[VEC_PRIV_INIT:.+]]
-// CHECK: [[VEC_PRIV_INIT]]
+// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
-// CHECK: br label %[[VEC_PRIV_INIT_END:.+]]
-// CHECK: [[VEC_PRIV_INIT_END]]
-// CHECK: [[S_ARR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
-// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]** [[S_ARR_REF_PTR]],
-// CHECK: br label %[[S_ARR_PRIV_INIT:.+]]
-// CHECK: [[S_ARR_PRIV_INIT]]
-// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_END:%.+]] = getelementptr [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
-// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[S_ARR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_REF_PTR]],
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
 // CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
 // CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
 // CHECK: [[S_ARR_BODY]]
@@ -183,15 +182,13 @@ int main() {
 // CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
-// CHECK: br label %[[S_ARR_PRIV_INIT_END:.+]]
-// CHECK: [[S_ARR_PRIV_INIT_END]]
-// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
-// CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]** [[VAR_REF_PTR]],
+// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_REF_PTR]],
 // CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
 // CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
 // CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
@@ -200,7 +197,7 @@ int main() {
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...)* @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
@@ -210,27 +207,20 @@ int main() {
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
-// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}** [[T_VAR_PTR_REF]],
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
-// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
-// CHECK: br label %[[VEC_PRIV_INIT:.+]]
-// CHECK: [[VEC_PRIV_INIT]]
+// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
-// CHECK: br label %[[VEC_PRIV_INIT_END:.+]]
-// CHECK: [[VEC_PRIV_INIT_END]]
-// CHECK: [[S_ARR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
-// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_INT_TY]]]** [[S_ARR_REF_PTR]],
-// CHECK: br label %[[S_ARR_PRIV_INIT:.+]]
-// CHECK: [[S_ARR_PRIV_INIT]]
-// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_END:%.+]] = getelementptr [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
-// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[S_ARR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_REF_PTR]],
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_REF]] to [[S_INT_TY]]*
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
 // CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
 // CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
 // CHECK: [[S_ARR_BODY]]
@@ -238,18 +228,44 @@ int main() {
 // CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
-// CHECK: br label %[[S_ARR_PRIV_INIT_END:.+]]
-// CHECK: [[S_ARR_PRIV_INIT_END]]
-// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
-// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]** [[VAR_REF_PTR]],
+// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_REF_PTR]],
 // CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
 // CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
 // CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
 // CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
 // CHECK: ret void
+
 #endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) { }
+  ~St() {}
+};
+
+void array_func(float a[3], St s[2], int n, long double vla1[n]) {
+  double vla2[n];
+// ARRAY: @__kmpc_fork_call(
+// ARRAY: [[PRIV_A:%.+]] = alloca float*
+// ARRAY: [[PRIV_S:%.+]] = alloca %struct.St*
+// ARRAY: [[PRIV_VLA1:%.+]] = alloca x86_fp80*
+// ARRAY: store float* %{{.+}}, float** [[PRIV_A]],
+// ARRAY: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
+// ARRAY: call i8* @llvm.stacksave()
+// ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* %{{.+}}, i64 [[SIZE]], i32 8, i1 false)
+// ARRAY: call void @llvm.stackrestore(i8*
+#pragma omp parallel firstprivate(a, s, vla1, vla2)
+  ;
+}
+#endif
+
 
diff --git a/test/OpenMP/parallel_firstprivate_messages.cpp b/test/OpenMP/parallel_firstprivate_messages.cpp
index 7d1e3593500e..c6f8dbe138d5 100644
--- a/test/OpenMP/parallel_firstprivate_messages.cpp
+++ b/test/OpenMP/parallel_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
@@ -47,6 +47,14 @@ public:
 S3 h;
 #pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = { 0 };
@@ -70,7 +78,7 @@ int main(int argc, char **argv) {
   #pragma omp parallel firstprivate(S2::S2s)
   #pragma omp parallel firstprivate(S2::S2sc)
   #pragma omp parallel firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
-  #pragma omp parallel firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  #pragma omp parallel firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
   #pragma omp parallel private(i), firstprivate(i) // expected-error {{private variable cannot be firstprivate}} expected-note{{defined as private}}
   foo();
   #pragma omp parallel shared(i)
diff --git a/test/OpenMP/parallel_for_ast_print.cpp b/test/OpenMP/parallel_for_ast_print.cpp
index 375664f4243d..f2899ee16ef5 100644
--- a/test/OpenMP/parallel_for_ast_print.cpp
+++ b/test/OpenMP/parallel_for_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/parallel_for_codegen.cpp b/test/OpenMP/parallel_for_codegen.cpp
new file mode 100644
index 000000000000..43bf83269655
--- /dev/null
+++ b/test/OpenMP/parallel_for_codegen.cpp
@@ -0,0 +1,400 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
+//
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[CAP_TY:%.+]] = type { i8* }
+
+// CHECK-LABEL: with_var_schedule
+void with_var_schedule() {
+  double a = 5;
+// CHECK: [[CHUNK_SIZE:%.+]] = fptosi double %{{.+}}to i8
+// CHECK: store i8 %{{.+}}, i8* [[CHUNK:%.+]],
+// CHECK: [[CHUNK_REF:%.+]] = getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP_ARG:%.+]], i{{.+}} 0, i{{.+}} 0
+// CHECK: store i8* [[CHUNK]], i8** [[CHUNK_REF]],
+// CHECK: [[BITCAST:%.+]] = bitcast [[CAP_TY]]* [[CAP_ARG]] to i8*
+// CHECK: call void {{.+}} @__kmpc_fork_call({{.+}}, i8* [[BITCAST]])
+
+// CHECK: [[CHUNK_REF:%.+]] = getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+// CHECK: [[CHUNK:%.+]] = load i8*, i8** [[CHUNK_REF]],
+// CHECK: [[CHUNK_VAL:%.+]] = load i8, i8* [[CHUNK]],
+// CHECK: [[CHUNK_SIZE:%.+]] = sext i8 [[CHUNK_VAL]] to i64
+// CHECK: call void @__kmpc_for_static_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC:@[^,]+]], i32 [[GTID:%[^,]+]], i32 33, i32* [[IS_LAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]], i64 1, i64 [[CHUNK_SIZE]])
+// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK: __kmpc_cancel_barrier
+#pragma omp parallel for schedule(static, char(a))
+  for (unsigned long long i = 1; i < 2; ++i) {
+  }
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void without_schedule_clause(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
+// UB = min(UB, GlobalUB)
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423
+// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
+// CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
+// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// Loop header
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (int i = 33; i < 32000000; i += 7) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add nsw i32 33, [[CALC_I_1]]
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_not_chunked(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for schedule(static)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
+// UB = min(UB, GlobalUB)
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423
+// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
+// CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
+// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// Loop header
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (int i = 32000000; i > 33; i += -7) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
+// CHECK-NEXT: [[CALC_I_2:%.+]] = sub nsw i32 32000000, [[CALC_I_1]]
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_chunked(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for schedule(static, 5)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
+// UB = min(UB, GlobalUB)
+// CHECK: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288
+// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
+// CHECK: [[UBRESULT:%.+]] = phi i32 [ 16908288, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
+// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+
+// Outer loop header
+// CHECK: [[O_IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[O_UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ule i32 [[O_IV]], [[O_UB]]
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned i = 131071; i <= 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i32 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// Update the counters, adding stride
+// CHECK:  [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]]
+// CHECK-NEXT: [[ADD_LB:%.+]] = add i32 [[LB]], [[ST]]
+// CHECK-NEXT: store i32 [[ADD_LB]], i32* [[OMP_LB]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]]
+// CHECK-NEXT: [[ADD_UB:%.+]] = add i32 [[UB]], [[ST]]
+// CHECK-NEXT: store i32 [[ADD_UB]], i32* [[OMP_UB]]
+
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void dynamic1(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for schedule(dynamic)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 35, i64 0, i64 16908287, i64 1, i64 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}guided7{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void guided7(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for schedule(guided, 7)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 36, i64 0, i64 16908287, i64 1, i64 7)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}test_auto{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void test_auto(float *a, float *b, float *c, float *d) {
+  unsigned int x = 0;
+  unsigned int y = 0;
+  #pragma omp parallel for schedule(auto) collapse(2)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_dispatch_init_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 38, i64 0, i64 [[LAST_ITER:%[^,]+]], i64 1, i64 1)
+//
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+// FIXME: When the iteration count of some nested loop is not a known constant,
+// we should pre-calculate it, like we do for the total number of iterations!
+  for (char i = static_cast<char>(y); i <= '9'; ++i)
+    for (x = 11; x > 0; --x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}runtime{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void runtime(float *a, float *b, float *c, float *d) {
+  int x = 0;
+  #pragma omp parallel for collapse(2) schedule(runtime)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_dispatch_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 37, i32 0, i32 199, i32 1, i32 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned char i = '0' ; i <= '9'; ++i)
+    for (x = -10; x < 10; ++x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// TERM_DEBUG-LABEL: foo
+int foo() {return 0;};
+
+// TERM_DEBUG-LABEL: parallel_for
+void parallel_for(float *a) {
+#pragma omp parallel for schedule(static, 5)
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_for_static_init_4u({{.+}}), !dbg [[DBG_LOC_START:![0-9]+]]
+  // TERM_DEBUG:     invoke i32 {{.*}}foo{{.*}}()
+  // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_for_static_fini({{.+}}), !dbg [[DBG_LOC_END:![0-9]+]]
+  // TERM_DEBUG:     call {{.+}} @__kmpc_cancel_barrier({{.+}}), !dbg [[DBG_LOC_CANCEL:![0-9]+]]
+  // TERM_DEBUG:     [[TERM_LPAD]]
+  // TERM_DEBUG:     call void @__clang_call_terminate
+  // TERM_DEBUG:     unreachable
+  for (unsigned i = 131071; i <= 2147483647; i += 127)
+    a[i] += foo();
+}
+// Check source line corresponds to "#pragma omp parallel for schedule(static, 5)" above:
+// TERM_DEBUG-DAG: [[DBG_LOC_START]] = !DILocation(line: [[@LINE-4]],
+// TERM_DEBUG-DAG: [[DBG_LOC_END]] = !DILocation(line: [[@LINE-16]],
+// TERM_DEBUG-DAG: [[DBG_LOC_CANCEL]] = !DILocation(line: [[@LINE-17]],
+
+#endif // HEADER
+
diff --git a/test/OpenMP/parallel_for_collapse_messages.cpp b/test/OpenMP/parallel_for_collapse_messages.cpp
index 06dfe0f77721..042b819fe58e 100644
--- a/test/OpenMP/parallel_for_collapse_messages.cpp
+++ b/test/OpenMP/parallel_for_collapse_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_copyin_messages.cpp b/test/OpenMP/parallel_for_copyin_messages.cpp
index 2ebc2ded7387..5a5d1632c8eb 100644
--- a/test/OpenMP/parallel_for_copyin_messages.cpp
+++ b/test/OpenMP/parallel_for_copyin_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo() {
 }
@@ -22,18 +22,18 @@ public:
   S3() : a(0) {}
   S3 &operator=(S3 &s3) { return *this; }
 };
-class S4 { // expected-note {{'S4' declared here}}
+class S4 {
   int a;
   S4();
-  S4 &operator=(const S4 &s4);
+  S4 &operator=(const S4 &s4); // expected-note {{implicitly declared private here}}
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
   S5() : a(0) {}
-  S5 &operator=(const S5 &s5) { return *this; }
+  S5 &operator=(const S5 &s5) { return *this; } // expected-note {{implicitly declared private here}}
 
 public:
   S5(int v) : a(v) {}
@@ -46,10 +46,18 @@ public:
 
 S2 k;
 S3 h;
-S4 l(3); // expected-note {{'l' defined here}}
-S5 m(4); // expected-note {{'m' defined here}}
+S4 l(3);
+S5 m(4);
 #pragma omp threadprivate(h, k, l, m)
 
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   int i;
 #pragma omp parallel for copyin // expected-error {{expected '(' after 'copyin'}}
@@ -70,7 +78,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for copyin(l) // expected-error {{copyin variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel for copyin(l) // expected-error {{'operator=' is a private member of 'S4'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for copyin(S1) // expected-error {{'S1' does not refer to a value}}
@@ -82,10 +90,10 @@ int main(int argc, char **argv) {
 #pragma omp parallel for copyin(i) // expected-error {{copyin variable must be threadprivate}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for copyin(m) // expected-error {{copyin variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel for copyin(m) // expected-error {{'operator=' is a private member of 'S5'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for copyin(ST < int > ::s) // expected-error {{copyin variable must be threadprivate}}
+#pragma omp parallel for copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
   for (i = 0; i < argc; ++i)
     foo();
 
diff --git a/test/OpenMP/parallel_for_default_messages.cpp b/test/OpenMP/parallel_for_default_messages.cpp
index ed478a8c9721..d2129a345939 100644
--- a/test/OpenMP/parallel_for_default_messages.cpp
+++ b/test/OpenMP/parallel_for_default_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_for_firstprivate_messages.cpp b/test/OpenMP/parallel_for_firstprivate_messages.cpp
index b4958733deca..2c762b49b39e 100644
--- a/test/OpenMP/parallel_for_firstprivate_messages.cpp
+++ b/test/OpenMP/parallel_for_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -137,6 +137,14 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -207,7 +215,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for firstprivate(m) // OK
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+#pragma omp parallel for firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
diff --git a/test/OpenMP/parallel_for_if_messages.cpp b/test/OpenMP/parallel_for_if_messages.cpp
index 295d7398b839..cf2a3b431dc7 100644
--- a/test/OpenMP/parallel_for_if_messages.cpp
+++ b/test/OpenMP/parallel_for_if_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_lastprivate_messages.cpp b/test/OpenMP/parallel_for_lastprivate_messages.cpp
index bd1dd4b52768..09bed02a05f9 100644
--- a/test/OpenMP/parallel_for_lastprivate_messages.cpp
+++ b/test/OpenMP/parallel_for_lastprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -15,15 +15,17 @@ class S2 {
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
-  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  S2 &operator=(const S2 &);
+  const S2 &operator=(const S2 &) const;
+  static float S2s;
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
 const S2 b;
 const S2 ba[5];
-class S3 { // expected-note 2 {{'S3' declared here}}
+class S3 {
   int a;
-  S3 &operator=(const S3 &s3);
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
 
 public:
   S3() : a(0) {}
@@ -32,17 +34,17 @@ public:
 const S3 c;         // expected-note {{global variable is predetermined as shared}}
 const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
 extern const int f; // expected-note {{global variable is predetermined as shared}}
-class S4 {          // expected-note 3 {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4();             // expected-note 3 {{implicitly declared private here}}
   S4(const S4 &s4);
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
 
 public:
   S5(const S5 &s5) : a(s5.a) {}
@@ -62,8 +64,8 @@ S3 h;
 
 template <class I, class C>
 int foomain(int argc, char **argv) {
-  I e(4); // expected-note {{'e' defined here}}
-  I g(5); // expected-note {{'g' defined here}}
+  I e(4);
+  I g(5);
   int i;
   int &j = i;                        // expected-note {{'j' defined here}}
 #pragma omp parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
@@ -96,7 +98,7 @@ int foomain(int argc, char **argv) {
 #pragma omp parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
   for (int k = 0; k < argc; ++k)
     ++k;
-#pragma omp parallel for lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp parallel for lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
@@ -126,12 +128,20 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
-  S4 e(4);               // expected-note {{'e' defined here}}
-  S5 g(5);               // expected-note {{'g' defined here}}
-  S3 m;                  // expected-note 2 {{'m' defined here}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
   S6 n(2);
   int i;
   int &j = i;                        // expected-note {{'j' defined here}}
@@ -181,7 +191,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for lastprivate(xa) // OK
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+#pragma omp parallel for lastprivate(S2::S2s)
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
@@ -190,13 +200,13 @@ int main(int argc, char **argv) {
 #pragma omp parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp parallel for'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+#pragma omp parallel for lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
@@ -216,7 +226,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for lastprivate(j) // expected-error {{arguments of OpenMP clause 'lastprivate' cannot be of reference type}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for firstprivate(m) lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel for firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for lastprivate(n) firstprivate(n) // OK
diff --git a/test/OpenMP/parallel_for_loop_messages.cpp b/test/OpenMP/parallel_for_loop_messages.cpp
index c3299976bd06..09a15e2dc337 100644
--- a/test/OpenMP/parallel_for_loop_messages.cpp
+++ b/test/OpenMP/parallel_for_loop_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
 
 class S {
   int a;
@@ -10,7 +10,7 @@ public:
 };
 
 static int sii;
-#pragma omp threadprivate(sii) // expected-note {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
 static int globalii;
 
 int test_iteration_spaces() {
@@ -258,21 +258,18 @@ int test_iteration_spaces() {
     c[ii] = a[ii];
 
   {
-// expected-error@+2 {{loop iteration variable in the associated loop of 'omp parallel for' directive may not be threadprivate or thread local, predetermined as private}}
 #pragma omp parallel for
     for (sii = 0; sii < 10; sii += 1)
       c[sii] = a[sii];
   }
 
   {
-// expected-error@+2 {{loop iteration variable in the associated loop of 'omp parallel for' directive may not be a variable with global storage without being explicitly marked as private}}
 #pragma omp parallel for
     for (globalii = 0; globalii < 10; globalii += 1)
       c[globalii] = a[globalii];
   }
 
   {
-// expected-error@+3 {{loop iteration variable in the associated loop of 'omp parallel for' directive may not be a variable with global storage without being explicitly marked as private}}
 #pragma omp parallel for collapse(2)
     for (ii = 0; ii < 10; ii += 1)
     for (globalii = 0; globalii < 10; globalii += 1)
diff --git a/test/OpenMP/parallel_for_messages.cpp b/test/OpenMP/parallel_for_messages.cpp
index e4ea0d5e2d93..7c4f926d6d2b 100644
--- a/test/OpenMP/parallel_for_messages.cpp
+++ b/test/OpenMP/parallel_for_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 -o - %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_misc_messages.c b/test/OpenMP/parallel_for_misc_messages.c
index f07a0f2a4c4c..ee6f2e81bb8d 100644
--- a/test/OpenMP/parallel_for_misc_messages.c
+++ b/test/OpenMP/parallel_for_misc_messages.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
 
 // expected-error@+1 {{unexpected OpenMP directive '#pragma omp parallel for'}}
 #pragma omp parallel for
diff --git a/test/OpenMP/parallel_for_num_threads_messages.cpp b/test/OpenMP/parallel_for_num_threads_messages.cpp
index e1928982ad1a..60c7dfb6e3cd 100644
--- a/test/OpenMP/parallel_for_num_threads_messages.cpp
+++ b/test/OpenMP/parallel_for_num_threads_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_private_messages.cpp b/test/OpenMP/parallel_for_private_messages.cpp
index 31b84588de0b..17344f62d7f1 100644
--- a/test/OpenMP/parallel_for_private_messages.cpp
+++ b/test/OpenMP/parallel_for_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -108,6 +108,14 @@ int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
@@ -146,7 +154,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (int k = 0; k < argc; ++k)
     ++k;
-#pragma omp parallel for private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp parallel for private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp parallel for nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp parallel for'}}
diff --git a/test/OpenMP/parallel_for_proc_bind_messages.cpp b/test/OpenMP/parallel_for_proc_bind_messages.cpp
index 0347caf68b02..0f9c47f5f29e 100644
--- a/test/OpenMP/parallel_for_proc_bind_messages.cpp
+++ b/test/OpenMP/parallel_for_proc_bind_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_for_reduction_messages.cpp b/test/OpenMP/parallel_for_reduction_messages.cpp
index 8e482ef73536..26fa48cf5697 100644
--- a/test/OpenMP/parallel_for_reduction_messages.cpp
+++ b/test/OpenMP/parallel_for_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo() {
 }
@@ -11,12 +11,12 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
-  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static float S2s;
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;                       // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
-  T fl;                            // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp parallel for reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
     foo();
@@ -96,10 +96,10 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel for reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name}}
@@ -114,7 +114,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp parallel for reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp parallel for reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified variable cannot be reduction}}
@@ -132,10 +132,10 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp parallel for reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}} expected-error {{a reduction variable with array type 'const float [5]'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp parallel for reduction(&& : S2::S2s)
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
@@ -144,7 +144,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp parallel for reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -177,18 +177,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;                      // expected-note 2 {{'j' defined here}}
   S3 &p = k;                       // expected-note 2 {{'p' defined here}}
   const int &r = da[i];            // expected-note {{'r' defined here}}
   int &q = qa[i];                  // expected-note {{'q' defined here}}
-  float fl;                        // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp parallel for reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
     foo();
@@ -228,7 +236,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp parallel for reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}}
@@ -246,22 +254,22 @@ int main(int argc, char **argv) {
 #pragma omp parallel for reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp parallel for reduction(&& : S2::S2s)
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+#pragma omp parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp parallel for reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
diff --git a/test/OpenMP/parallel_for_schedule_messages.cpp b/test/OpenMP/parallel_for_schedule_messages.cpp
index b03758a10d66..c12c4282686b 100644
--- a/test/OpenMP/parallel_for_schedule_messages.cpp
+++ b/test/OpenMP/parallel_for_schedule_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_simd_aligned_messages.cpp b/test/OpenMP/parallel_for_simd_aligned_messages.cpp
index ea4ec21f28a9..e1b9602f0ca7 100644
--- a/test/OpenMP/parallel_for_simd_aligned_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_aligned_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
 
 struct B {
   static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
diff --git a/test/OpenMP/parallel_for_simd_ast_print.cpp b/test/OpenMP/parallel_for_simd_ast_print.cpp
index 4192695cf037..cd62fc51af7f 100644
--- a/test/OpenMP/parallel_for_simd_ast_print.cpp
+++ b/test/OpenMP/parallel_for_simd_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/parallel_for_simd_collapse_messages.cpp b/test/OpenMP/parallel_for_simd_collapse_messages.cpp
index b829497e17f3..22090e6b23ef 100644
--- a/test/OpenMP/parallel_for_simd_collapse_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_collapse_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_simd_copyin_messages.cpp b/test/OpenMP/parallel_for_simd_copyin_messages.cpp
index e0b7e6354ebb..1e6fdc9c31b1 100644
--- a/test/OpenMP/parallel_for_simd_copyin_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_copyin_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -o - %s
 
 void foo() {
 }
@@ -22,18 +22,18 @@ public:
   S3() : a(0) {}
   S3 &operator=(S3 &s3) { return *this; }
 };
-class S4 { // expected-note {{'S4' declared here}}
+class S4 {
   int a;
   S4();
-  S4 &operator=(const S4 &s4);
+  S4 &operator=(const S4 &s4); // expected-note {{implicitly declared private here}}
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
   S5() : a(0) {}
-  S5 &operator=(const S5 &s5) { return *this; }
+  S5 &operator=(const S5 &s5) { return *this; } // expected-note {{implicitly declared private here}}
 
 public:
   S5(int v) : a(v) {}
@@ -46,10 +46,18 @@ public:
 
 S2 k;
 S3 h;
-S4 l(3); // expected-note {{'l' defined here}}
-S5 m(4); // expected-note {{'m' defined here}}
+S4 l(3);
+S5 m(4);
 #pragma omp threadprivate(h, k, l, m)
 
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   int i;
 #pragma omp parallel for simd copyin // expected-error {{expected '(' after 'copyin'}}
@@ -70,7 +78,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd copyin(l) // expected-error {{copyin variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel for simd copyin(l) // expected-error {{'operator=' is a private member of 'S4'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for simd copyin(S1) // expected-error {{'S1' does not refer to a value}}
@@ -82,10 +90,10 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd copyin(i) // expected-error {{copyin variable must be threadprivate}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd copyin(m) // expected-error {{copyin variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel for simd copyin(m) // expected-error {{'operator=' is a private member of 'S5'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd copyin(ST < int > ::s) // expected-error {{copyin variable must be threadprivate}}
+#pragma omp parallel for simd copyin(ST < int > ::s, B::x) // expected-error {{copyin variable must be threadprivate}}
   for (i = 0; i < argc; ++i)
     foo();
 
diff --git a/test/OpenMP/parallel_for_simd_default_messages.cpp b/test/OpenMP/parallel_for_simd_default_messages.cpp
index 6675029abe50..9bd8113c8d55 100644
--- a/test/OpenMP/parallel_for_simd_default_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_default_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_for_simd_firstprivate_messages.cpp b/test/OpenMP/parallel_for_simd_firstprivate_messages.cpp
index 876d422e634d..2ff32246dc17 100644
--- a/test/OpenMP/parallel_for_simd_firstprivate_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -136,6 +136,14 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -206,7 +214,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd firstprivate(m) // OK
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+#pragma omp parallel for simd firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
diff --git a/test/OpenMP/parallel_for_simd_if_messages.cpp b/test/OpenMP/parallel_for_simd_if_messages.cpp
index b91dd18635bd..ca327cff7206 100644
--- a/test/OpenMP/parallel_for_simd_if_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_if_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_simd_lastprivate_messages.cpp b/test/OpenMP/parallel_for_simd_lastprivate_messages.cpp
index b620c7fc010b..e85e28b3f18a 100644
--- a/test/OpenMP/parallel_for_simd_lastprivate_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_lastprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -15,15 +15,16 @@ class S2 {
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
-  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  const S2 &operator=(const S2 &) const;
+  static float S2s;
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
 const S2 b;
 const S2 ba[5];
-class S3 { // expected-note 2 {{'S3' declared here}}
+class S3 {
   int a;
-  S3 &operator=(const S3 &s3);
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
 
 public:
   S3() : a(0) {}
@@ -32,17 +33,17 @@ public:
 const S3 c;         // expected-note {{global variable is predetermined as shared}}
 const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
 extern const int f; // expected-note {{global variable is predetermined as shared}}
-class S4 {          // expected-note 3 {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4();          // expected-note 3 {{implicitly declared private here}}
   S4(const S4 &s4);
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
 
 public:
   S5(const S5 &s5) : a(s5.a) {}
@@ -62,8 +63,8 @@ S3 h;
 
 template <class I, class C>
 int foomain(int argc, char **argv) {
-  I e(4); // expected-note {{'e' defined here}}
-  I g(5); // expected-note {{'g' defined here}}
+  I e(4);
+  I g(5);
   int i;
   int &j = i;                        // expected-note {{'j' defined here}}
 #pragma omp parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
@@ -96,7 +97,7 @@ int foomain(int argc, char **argv) {
 #pragma omp parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
   for (int k = 0; k < argc; ++k)
     ++k;
-#pragma omp parallel for simd lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp parallel for simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp parallel for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
@@ -126,12 +127,20 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
-  S4 e(4);               // expected-note {{'e' defined here}}
-  S5 g(5);               // expected-note {{'g' defined here}}
-  S3 m;                  // expected-note 2 {{'m' defined here}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
   S6 n(2);
   int i;
   int &j = i;                        // expected-note {{'j' defined here}}
@@ -181,7 +190,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd lastprivate(xa) // OK
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+#pragma omp parallel for simd lastprivate(S2::S2s)
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for simd lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
@@ -190,13 +199,13 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd safelen(5)
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp parallel for simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel for simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+#pragma omp parallel for simd lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for simd private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
@@ -216,7 +225,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd lastprivate(j) // expected-error {{arguments of OpenMP clause 'lastprivate' cannot be of reference type}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp parallel for simd firstprivate(m) lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel for simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp parallel for simd lastprivate(n) firstprivate(n) // OK
diff --git a/test/OpenMP/parallel_for_simd_linear_messages.cpp b/test/OpenMP/parallel_for_simd_linear_messages.cpp
index 3918de2681db..55b0c3d7f9c8 100644
--- a/test/OpenMP/parallel_for_simd_linear_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_linear_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 namespace X {
   int x;
@@ -148,6 +148,14 @@ template<class I, class C> int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   double darr[100];
   // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
@@ -185,7 +193,7 @@ int main(int argc, char **argv) {
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
   #pragma omp parallel for simd linear(e, g)
   for (int k = 0; k < argc; ++k) ++k;
-  #pragma omp parallel for simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  #pragma omp parallel for simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
   for (int k = 0; k < argc; ++k) ++k;
   #pragma omp parallel
   {
diff --git a/test/OpenMP/parallel_for_simd_loop_messages.cpp b/test/OpenMP/parallel_for_simd_loop_messages.cpp
index 50acb10feed9..0473b248a69d 100644
--- a/test/OpenMP/parallel_for_simd_loop_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_loop_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
 
 class S {
   int a;
@@ -10,7 +10,7 @@ public:
 };
 
 static int sii;
-#pragma omp threadprivate(sii) // expected-note {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
 static int globalii;
 
 int test_iteration_spaces() {
@@ -259,21 +259,18 @@ int test_iteration_spaces() {
     c[ii] = a[ii];
 
   {
-// expected-error@+2 {{loop iteration variable in the associated loop of 'omp parallel for simd' directive may not be threadprivate or thread local, predetermined as linear}}
 #pragma omp parallel for simd
     for (sii = 0; sii < 10; sii += 1)
       c[sii] = a[sii];
   }
 
   {
-// expected-error@+2 {{loop iteration variable in the associated loop of 'omp parallel for simd' directive may not be a variable with global storage without being explicitly marked as linear}}
 #pragma omp parallel for simd
     for (globalii = 0; globalii < 10; globalii += 1)
       c[globalii] = a[globalii];
   }
 
   {
-// expected-error@+3 {{loop iteration variable in the associated loop of 'omp parallel for simd' directive may not be a variable with global storage without being explicitly marked as lastprivate}}
 #pragma omp parallel for simd collapse(2)
     for (ii = 0; ii < 10; ii += 1)
     for (globalii = 0; globalii < 10; globalii += 1)
diff --git a/test/OpenMP/parallel_for_simd_messages.cpp b/test/OpenMP/parallel_for_simd_messages.cpp
index 67a025c5e6a6..fe14883080c0 100644
--- a/test/OpenMP/parallel_for_simd_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -o - %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_simd_misc_messages.c b/test/OpenMP/parallel_for_simd_misc_messages.c
index 4cb084318309..ed9ac4b5242a 100644
--- a/test/OpenMP/parallel_for_simd_misc_messages.c
+++ b/test/OpenMP/parallel_for_simd_misc_messages.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
 
 // expected-error@+1 {{unexpected OpenMP directive '#pragma omp parallel for simd'}}
 #pragma omp parallel for simd
diff --git a/test/OpenMP/parallel_for_simd_num_threads_messages.cpp b/test/OpenMP/parallel_for_simd_num_threads_messages.cpp
index 3d11d4f59cf6..5b5d33464ef8 100644
--- a/test/OpenMP/parallel_for_simd_num_threads_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_num_threads_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_simd_private_messages.cpp b/test/OpenMP/parallel_for_simd_private_messages.cpp
index 67d881318645..130736a33a07 100644
--- a/test/OpenMP/parallel_for_simd_private_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -108,6 +108,14 @@ int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
@@ -146,7 +154,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (int k = 0; k < argc; ++k)
     ++k;
-#pragma omp parallel for simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp parallel for simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp parallel for simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp parallel for simd'}}
diff --git a/test/OpenMP/parallel_for_simd_proc_bind_messages.cpp b/test/OpenMP/parallel_for_simd_proc_bind_messages.cpp
index bc1e0d2b243b..a05b150f5694 100644
--- a/test/OpenMP/parallel_for_simd_proc_bind_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_proc_bind_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_for_simd_reduction_messages.cpp b/test/OpenMP/parallel_for_simd_reduction_messages.cpp
index 61690ddd42c1..75733a3647ac 100644
--- a/test/OpenMP/parallel_for_simd_reduction_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -o - %s
 
 void foo() {
 }
@@ -11,12 +11,12 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
-  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static float S2s;
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;                       // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
-  T fl;                            // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
     foo();
@@ -96,10 +96,10 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel for simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name}}
@@ -114,7 +114,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp parallel for simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified variable cannot be reduction}}
@@ -132,10 +132,10 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp parallel for simd reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}} expected-error {{a reduction variable with array type 'const float [5]'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp parallel for simd reduction(&& : S2::S2s)
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
@@ -144,7 +144,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp parallel for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp parallel for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -177,18 +177,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;                      // expected-note 2 {{'j' defined here}}
   S3 &p = k;                       // expected-note 2 {{'p' defined here}}
   const int &r = da[i];            // expected-note {{'r' defined here}}
   int &q = qa[i];                  // expected-note {{'q' defined here}}
-  float fl;                        // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
     foo();
@@ -228,7 +236,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}}
@@ -246,22 +254,22 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp parallel for simd reduction(&& : S2::S2s)
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp parallel for simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+#pragma omp parallel for simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp parallel for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
diff --git a/test/OpenMP/parallel_for_simd_safelen_messages.cpp b/test/OpenMP/parallel_for_simd_safelen_messages.cpp
index 3fef81c74735..eb0aa5ae8b5e 100644
--- a/test/OpenMP/parallel_for_simd_safelen_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_safelen_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_for_simd_schedule_messages.cpp b/test/OpenMP/parallel_for_simd_schedule_messages.cpp
index 9e153d99a0a4..36befb213559 100644
--- a/test/OpenMP/parallel_for_simd_schedule_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_schedule_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_if_codegen.cpp b/test/OpenMP/parallel_if_codegen.cpp
index 44c874f1fb06..3461743221e0 100644
--- a/test/OpenMP/parallel_if_codegen.cpp
+++ b/test/OpenMP/parallel_if_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK %s
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -16,7 +16,7 @@ int Arg;
 
 // CHECK-LABEL: define void @{{.+}}gtid_test
 void gtid_test() {
-// CHECK:  call void {{.+}}* @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 1, {{.+}}* [[GTID_TEST_REGION1:@.+]] to void
+// CHECK:  call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 1, {{.+}}* [[GTID_TEST_REGION1:@.+]] to void
 #pragma omp parallel
 #pragma omp parallel if (false)
   gtid_test();
@@ -25,10 +25,10 @@ void gtid_test() {
 
 // CHECK: define internal void [[GTID_TEST_REGION1]](i{{.+}}* [[GTID_PARAM:%.+]], i
 // CHECK: store i{{[0-9]+}}* [[GTID_PARAM]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]],
-// CHECK: [[GTID_ADDR:%.+]] = load i{{[0-9]+}}** [[GTID_ADDR_REF]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}* [[GTID_ADDR]]
+// CHECK: [[GTID_ADDR:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_ADDR]]
 // CHECK: call void @__kmpc_serialized_parallel(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]])
-// CHECK: [[GTID_ADDR:%.+]] = load i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID_ADDR:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
 // CHECK: call void [[GTID_TEST_REGION2:@.+]](i{{[0-9]+}}* [[GTID_ADDR]]
 // CHECK: call void @__kmpc_end_serialized_parallel(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]])
 // CHECK: ret void
diff --git a/test/OpenMP/parallel_if_messages.cpp b/test/OpenMP/parallel_if_messages.cpp
index 1559692a989d..97096dfae3e8 100644
--- a/test/OpenMP/parallel_if_messages.cpp
+++ b/test/OpenMP/parallel_if_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_messages.cpp b/test/OpenMP/parallel_messages.cpp
index 1e0edbc6f808..8aee8414f034 100644
--- a/test/OpenMP/parallel_messages.cpp
+++ b/test/OpenMP/parallel_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 -o - %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_num_threads_codegen.cpp b/test/OpenMP/parallel_num_threads_codegen.cpp
index c095e430b60b..2342c47f0be2 100644
--- a/test/OpenMP/parallel_num_threads_codegen.cpp
+++ b/test/OpenMP/parallel_num_threads_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple %itanium_abi_triple -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple %itanium_abi_triple -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -10,7 +10,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[S_TY:%.+]] = type { [[INTPTR_T_TY:i[0-9]+]], [[INTPTR_T_TY]], [[INTPTR_T_TY]] }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
@@ -48,11 +48,11 @@ int main() {
 // CHECK:       [[S_CHAR_OP:%.+]] = invoke{{.*}} i8 [[S_TY_CHAR_OP:@.+]]([[S_TY]]* [[S_ADDR]])
 // CHECK:       store i8 [[S_CHAR_OP]], i8* [[A_ADDR]]
 // CHECK:       call void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 2)
-// CHECK:       call void {{.*}}* @__kmpc_fork_call(
-// CHECK:       [[A_VAL:%.+]] = load i8* [[A_ADDR]]
+// CHECK:       call void {{.*}} @__kmpc_fork_call(
+// CHECK:       [[A_VAL:%.+]] = load i8, i8* [[A_ADDR]]
 // CHECK:       [[RES:%.+]] = sext i8 [[A_VAL]] to i32
 // CHECK:       call void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 [[RES]])
-// CHECK:       call void {{.*}}* @__kmpc_fork_call(
+// CHECK:       call void {{.*}} @__kmpc_fork_call(
 // CHECK:       invoke{{.*}} [[INT_TY:i[0-9]+]] [[TMAIN_CHAR_5:@.+]]()
 // CHECK:       invoke{{.*}} [[INT_TY]] [[TMAIN_S_1:@.+]]()
 // CHECK:       call {{.*}} [[S_TY_DESTR:@.+]]([[S_TY]]* [[S_ADDR]])
@@ -62,22 +62,22 @@ int main() {
 // CHECK:       define{{.*}} [[INT_TY]] [[TMAIN_CHAR_5]]()
 // CHECK:       [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEF_LOC_2]])
 // CHECK:       call void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 5)
-// CHECK:       call void {{.*}}* @__kmpc_fork_call(
+// CHECK:       call void {{.*}} @__kmpc_fork_call(
 // CHECK:       call void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 23)
-// CHECK:       call void {{.*}}* @__kmpc_fork_call(
+// CHECK:       call void {{.*}} @__kmpc_fork_call(
 // CHECK:       ret [[INT_TY]] 0
 // CHECK-NEXT:  }
 
 // CHECK:       define{{.*}} [[INT_TY]] [[TMAIN_S_1]]()
 // CHECK:       [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEF_LOC_2]])
 // CHECK:       call void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 1)
-// CHECK:       call void {{.*}}* @__kmpc_fork_call(
+// CHECK:       call void {{.*}} @__kmpc_fork_call(
 // CHECK:       call {{.*}} [[S_TY_CONSTR]]([[S_TY]]* [[S_TEMP:%.+]], [[INTPTR_T_TY]] [[INTPTR_T_TY_ATTR]]23)
 // CHECK:       [[S_CHAR_OP:%.+]] = invoke{{.*}} i8 [[S_TY_CHAR_OP]]([[S_TY]]* [[S_TEMP]])
 // CHECK:       [[RES:%.+]] = sext {{.*}}i8 [[S_CHAR_OP]] to i32
 // CHECK:       call void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 [[RES]])
 // CHECK:       call {{.*}} [[S_TY_DESTR]]([[S_TY]]* [[S_TEMP]])
-// CHECK:       call void {{.*}}* @__kmpc_fork_call(
+// CHECK:       call void {{.*}} @__kmpc_fork_call(
 // CHECK:       ret [[INT_TY]] 0
 // CHECK:       }
 
diff --git a/test/OpenMP/parallel_num_threads_messages.cpp b/test/OpenMP/parallel_num_threads_messages.cpp
index facca5e35879..180d9cd8035a 100644
--- a/test/OpenMP/parallel_num_threads_messages.cpp
+++ b/test/OpenMP/parallel_num_threads_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
@@ -9,6 +9,8 @@ bool foobool(int argc) {
 
 struct S1; // expected-note {{declared here}}
 
+#define redef_num_threads(a, b) num_threads(a)
+
 template <class T, typename S, int N> // expected-note {{declared here}}
 T tmain(T argc, S **argv) {
   #pragma omp parallel num_threads // expected-error {{expected '(' after 'num_threads'}}
@@ -22,6 +24,7 @@ T tmain(T argc, S **argv) {
   #pragma omp parallel num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
   #pragma omp parallel num_threads (argc)
   #pragma omp parallel num_threads (N) // expected-error {{argument to 'num_threads' clause must be a positive integer value}}
+  #pragma omp parallel redef_num_threads (argc, argc)
   foo();
 
   return argc;
@@ -38,6 +41,7 @@ int main(int argc, char **argv) {
   #pragma omp parallel num_threads (S1) // expected-error {{'S1' does not refer to a value}}
   #pragma omp parallel num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
   #pragma omp parallel num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  #pragma omp parallel redef_num_threads (argc, argc)
   foo();
 
   return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
diff --git a/test/OpenMP/parallel_private_codegen.cpp b/test/OpenMP/parallel_private_codegen.cpp
index 6911068250f5..99e2d4d404db 100644
--- a/test/OpenMP/parallel_private_codegen.cpp
+++ b/test/OpenMP/parallel_private_codegen.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -22,7 +22,6 @@ volatile int g = 1212;
 // CHECK: [[CAP_MAIN_TY:%.+]] = type { [2 x i{{[0-9]+}}]*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]* }
 // CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
 // CHECK: [[CAP_TMAIN_TY:%.+]] = type { [2 x i{{[0-9]+}}]*, i{{[0-9]+}}*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
-// CHECK: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 template <typename T>
 T tmain() {
   S<T> test;
@@ -42,31 +41,30 @@ int main() {
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // LAMBDA: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // LAMBDA: store i{{[0-9]+}}* [[G]], i{{[0-9]+}}** [[G_LOCAL_REF]]
   // LAMBDA: [[ARG:%.+]] = bitcast %{{.+}}* [[AGG_CAPTURED]] to i8*
-  // LAMBDA: call void {{.+}}* @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
 #pragma omp parallel private(g)
   {
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
     // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
-    // LAMBDA: call i32 @__kmpc_cancel_barrier(
     g = 1;
     // LAMBDA: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
-    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
     // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
-    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call{{( x86_thiscallcc)?}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
     [&]() {
       // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
       // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
       g = 2;
-      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}** [[ARG_PTR_REF]]
-      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
       // LAMBDA: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
     }();
   }
@@ -75,25 +73,24 @@ int main() {
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
-  // BLOCKS: call void {{%.+}}(i8*
+  // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // BLOCKS: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // BLOCKS: store i{{[0-9]+}}* [[G]], i{{[0-9]+}}** [[G_LOCAL_REF]]
   // BLOCKS: [[ARG:%.+]] = bitcast %{{.+}}* [[AGG_CAPTURED]] to i8*
-  // BLOCKS: call void {{.+}}* @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
 #pragma omp parallel private(g)
   {
     // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
     // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
     // BLOCKS: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
-    // BLOCKS: call i32 @__kmpc_cancel_barrier(
     g = 1;
     // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
     // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
     // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
     // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
-    // BLOCKS: call void {{%.+}}(i8*
+    // BLOCKS: call void {{%.+}}(i8
     ^{
       // BLOCKS: define {{.+}} void {{@.+}}(i8*
       g = 2;
@@ -124,7 +121,7 @@ int main() {
 // CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...)* @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
 // CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
 // CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // CHECK: ret
@@ -143,9 +140,6 @@ int main() {
 // CHECK-NOT: [[T_VAR_PRIV]]
 // CHECK-NOT: [[VEC_PRIV]]
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}** [[GTID_ADDR_REF]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
 // CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
 // CHECK: ret void
@@ -153,7 +147,7 @@ int main() {
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...)* @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
 // CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
@@ -171,9 +165,6 @@ int main() {
 // CHECK-NOT: [[T_VAR_PRIV]]
 // CHECK-NOT: [[VEC_PRIV]]
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}** [[GTID_ADDR_REF]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
 // CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
 // CHECK: ret void
diff --git a/test/OpenMP/parallel_private_messages.cpp b/test/OpenMP/parallel_private_messages.cpp
index 14c5cbdb8aea..a637b5018f53 100644
--- a/test/OpenMP/parallel_private_messages.cpp
+++ b/test/OpenMP/parallel_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
@@ -13,7 +13,7 @@ class S2 {
   mutable int a;
 public:
   S2():a(0) { }
-  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static float S2s;
 };
 const S2 b;
 const S2 ba[5];
@@ -41,6 +41,14 @@ public:
 int threadvar;
 #pragma omp threadprivate(threadvar) // expected-note {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5; // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = { 0 }; // expected-note {{constant variable is predetermined as shared}}
@@ -61,9 +69,9 @@ int main(int argc, char **argv) {
   #pragma omp parallel private(ba)
   #pragma omp parallel private(ca) // expected-error {{shared variable cannot be private}}
   #pragma omp parallel private(da) // expected-error {{shared variable cannot be private}}
-  #pragma omp parallel private(S2::S2s) // expected-error {{shared variable cannot be private}}
+  #pragma omp parallel private(S2::S2s)
   #pragma omp parallel private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
-  #pragma omp parallel private(threadvar) // expected-error {{threadprivate or thread local variable cannot be private}}
+  #pragma omp parallel private(threadvar, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   #pragma omp parallel shared(i), private(i) // expected-error {{shared variable cannot be private}} expected-note {{defined as shared}}
   foo();
   #pragma omp parallel firstprivate(i) private(i) // expected-error {{firstprivate variable cannot be private}} expected-note {{defined as firstprivate}}
diff --git a/test/OpenMP/parallel_proc_bind_messages.cpp b/test/OpenMP/parallel_proc_bind_messages.cpp
index 0bb9fc7db626..78ba297fdf5d 100644
--- a/test/OpenMP/parallel_proc_bind_messages.cpp
+++ b/test/OpenMP/parallel_proc_bind_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_reduction_codegen.cpp b/test/OpenMP/parallel_reduction_codegen.cpp
new file mode 100644
index 000000000000..9ce16e919290
--- /dev/null
+++ b/test/OpenMP/parallel_reduction_codegen.cpp
@@ -0,0 +1,683 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+volatile int g = 1212;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  operator T() { return T(); }
+  S &operator&(const S &) { return *this; }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i{{[0-9]+}}]*, float*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float* }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i{{[0-9]+}}]*, i{{[0-9]+}}*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*, [[S_INT_TY]]*, i{{[0-9]+}}* }
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
+
+template <typename T>
+T tmain() {
+  T t;
+  S<T> test;
+  T t_var = T(), t_var1;
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3), var1;
+#pragma omp parallel reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // LAMBDA: store i{{[0-9]+}}* [[G]], i{{[0-9]+}}** [[G_LOCAL_REF]]
+  // LAMBDA: [[ARG:%.+]] = bitcast %{{.+}}* [[AGG_CAPTURED]] to i8*
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
+#pragma omp parallel reduction(+:g)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+
+    // Reduction list for runtime.
+    // LAMBDA: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+    // LAMBDA: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    // LAMBDA: [[ARG:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_REF]]
+    // LAMBDA: [[G_REF_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_REF_ADDR]]
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    g = 1;
+    // LAMBDA: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+
+    // LAMBDA: [[G_PRIV_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i32 0, i32 0
+    // LAMBDA: [[BITCAST:%.+]] = bitcast i32* [[G_PRIVATE_ADDR]] to i8*
+    // LAMBDA: store i8* [[BITCAST]], i8** [[G_PRIV_REF]],
+    // LAMBDA: call i32 @__kmpc_reduce_nowait(
+    // LAMBDA: switch i32 %{{.+}}, label %[[REDUCTION_DONE:.+]] [
+    // LAMBDA: i32 1, label %[[CASE1:.+]]
+    // LAMBDA: i32 2, label %[[CASE2:.+]]
+    // LAMBDA: [[CASE1]]
+    // LAMBDA: [[G_VAL:%.+]] = load i32, i32* [[G_REF]]
+    // LAMBDA: [[G_PRIV_VAL:%.+]] = load i32, i32* [[G_PRIVATE_ADDR]]
+    // LAMBDA: [[ADD:%.+]] = add nsw i32 [[G_VAL]], [[G_PRIV_VAL]]
+    // LAMBDA: store i32 [[ADD]], i32* [[G_REF]]
+    // LAMBDA: call void @__kmpc_end_reduce_nowait(
+    // LAMBDA: br label %[[REDUCTION_DONE]]
+    // LAMBDA: [[CASE2]]
+    // LAMBDA: [[G_PRIV_VAL:%.+]] = load i32, i32* [[G_PRIVATE_ADDR]]
+    // LAMBDA: atomicrmw add i32* [[G_REF]], i32 [[G_PRIV_VAL]] monotonic
+    // LAMBDA: br label %[[REDUCTION_DONE]]
+    // LAMBDA: [[REDUCTION_DONE]]
+    // LAMBDA: ret void
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[G_LOCAL_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[AGG_CAPTURED:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // BLOCKS: store i{{[0-9]+}}* [[G]], i{{[0-9]+}}** [[G_LOCAL_REF]]
+  // BLOCKS: [[ARG:%.+]] = bitcast %{{.+}}* [[AGG_CAPTURED]] to i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* [[ARG]])
+#pragma omp parallel reduction(-:g)
+  {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+
+    // Reduction list for runtime.
+    // BLOCKS: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+    // BLOCKS: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    // BLOCKS: [[ARG:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_REF]]
+    // BLOCKS: [[G_REF_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // BLOCKS: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_REF_ADDR]]
+    // BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    g = 1;
+    // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+
+    // BLOCKS: [[G_PRIV_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i32 0, i32 0
+    // BLOCKS: [[BITCAST:%.+]] = bitcast i32* [[G_PRIVATE_ADDR]] to i8*
+    // BLOCKS: store i8* [[BITCAST]], i8** [[G_PRIV_REF]],
+    // BLOCKS: call i32 @__kmpc_reduce_nowait(
+    // BLOCKS: switch i32 %{{.+}}, label %[[REDUCTION_DONE:.+]] [
+    // BLOCKS: i32 1, label %[[CASE1:.+]]
+    // BLOCKS: i32 2, label %[[CASE2:.+]]
+    // BLOCKS: [[CASE1]]
+    // BLOCKS: [[G_VAL:%.+]] = load i32, i32* [[G_REF]]
+    // BLOCKS: [[G_PRIV_VAL:%.+]] = load i32, i32* [[G_PRIVATE_ADDR]]
+    // BLOCKS: [[ADD:%.+]] = add nsw i32 [[G_VAL]], [[G_PRIV_VAL]]
+    // BLOCKS: store i32 [[ADD]], i32* [[G_REF]]
+    // BLOCKS: call void @__kmpc_end_reduce_nowait(
+    // BLOCKS: br label %[[REDUCTION_DONE]]
+    // BLOCKS: [[CASE2]]
+    // BLOCKS: [[G_PRIV_VAL:%.+]] = load i32, i32* [[G_PRIVATE_ADDR]]
+    // BLOCKS: atomicrmw add i32* [[G_REF]], i32 [[G_PRIV_VAL]] monotonic
+    // BLOCKS: br label %[[REDUCTION_DONE]]
+    // BLOCKS: [[REDUCTION_DONE]]
+    // BLOCKS: ret void
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  float t_var = 0, t_var1;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3), var1;
+#pragma omp parallel reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_MAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca float,
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca float,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR_REF:%.+]] = load float*, float** [[T_VAR_PTR_REF]],
+// For + reduction operation initial value of private variable is 0.
+// CHECK: store float 0.0{{.+}}, float* [[T_VAR_PRIV]],
+
+// CHECK: [[VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_PTR_REF:%.+]],
+// For & reduction operation initial value of private variable is ones in all bits.
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+
+// CHECK: [[VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR1_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_PTR_REF:%.+]],
+// For && reduction operation initial value of private variable is 1.0.
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[VAR1_PRIV]])
+
+// CHECK: [[T_VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR1_REF:%.+]] = load float*, float** [[T_VAR1_PTR_REF]],
+// For min reduction operation initial value of private variable is largest repesentable value.
+// CHECK: store float 0x47EFFFFFE0000000, float* [[T_VAR1_PRIV]],
+
+// Skip checks for internal operations.
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 0
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 3
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce_nowait(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: [[T_VAR_VAL:%.+]] = load float, float* [[T_VAR_REF]],
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load float, float* [[T_VAR_PRIV]],
+// CHECK: [[UP:%.+]] = fadd float [[T_VAR_VAL]], [[T_VAR_PRIV_VAL]]
+// CHECK: store float [[UP]], float* [[T_VAR_REF]],
+
+// var = var.operator &(var_reduction);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = uitofp i1 [[COND_LVALUE]] to float
+// CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: [[T_VAR1_VAL:%.+]] = load float, float* [[T_VAR1_REF]],
+// CHECK: [[T_VAR1_PRIV_VAL:%.+]] = load float, float* [[T_VAR1_PRIV]],
+// CHECK: [[CMP:%.+]] = fcmp olt float [[T_VAR1_VAL]], [[T_VAR1_PRIV_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi float
+// CHECK: store float [[UP]], float* [[T_VAR1_REF]],
+
+// __kmpc_end_reduce_nowait(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: load float, float* [[T_VAR_PRIV]]
+// CHECK: [[T_VAR_REF_INT:%.+]] = bitcast float* [[T_VAR_REF]] to i32*
+// CHECK: [[OLD1:%.+]] = load atomic i32, i32* [[T_VAR_REF_INT]] monotonic,
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[ORIG_OLD_INT:%.+]] = phi i32 [ [[OLD1]], %{{.+}} ], [ [[OLD2:%.+]], %[[CONT]] ]
+// CHECK: fadd float
+// CHECK: [[UP_INT:%.+]] = load i32
+// CHECK: [[T_VAR_REF_INT:%.+]] = bitcast float* [[T_VAR_REF]] to i32*
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[T_VAR_REF_INT]], i32 [[ORIG_OLD_INT]], i32 [[UP_INT]] monotonic monotonic
+// CHECK: [[OLD2:%.+]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[ATOMIC_DONE:.+]], label %[[CONT]]
+// CHECK: [[ATOMIC_DONE]]
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = uitofp i1 [[COND_LVALUE]] to float
+// CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: load float, float* [[T_VAR1_PRIV]]
+// CHECK: [[T_VAR1_REF_INT:%.+]] = bitcast float* [[T_VAR1_REF]] to i32*
+// CHECK: [[OLD1:%.+]] = load atomic i32, i32* [[T_VAR1_REF_INT]] monotonic,
+// CHECK: br label %[[CONT:.+]]
+// CHECK: [[CONT]]
+// CHECK: [[ORIG_OLD_INT:%.+]] = phi i32 [ [[OLD1]], %{{.+}} ], [ [[OLD2:%.+]], %{{.+}} ]
+// CHECK: [[CMP:%.+]] = fcmp olt float
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi float
+// CHECK: [[UP_INT:%.+]] = load i32
+// CHECK: [[T_VAR1_REF_INT:%.+]] = bitcast float* [[T_VAR1_REF]] to i32*
+// CHECK: [[RES:%.+]] = cmpxchg i32* [[T_VAR1_REF_INT]], i32 [[ORIG_OLD_INT]], i32 [[UP_INT]] monotonic monotonic
+// CHECK: [[OLD2:%.+]] = extractvalue { i32, i1 } [[RES]], 0
+// CHECK: [[SUCCESS_FAIL:%.+]] = extractvalue { i32, i1 } [[RES]], 1
+// CHECK: br i1 [[SUCCESS_FAIL]], label %[[ATOMIC_DONE:.+]], label %[[CONT]]
+// CHECK: [[ATOMIC_DONE]]
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_lhs = (float*)lhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to float*
+// t_var_rhs = (float*)rhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to float*
+
+// var_lhs = (S<float>*)lhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_FLOAT_TY]]*
+// var_rhs = (S<float>*)rhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// var1_lhs = (S<float>*)lhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_FLOAT_TY]]*
+// var1_rhs = (S<float>*)rhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// t_var1_lhs = (float*)lhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to float*
+// t_var1_rhs = (float*)rhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to float*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: [[T_VAR_LHS_VAL:%.+]] = load float, float* [[T_VAR_LHS]],
+// CHECK: [[T_VAR_RHS_VAL:%.+]] = load float, float* [[T_VAR_RHS]],
+// CHECK: [[UP:%.+]] = fadd float [[T_VAR_LHS_VAL]], [[T_VAR_RHS_VAL]]
+// CHECK: store float [[UP]], float* [[T_VAR_LHS]],
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_LHS]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_RHS]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_LHS]])
+// CHECK: [[VAR1_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_RHS]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = fcmp une float [[TO_FLOAT]], 0.0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = uitofp i1 [[COND_LVALUE]] to float
+// CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load float, float* [[T_VAR1_LHS]],
+// CHECK: [[T_VAR1_RHS_VAL:%.+]] = load float, float* [[T_VAR1_RHS]],
+// CHECK: [[CMP:%.+]] = fcmp olt float [[T_VAR1_LHS_VAL]], [[T_VAR1_RHS_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi float
+// CHECK: store float [[UP]], float* [[T_VAR1_LHS]],
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_TMAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca i{{[0-9]+}},
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// For + reduction operation initial value of private variable is 0.
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// CHECK: [[VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_PTR_REF:%.+]],
+// For & reduction operation initial value of private variable is ones in all bits.
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[VAR_PRIV]])
+
+// CHECK: [[VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_PTR_REF:%.+]],
+// For && reduction operation initial value of private variable is 1.0.
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[VAR1_PRIV]])
+
+// CHECK: [[T_VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR1_PTR_REF]],
+// For min reduction operation initial value of private variable is largest repesentable value.
+// CHECK: store i{{[0-9]+}} 2147483647, i{{[0-9]+}}* [[T_VAR1_PRIV]],
+
+// Skip checks for internal operations.
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 0
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 3
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce_nowait(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: [[UP:%.+]] = add nsw i{{[0-9]+}} [[T_VAR_VAL]], [[T_VAR_PRIV_VAL]]
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR_REF]],
+
+// var = var.operator &(var_reduction);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
+// CHECK: [[T_VAR1_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_PRIV]],
+// CHECK: [[CMP:%.+]] = icmp slt i{{[0-9]+}} [[T_VAR1_VAL]], [[T_VAR1_PRIV_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi i32
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR1_REF]],
+
+// __kmpc_end_reduce_nowait(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]]
+// CHECK: atomicrmw add i32* [[T_VAR_REF]], i32 [[T_VAR_PRIV_VAL]] monotonic
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: [[T_VAR1_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_PRIV]]
+// CHECK: atomicrmw min i32* [[T_VAR1_REF]], i32 [[T_VAR1_PRIV_VAL]] monotonic
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_lhs = (i{{[0-9]+}}*)lhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to i{{[0-9]+}}*
+// t_var_rhs = (i{{[0-9]+}}*)rhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to i{{[0-9]+}}*
+
+// var_lhs = (S<i{{[0-9]+}}>*)lhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_INT_TY]]*
+// var_rhs = (S<i{{[0-9]+}}>*)rhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_INT_TY]]*
+
+// var1_lhs = (S<i{{[0-9]+}}>*)lhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_INT_TY]]*
+// var1_rhs = (S<i{{[0-9]+}}>*)rhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_INT_TY]]*
+
+// t_var1_lhs = (i{{[0-9]+}}*)lhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to i{{[0-9]+}}*
+// t_var1_rhs = (i{{[0-9]+}}*)rhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to i{{[0-9]+}}*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: [[T_VAR_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_LHS]],
+// CHECK: [[T_VAR_RHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_RHS]],
+// CHECK: [[UP:%.+]] = add nsw i{{[0-9]+}} [[T_VAR_LHS_VAL]], [[T_VAR_RHS_VAL]]
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR_LHS]],
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_RHS]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
+// CHECK: [[T_VAR1_RHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_RHS]],
+// CHECK: [[CMP:%.+]] = icmp slt i{{[0-9]+}} [[T_VAR1_LHS_VAL]], [[T_VAR1_RHS_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi i32
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR1_LHS]],
+// CHECK: ret void
+
+#endif
+
diff --git a/test/OpenMP/parallel_reduction_messages.cpp b/test/OpenMP/parallel_reduction_messages.cpp
index 43ebc01b2a36..c93cfbd9efaf 100644
--- a/test/OpenMP/parallel_reduction_messages.cpp
+++ b/test/OpenMP/parallel_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo() {
 }
@@ -11,12 +11,12 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
-  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static float S2s;
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;                   // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i];     // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];           // expected-note 2 {{'q' defined here}}
-  T fl;                        // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp parallel reduction // expected-error {{expected '(' after 'reduction'}}
   foo();
 #pragma omp parallel reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}}
@@ -89,9 +89,9 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   foo();
 #pragma omp parallel reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
   foo();
-#pragma omp parallel reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
-#pragma omp parallel reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
 #pragma omp parallel reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name}}
   foo();
@@ -101,7 +101,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   foo();
 #pragma omp parallel reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   foo();
-#pragma omp parallel reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp parallel reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   foo();
 #pragma omp parallel reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified variable cannot be reduction}}
   foo();
@@ -113,15 +113,15 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   foo();
 #pragma omp parallel reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}} expected-error {{a reduction variable with array type 'const float [5]'}}
   foo();
-#pragma omp parallel reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
-#pragma omp parallel reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp parallel reduction(&& : S2::S2s)
   foo();
 #pragma omp parallel reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
   foo();
 #pragma omp parallel reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   foo();
-#pragma omp parallel reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp parallel reduction(+ : o) // expected-error {{no viable overloaded '='}}
   foo();
 #pragma omp parallel private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   foo();
@@ -150,18 +150,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;                  // expected-note 2 {{'j' defined here}}
   S3 &p = k;                   // expected-note 2 {{'p' defined here}}
   const int &r = da[i];        // expected-note {{'r' defined here}}
   int &q = qa[i];              // expected-note {{'q' defined here}}
-  float fl;                    // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp parallel reduction // expected-error {{expected '(' after 'reduction'}}
   foo();
 #pragma omp parallel reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}}
@@ -188,7 +196,7 @@ int main(int argc, char **argv) {
   foo();
 #pragma omp parallel reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
   foo();
-#pragma omp parallel reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp parallel reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   foo();
 #pragma omp parallel reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}}
   foo();
@@ -200,17 +208,17 @@ int main(int argc, char **argv) {
   foo();
 #pragma omp parallel reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}}
   foo();
-#pragma omp parallel reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
-#pragma omp parallel reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp parallel reduction(&& : S2::S2s)
   foo();
 #pragma omp parallel reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
   foo();
-#pragma omp parallel reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp parallel reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{nvalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   foo();
-#pragma omp parallel reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+#pragma omp parallel reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
   foo();
-#pragma omp parallel reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp parallel reduction(+ : o) // expected-error {{no viable overloaded '='}}
   foo();
 #pragma omp parallel private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   foo();
diff --git a/test/OpenMP/parallel_sections_ast_print.cpp b/test/OpenMP/parallel_sections_ast_print.cpp
index 43665f743952..7667e45bfdbc 100644
--- a/test/OpenMP/parallel_sections_ast_print.cpp
+++ b/test/OpenMP/parallel_sections_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/parallel_sections_codegen.cpp b/test/OpenMP/parallel_sections_codegen.cpp
new file mode 100644
index 000000000000..c16cc8d5edd7
--- /dev/null
+++ b/test/OpenMP/parallel_sections_codegen.cpp
@@ -0,0 +1,98 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+// CHECK: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-LABEL: foo
+void foo() {};
+// CHECK-LABEL: bar
+void bar() {};
+
+template <class T>
+T tmain() {
+#pragma omp parallel sections
+  {
+    foo();
+  }
+  return T();
+}
+
+// CHECK-LABEL: @main
+int main() {
+// CHECK: call void (%{{.+}}*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK-LABEL: }
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+#pragma omp parallel sections
+  {
+// CHECK:      store i32 0, i32* [[LB_PTR:%.+]],
+// CHECK:      store i32 1, i32* [[UB_PTR:%.+]],
+// CHECK:      [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK:      [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK:      call void @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_PTR:%.+]], i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[STRIDE_PTR:%.+]], i32 1, i32 1)
+// <<UB = min(UB, GlobalUB);>>
+// CHECK:      [[UB:%.+]] = load i32, i32* [[UB_PTR]]
+// CHECK:      [[CMP:%.+]] = icmp slt i32 [[UB]], 1
+// CHECK:      [[MIN_UB_GLOBALUB:%.+]] = select i1 [[CMP]], i32 [[UB]], i32 1
+// CHECK:      store i32 [[MIN_UB_GLOBALUB]], i32* [[UB_PTR]]
+// <<IV = LB;>>
+// CHECK:      [[LB:%.+]] = load i32, i32* [[LB_PTR]]
+// CHECK:      store i32 [[LB]], i32* [[IV_PTR:%.+]]
+// CHECK:      br label %[[INNER_FOR_COND:.+]]
+// CHECK:      [[INNER_FOR_COND]]
+// <<IV <= UB?>>
+// CHECK:      [[IV:%.+]] = load i32, i32* [[IV_PTR]]
+// CHECK:      [[UB:%.+]] = load i32, i32* [[UB_PTR]]
+// CHECK:      [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK:      br i1 [[CMP]], label %[[INNER_LOOP_BODY:.+]], label %[[INNER_LOOP_END:.+]]
+// CHECK:      [[INNER_LOOP_BODY]]
+// <<TRUE>> - > <BODY>
+// CHECK:      [[IV:%.+]] = load i32, i32* [[IV_PTR]]
+// CHECK:      switch i32 [[IV]], label %[[SECTIONS_EXIT:.+]] [
+// CHECK-NEXT: i32 0, label %[[SECTIONS_CASE0:.+]]
+// CHECK-NEXT: i32 1, label %[[SECTIONS_CASE1:.+]]
+#pragma omp section
+// CHECK:      [[SECTIONS_CASE0]]
+// CHECK-NEXT: invoke void @{{.*}}foo{{.*}}()
+// CHECK:      br label %[[SECTIONS_EXIT]]
+    foo();
+#pragma omp section
+// CHECK:      [[SECTIONS_CASE1]]
+// CHECK-NEXT: invoke void @{{.*}}bar{{.*}}()
+// CHECK:      br label %[[SECTIONS_EXIT]]
+    bar();
+// CHECK:      [[SECTIONS_EXIT]]
+// <<++IV;>>
+// CHECK:      [[IV:%.+]] = load i32, i32* [[IV_PTR]]
+// CHECK-NEXT: [[INC:%.+]] = add nsw i32 [[IV]], 1
+// CHECK-NEXT: store i32 [[INC]], i32* [[IV_PTR]]
+// CHECK-NEXT: br label %[[INNER_FOR_COND]]
+// CHECK:      [[INNER_LOOP_END]]
+  }
+// CHECK:      call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+// CHECK:      call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]],
+  return tmain<int>();
+}
+
+// CHECK-LABEL: tmain
+// CHECK:       call void {{.*}} @__kmpc_fork_call(
+// CHECK-NOT:   __kmpc_global_thread_num
+// CHECK:       [[RES:%.+]] = call i32 @__kmpc_single(
+// CHECK-NEXT:  [[BOOLRES:%.+]] = icmp ne i32 [[RES]], 0
+// CHECK-NEXT:  br i1 [[BOOLRES]], label %[[THEN:.+]], label %[[END:.+]]
+// CHECK:       [[THEN]]
+// CHECK-NEXT:  invoke void @{{.*}}foo{{.*}}()
+// CHECK-NEXT:  unwind label %[[TERM_LPAD:.+]]
+// CHECK:       call void @__kmpc_end_single(
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK-NEXT:  call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]],
+// CHECK-NEXT:  ret
+// CHECK:       [[TERM_LPAD]]
+// CHECK:       call void @__clang_call_terminate(i8*
+// CHECK-NEXT:  unreachable
+
+#endif
diff --git a/test/OpenMP/parallel_sections_copyin_messages.cpp b/test/OpenMP/parallel_sections_copyin_messages.cpp
index 500417e113be..62b5d05db065 100644
--- a/test/OpenMP/parallel_sections_copyin_messages.cpp
+++ b/test/OpenMP/parallel_sections_copyin_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo() {
 }
@@ -22,18 +22,18 @@ public:
   S3() : a(0) {}
   S3 &operator=(S3 &s3) { return *this; }
 };
-class S4 { // expected-note {{'S4' declared here}}
+class S4 {
   int a;
   S4();
-  S4 &operator=(const S4 &s4);
+  S4 &operator=(const S4 &s4); // expected-note {{implicitly declared private here}}
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
   S5() : a(0) {}
-  S5 &operator=(const S5 &s5) { return *this; }
+  S5 &operator=(const S5 &s5) { return *this; } // expected-note {{implicitly declared private here}}
 
 public:
   S5(int v) : a(v) {}
@@ -46,10 +46,18 @@ public:
 
 S2 k;
 S3 h;
-S4 l(3); // expected-note {{'l' defined here}}
-S5 m(4); // expected-note {{'m' defined here}}
+S4 l(3);
+S5 m(4);
 #pragma omp threadprivate(h, k, l, m)
 
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   int i;
 #pragma omp parallel sections copyin // expected-error {{expected '(' after 'copyin'}}
@@ -76,7 +84,7 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections copyin(l) // expected-error {{copyin variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel sections copyin(l) // expected-error {{'operator=' is a private member of 'S4'}}
   {
     foo();
   }
@@ -92,11 +100,11 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections copyin(m) // expected-error {{copyin variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel sections copyin(m) // expected-error {{'operator=' is a private member of 'S5'}}
   {
     foo();
   }
-#pragma omp parallel sections copyin(ST < int > ::s) // expected-error {{copyin variable must be threadprivate}}
+#pragma omp parallel sections copyin(ST < int > ::s, B::x) // expected-error {{copyin variable must be threadprivate}}
   {
     foo();
   }
diff --git a/test/OpenMP/parallel_sections_default_messages.cpp b/test/OpenMP/parallel_sections_default_messages.cpp
index 55d5d4f2efb6..9b2bdad627aa 100644
--- a/test/OpenMP/parallel_sections_default_messages.cpp
+++ b/test/OpenMP/parallel_sections_default_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_sections_firstprivate_messages.cpp b/test/OpenMP/parallel_sections_firstprivate_messages.cpp
index 2d27b1a600da..84d18596c99c 100644
--- a/test/OpenMP/parallel_sections_firstprivate_messages.cpp
+++ b/test/OpenMP/parallel_sections_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -155,6 +155,14 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -245,7 +253,7 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+#pragma omp parallel sections firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
   {
     foo();
   }
diff --git a/test/OpenMP/parallel_sections_if_messages.cpp b/test/OpenMP/parallel_sections_if_messages.cpp
index 4af0d8561d03..03182b4c8fe1 100644
--- a/test/OpenMP/parallel_sections_if_messages.cpp
+++ b/test/OpenMP/parallel_sections_if_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_sections_lastprivate_messages.cpp b/test/OpenMP/parallel_sections_lastprivate_messages.cpp
index c71c115bd187..10cc36e5ea7f 100644
--- a/test/OpenMP/parallel_sections_lastprivate_messages.cpp
+++ b/test/OpenMP/parallel_sections_lastprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -15,15 +15,16 @@ class S2 {
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
-  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  const S2 &operator=(const S2 &) const;
+  static float S2s;
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
 const S2 b;
 const S2 ba[5];
-class S3 { // expected-note 2 {{'S3' declared here}}
+class S3 {
   int a;
-  S3 &operator=(const S3 &s3);
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
 
 public:
   S3() : a(0) {}
@@ -32,17 +33,17 @@ public:
 const S3 c;         // expected-note {{global variable is predetermined as shared}}
 const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
 extern const int f; // expected-note {{global variable is predetermined as shared}}
-class S4 {          // expected-note 3 {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4();             // expected-note 3 {{implicitly declared private here}}
   S4(const S4 &s4);
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
 
 public:
   S5(const S5 &s5) : a(s5.a) {}
@@ -62,8 +63,8 @@ S3 h;
 
 template <class I, class C>
 int foomain(int argc, char **argv) {
-  I e(4); // expected-note {{'e' defined here}}
-  I g(5); // expected-note {{'g' defined here}}
+  I e(4);
+  I g(5);
   int i;
   int &j = i;                             // expected-note {{'j' defined here}}
 #pragma omp parallel sections lastprivate // expected-error {{expected '(' after 'lastprivate'}}
@@ -106,7 +107,7 @@ int foomain(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp parallel sections lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
   {
     foo();
   }
@@ -141,12 +142,20 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
-  S4 e(4);               // expected-note {{'e' defined here}}
-  S5 g(5);               // expected-note {{'g' defined here}}
-  S3 m;                  // expected-note 2 {{'m' defined here}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
   S6 n(2);
   int i;
   int &j = i;                             // expected-note {{'j' defined here}}
@@ -211,7 +220,7 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+#pragma omp parallel sections lastprivate(S2::S2s)
   {
     foo();
   }
@@ -223,15 +232,15 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp parallel sections lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   {
     foo();
   }
-#pragma omp parallel sections lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel sections lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   {
     foo();
   }
-#pragma omp parallel sections lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+#pragma omp parallel sections lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
   {
     foo();
   }
@@ -257,7 +266,7 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections firstprivate(m) lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp parallel sections firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   {
     foo();
   }
diff --git a/test/OpenMP/parallel_sections_messages.cpp b/test/OpenMP/parallel_sections_messages.cpp
index f5875098e82b..1e71a305069b 100644
--- a/test/OpenMP/parallel_sections_messages.cpp
+++ b/test/OpenMP/parallel_sections_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 -o - %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_sections_misc_messages.c b/test/OpenMP/parallel_sections_misc_messages.c
index 94f1241335ab..203d12c52aab 100644
--- a/test/OpenMP/parallel_sections_misc_messages.c
+++ b/test/OpenMP/parallel_sections_misc_messages.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_sections_num_threads_messages.cpp b/test/OpenMP/parallel_sections_num_threads_messages.cpp
index 927e8d76d54c..a50025660da9 100644
--- a/test/OpenMP/parallel_sections_num_threads_messages.cpp
+++ b/test/OpenMP/parallel_sections_num_threads_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
diff --git a/test/OpenMP/parallel_sections_private_messages.cpp b/test/OpenMP/parallel_sections_private_messages.cpp
index e0b7488e5162..d9c4404bd984 100644
--- a/test/OpenMP/parallel_sections_private_messages.cpp
+++ b/test/OpenMP/parallel_sections_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -123,6 +123,14 @@ int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
@@ -172,7 +180,7 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp parallel sections private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   {
     foo();
   }
diff --git a/test/OpenMP/parallel_sections_proc_bind_messages.cpp b/test/OpenMP/parallel_sections_proc_bind_messages.cpp
index 9e58a734efb0..79796daa8ce6 100644
--- a/test/OpenMP/parallel_sections_proc_bind_messages.cpp
+++ b/test/OpenMP/parallel_sections_proc_bind_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/parallel_sections_reduction_messages.cpp b/test/OpenMP/parallel_sections_reduction_messages.cpp
index 8b02f2315c9d..2ed828dabfa7 100644
--- a/test/OpenMP/parallel_sections_reduction_messages.cpp
+++ b/test/OpenMP/parallel_sections_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo() {
 }
@@ -11,12 +11,12 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
-  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static float S2s;
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;                            // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i];              // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];                    // expected-note 2 {{'q' defined here}}
-  T fl;                                 // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp parallel sections reduction // expected-error {{expected '(' after 'reduction'}}
   {
     foo();
@@ -103,11 +103,11 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel sections reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel sections reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   {
     foo();
   }
@@ -127,7 +127,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp parallel sections reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   {
     foo();
   }
@@ -151,11 +151,11 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel sections reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp parallel sections reduction(&& : S2::S2s)
   {
     foo();
   }
@@ -167,7 +167,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp parallel sections reduction(+ : o) // expected-error {{no viable overloaded '='}}
   {
     foo();
   }
@@ -208,18 +208,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;                           // expected-note 2 {{'j' defined here}}
   S3 &p = k;                            // expected-note 2 {{'p' defined here}}
   const int &r = da[i];                 // expected-note {{'r' defined here}}
   int &q = qa[i];                       // expected-note {{'q' defined here}}
-  float fl;                             // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp parallel sections reduction // expected-error {{expected '(' after 'reduction'}}
   {
     foo();
@@ -272,7 +280,7 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp parallel sections reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   {
     foo();
   }
@@ -296,11 +304,11 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp parallel sections reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp parallel sections reduction(&& : S2::S2s)
   {
     foo();
   }
@@ -308,15 +316,15 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp parallel sections reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+#pragma omp parallel sections reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp parallel sections reduction(+ : o) // expected-error {{no viable overloaded '='}}
   {
     foo();
   }
diff --git a/test/OpenMP/parallel_sections_shared_messages.cpp b/test/OpenMP/parallel_sections_shared_messages.cpp
index d4915c8eaa45..2b8359711119 100644
--- a/test/OpenMP/parallel_sections_shared_messages.cpp
+++ b/test/OpenMP/parallel_sections_shared_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
@@ -48,6 +48,14 @@ public:
 S3 h;
 #pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -83,7 +91,7 @@ int main(int argc, char **argv) {
   { foo(); }
 #pragma omp parallel sections shared(e, g)
   { foo(); }
-#pragma omp parallel sections shared(h) // expected-error {{threadprivate or thread local variable cannot be shared}}
+#pragma omp parallel sections shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
   { foo(); }
 #pragma omp parallel sections private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
   { foo(); }
diff --git a/test/OpenMP/parallel_shared_messages.cpp b/test/OpenMP/parallel_shared_messages.cpp
index 8363989439be..f6f4573a6693 100644
--- a/test/OpenMP/parallel_shared_messages.cpp
+++ b/test/OpenMP/parallel_shared_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
@@ -44,6 +44,14 @@ public:
 S3 h;
 #pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = { 0 };
@@ -65,7 +73,7 @@ int main(int argc, char **argv) {
   #pragma omp parallel shared(ca)
   #pragma omp parallel shared(da)
   #pragma omp parallel shared(e, g)
-  #pragma omp parallel shared(h) // expected-error {{threadprivate or thread local variable cannot be shared}}
+  #pragma omp parallel shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
   #pragma omp parallel private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
   foo();
   #pragma omp parallel firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
diff --git a/test/OpenMP/predefined_macro.c b/test/OpenMP/predefined_macro.c
index c9829ce4572a..9a961bce552f 100644
--- a/test/OpenMP/predefined_macro.c
+++ b/test/OpenMP/predefined_macro.c
@@ -1,33 +1,17 @@
-// RUN: %clang_cc1 -fopenmp=libiomp5 -verify -DFOPENMP -o - %s
+// RUN: %clang_cc1 -fopenmp -verify -DFOPENMP -o - %s
 // RUN: %clang_cc1 -verify -o - %s
 // expected-no-diagnostics
 #ifdef FOPENMP
-// -fopenmp=libiomp5 option is specified
+// -fopenmp option is specified
 #ifndef _OPENMP
 #error "No _OPENMP macro is defined with -fopenmp option"
 #elsif _OPENMP != 201307
 #error "_OPENMP has incorrect value"
 #endif //_OPENMP
 #else
-// No -fopenmp=libiomp5 option is specified
+// No -fopenmp option is specified
 #ifdef _OPENMP
 #error "_OPENMP macro is defined without -fopenmp option"
 #endif // _OPENMP
 #endif // FOPENMP
 
-// RUN: %clang_cc1 -fopenmp=libiomp5 -verify -DFOPENMP -o - %s
-// RUN: %clang_cc1 -verify -o - %s
-// expected-no-diagnostics
-#ifdef FOPENMP
-// -fopenmp=libiomp5 option is specified
-#ifndef _OPENMP
-#error "No _OPENMP macro is defined with -fopenmp option"
-#elsif _OPENMP != 201307
-#error "_OPENMP has incorrect value"
-#endif // _OPENMP
-#else
-// No -fopenmp=libiomp5 option is specified
-#ifdef _OPENMP
-#error "_OPENMP macro is defined without -fopenmp option"
-#endif // _OPENMP
-#endif // FOPENMP
diff --git a/test/OpenMP/sections_ast_print.cpp b/test/OpenMP/sections_ast_print.cpp
index b1a2e0304001..ea96be3ff08a 100644
--- a/test/OpenMP/sections_ast_print.cpp
+++ b/test/OpenMP/sections_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/sections_codegen.cpp b/test/OpenMP/sections_codegen.cpp
new file mode 100644
index 000000000000..11cc900fbecf
--- /dev/null
+++ b/test/OpenMP/sections_codegen.cpp
@@ -0,0 +1,105 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+// CHECK: [[IMPLICIT_BARRIER_SECTIONS_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
+// CHECK: [[IMPLICIT_BARRIER_SINGLE_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
+// CHECK-LABEL: foo
+void foo() {};
+// CHECK-LABEL: bar
+void bar() {};
+
+template <class T>
+T tmain() {
+#pragma omp parallel
+#pragma omp sections
+  {
+    foo();
+  }
+  return T();
+}
+
+// CHECK-LABEL: @main
+int main() {
+  float l = 0.0; // Used as a base point in checks.
+// CHECK: [[GTID:%.+]] = call{{.*}} i32 @__kmpc_global_thread_num({{.*}})
+// CHECK: store float
+#pragma omp sections
+  {
+// CHECK:      store i32 0, i32* [[LB_PTR:%.+]],
+// CHECK:      store i32 1, i32* [[UB_PTR:%.+]],
+// CHECK:      call void @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_PTR:%.+]], i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[STRIDE_PTR:%.+]], i32 1, i32 1)
+// <<UB = min(UB, GlobalUB);>>
+// CHECK:      [[UB:%.+]] = load i32, i32* [[UB_PTR]]
+// CHECK:      [[CMP:%.+]] = icmp slt i32 [[UB]], 1
+// CHECK:      [[MIN_UB_GLOBALUB:%.+]] = select i1 [[CMP]], i32 [[UB]], i32 1
+// CHECK:      store i32 [[MIN_UB_GLOBALUB]], i32* [[UB_PTR]]
+// <<IV = LB;>>
+// CHECK:      [[LB:%.+]] = load i32, i32* [[LB_PTR]]
+// CHECK:      store i32 [[LB]], i32* [[IV_PTR:%.+]]
+// CHECK:      br label %[[INNER_FOR_COND:.+]]
+// CHECK:      [[INNER_FOR_COND]]
+// <<IV <= UB?>>
+// CHECK:      [[IV:%.+]] = load i32, i32* [[IV_PTR]]
+// CHECK:      [[UB:%.+]] = load i32, i32* [[UB_PTR]]
+// CHECK:      [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK:      br i1 [[CMP]], label %[[INNER_LOOP_BODY:.+]], label %[[INNER_LOOP_END:.+]]
+// CHECK:      [[INNER_LOOP_BODY]]
+// <<TRUE>> - > <BODY>
+// CHECK:      [[IV:%.+]] = load i32, i32* [[IV_PTR]]
+// CHECK:      switch i32 [[IV]], label %[[SECTIONS_EXIT:.+]] [
+// CHECK-NEXT: i32 0, label %[[SECTIONS_CASE0:.+]]
+// CHECK-NEXT: i32 1, label %[[SECTIONS_CASE1:.+]]
+#pragma omp section
+// CHECK:      [[SECTIONS_CASE0]]
+// CHECK-NEXT: invoke void @{{.*}}foo{{.*}}()
+// CHECK:      br label %[[SECTIONS_EXIT]]
+    foo();
+#pragma omp section
+// CHECK:      [[SECTIONS_CASE1]]
+// CHECK-NEXT: invoke void @{{.*}}bar{{.*}}()
+// CHECK:      br label %[[SECTIONS_EXIT]]
+    bar();
+// CHECK:      [[SECTIONS_EXIT]]
+// <<++IV;>>
+// CHECK:      [[IV:%.+]] = load i32, i32* [[IV_PTR]]
+// CHECK-NEXT: [[INC:%.+]] = add nsw i32 [[IV]], 1
+// CHECK-NEXT: store i32 [[INC]], i32* [[IV_PTR]]
+// CHECK-NEXT: br label %[[INNER_FOR_COND]]
+// CHECK:      [[INNER_LOOP_END]]
+  }
+// CHECK:      call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+// CHECK:      call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_SECTIONS_LOC]],
+#pragma omp sections nowait
+  {
+    foo();
+#pragma omp section
+    bar();
+  }
+// CHECK-NOT:  __kmpc_cancel_barrier
+  return tmain<int>();
+}
+
+// CHECK-LABEL: tmain
+// CHECK:       call void {{.*}} @__kmpc_fork_call(
+// CHECK-NOT:   __kmpc_global_thread_num
+// CHECK:       [[RES:%.+]] = call i32 @__kmpc_single(
+// CHECK-NEXT:  [[BOOLRES:%.+]] = icmp ne i32 [[RES]], 0
+// CHECK-NEXT:  br i1 [[BOOLRES]], label %[[THEN:.+]], label %[[END:.+]]
+// CHECK:       [[THEN]]
+// CHECK-NEXT:  invoke void @{{.*}}foo{{.*}}()
+// CHECK-NEXT:  unwind label %[[TERM_LPAD:.+]]
+// CHECK:       call void @__kmpc_end_single(
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK-NEXT:  call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_SINGLE_LOC]],
+// CHECK-NEXT:  call i32 @__kmpc_cancel_barrier(
+// CHECK-NEXT:  ret
+// CHECK:       [[TERM_LPAD]]
+// CHECK:       call void @__clang_call_terminate(i8*
+// CHECK-NEXT:  unreachable
+
+#endif
diff --git a/test/OpenMP/sections_firstprivate_codegen.cpp b/test/OpenMP/sections_firstprivate_codegen.cpp
new file mode 100644
index 000000000000..ba49864dda82
--- /dev/null
+++ b/test/OpenMP/sections_firstprivate_codegen.cpp
@@ -0,0 +1,277 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {}
+  ~St() {}
+};
+
+volatile int g = 1212;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp parallel
+#pragma omp sections firstprivate(t_var, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+#pragma omp section
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+// CHECK: [[TEST:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> test;
+// CHECK-DAG: [[T_VAR:@.+]] = global i{{[0-9]+}} 333,
+int t_var = 333;
+// CHECK-DAG: [[VEC:@.+]] = global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+int vec[] = {1, 2};
+// CHECK-DAG: [[S_ARR:@.+]] = global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+S<float> s_arr[] = {1, 2};
+// CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> var(3);
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
+
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: ([[S_FLOAT_TY]]*)* [[S_FLOAT_TY_DESTR:@[^ ]+]] {{[^,]+}}, {{.+}}([[S_FLOAT_TY]]* [[TEST]]
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+// LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+// LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp sections firstprivate(g)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // Skip temp vars for loop
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G]]
+    // LAMBDA: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // LAMBDA: call i32 @__kmpc_cancel_barrier(
+    g = 1;
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: call i32 @__kmpc_cancel_barrier(
+#pragma omp section
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+// BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+// BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp sections firstprivate(g)
+   {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // Skip temp vars for loop
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // BLOCKS: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G]]
+    // BLOCKS: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS: call i32 @__kmpc_cancel_barrier(
+    g = 1;
+    // BLOCKS: call void @__kmpc_for_static_init_4(
+    // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_for_static_fini(
+    // BLOCKS: call i32 @__kmpc_cancel_barrier(
+#pragma omp section
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp sections firstprivate(t_var, vec, s_arr, var) nowait
+  {
+    {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    }
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+
+// CHECK: call i32 @__kmpc_single(
+// firstprivate t_var(t_var)
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// firstprivate vec(vec)
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+
+// firstprivate s_arr(s_arr)
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: getelementptr inbounds ([2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+
+// firstprivate var(var)
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+
+// ~(firstprivate var), ~(firstprivate s_arr)
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_end_single(
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// firstprivate t_var(t_var)
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// firstprivate vec(vec)
+// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+
+// firstprivate s_arr(s_arr)
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+
+// firstprivate var(var)
+// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_REF_PTR]],
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+
+// Synchronization for initialization.
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// ~(firstprivate var), ~(firstprivate s_arr)
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[SECTIONS_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+#endif
+
diff --git a/test/OpenMP/sections_firstprivate_messages.cpp b/test/OpenMP/sections_firstprivate_messages.cpp
index ecee45900fea..ff7614967ee8 100644
--- a/test/OpenMP/sections_firstprivate_messages.cpp
+++ b/test/OpenMP/sections_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -170,6 +170,14 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -281,7 +289,7 @@ int main(int argc, char **argv) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+#pragma omp sections firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
   {
     foo();
   }
diff --git a/test/OpenMP/sections_lastprivate_codegen.cpp b/test/OpenMP/sections_lastprivate_codegen.cpp
new file mode 100644
index 000000000000..b57f0b05c0c2
--- /dev/null
+++ b/test/OpenMP/sections_lastprivate_codegen.cpp
@@ -0,0 +1,332 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S<T> &operator=(const S<T> &);
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile int g = 1212;
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[CAP_MAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]* }
+// CHECK: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
+// CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
+// CHECK-DAG: [[X:@.+]] = global double 0.0
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp parallel
+#pragma omp sections lastprivate(t_var, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+#pragma omp section
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+namespace A {
+double x;
+}
+namespace B {
+using A::x;
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp sections lastprivate(g)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    // LAMBDA: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %{{.+}}
+    // LAMBDA: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+    // LAMBDA: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+    // LAMBDA: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+    g = 1;
+    // Check for final copying of private values back to original vars.
+    // LAMBDA: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+    // LAMBDA: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+    // LAMBDA: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+    // LAMBDA: [[LAST_THEN]]
+    // Actual copying.
+
+    // original g=private_g;
+    // LAMBDA: [[G_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // LAMBDA: store volatile i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G]],
+    // LAMBDA: br label %[[LAST_DONE]]
+    // LAMBDA: [[LAST_DONE]]
+    // LAMBDA: call i32 @__kmpc_cancel_barrier(%{{.+}}* @{{.+}}, i{{[0-9]+}} [[GTID]])
+#pragma omp section
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp sections lastprivate(g)
+  {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: alloca i{{[0-9]+}},
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // BLOCKS: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    // BLOCKS: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %{{.+}}
+    // BLOCKS: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+    // BLOCKS: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+    // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+    g = 1;
+    // Check for final copying of private values back to original vars.
+    // BLOCKS: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+    // BLOCKS: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+    // BLOCKS: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+    // BLOCKS: [[LAST_THEN]]
+    // Actual copying.
+
+    // original g=private_g;
+    // BLOCKS: [[G_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // BLOCKS: store volatile i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G]],
+    // BLOCKS: br label %[[LAST_DONE]]
+    // BLOCKS: [[LAST_DONE]]
+    // BLOCKS: call i32 @__kmpc_cancel_barrier(%{{.+}}* @{{.+}}, i{{[0-9]+}} [[GTID]])
+#pragma omp section
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+#pragma omp parallel
+#pragma omp sections lastprivate(t_var, vec, s_arr, var)
+  {
+    {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    }
+  }
+#pragma omp parallel
+#pragma omp sections lastprivate(A::x, B::x)
+  {
+    A::x++;
+#pragma omp section
+    ;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, %{{.+}}*)* [[MAIN_MICROTASK1:@.+]] to void
+// CHECK: = call {{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_MAIN_TY]]* %{{.+}})
+// CHECK-NOT: alloca i{{[0-9]+}},
+// CHECK-NOT: alloca [2 x i{{[0-9]+}}],
+// CHECK-NOT: alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_single(
+
+// CHECK-DAG: getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK-DAG: getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK-DAG: getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK-DAG: getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+
+// <Skip loop body>
+
+// CHECK-NOT: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-NOT: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+
+// CHECK: call void @__kmpc_end_single(
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[SINGLE_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+
+//
+// CHECK: define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: [[X_PRIV:%.+]] = alloca double,
+// CHECK-NOT: alloca double
+
+// Check for default initialization.
+// CHECK-NOT: [[X_PRIV]]
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 [[GTID]], i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// <Skip loop body>
+// CHECK: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 [[GTID]])
+
+// Check for final copying of private values back to original vars.
+// CHECK: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+// CHECK: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+// CHECK: [[LAST_THEN]]
+// Actual copying.
+
+// original x=private_x;
+// CHECK: [[X_VAL:%.+]] = load double, double* [[X_PRIV]],
+// CHECK: store double [[X_VAL]], double* [[X]],
+// CHECK-NEXT: br label %[[LAST_DONE]]
+// CHECK: [[LAST_DONE]]
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[SECTIONS_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+
+// Check for default initialization.
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: [[S_ARR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_REF_PTR]],
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_REF_PTR]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK: call {{.+}} @__kmpc_for_static_init_4(%{{.+}}* @{{.+}}, i32 %{{.+}}, i32 34, i32* [[IS_LAST_ADDR:%.+]], i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// <Skip loop body>
+// CHECK: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 %{{.+}})
+
+// Check for final copying of private values back to original vars.
+// CHECK: [[IS_LAST_VAL:%.+]] = load i32, i32* [[IS_LAST_ADDR]],
+// CHECK: [[IS_LAST_ITER:%.+]] = icmp ne i32 [[IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
+// CHECK: [[LAST_THEN]]
+// Actual copying.
+
+// original t_var=private_t_var;
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_REF]],
+
+// original vec[]=private_vec[];
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+
+// original s_arr[]=private_s_arr[];
+// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
+// CHECK: [[S_ARR_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_BEGIN]], [[S_ARR_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}})
+// CHECK: br i1 {{.+}}, label %[[S_ARR_BODY_DONE]], label %[[S_ARR_BODY]]
+// CHECK: [[S_ARR_BODY_DONE]]
+
+// original var=private_var;
+// CHECK: call {{.*}} [[S_INT_TY_COPY_ASSIGN:@.+]]([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* {{.*}} [[VAR_PRIV]])
+// CHECK: br label %[[LAST_DONE]]
+// CHECK: [[LAST_DONE]]
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[SECTIONS_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+#endif
+
diff --git a/test/OpenMP/sections_lastprivate_messages.cpp b/test/OpenMP/sections_lastprivate_messages.cpp
index 54c6005dcef3..870dd158aeeb 100644
--- a/test/OpenMP/sections_lastprivate_messages.cpp
+++ b/test/OpenMP/sections_lastprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -15,15 +15,16 @@ class S2 {
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
+  const S2 &operator=(const S2 &) const;
   static float S2s; // expected-note {{static data member is predetermined as shared}}
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
 const S2 b;
 const S2 ba[5];
-class S3 { // expected-note 2 {{'S3' declared here}}
+class S3 {
   int a;
-  S3 &operator=(const S3 &s3);
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
 
 public:
   S3() : a(0) {}
@@ -32,17 +33,17 @@ public:
 const S3 c;         // expected-note {{global variable is predetermined as shared}}
 const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
 extern const int f; // expected-note {{global variable is predetermined as shared}}
-class S4 {          // expected-note 3 {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4();          // expected-note 3 {{implicitly declared private here}}
   S4(const S4 &s4);
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
 
 public:
   S5(const S5 &s5) : a(s5.a) {}
@@ -62,8 +63,8 @@ S3 h;
 
 template <class I, class C>
 int foomain(int argc, char **argv) {
-  I e(4); // expected-note {{'e' defined here}}
-  I g(5); // expected-note {{'g' defined here}}
+  I e(4);
+  I g(5);
   int i;
   int &j = i; // expected-note {{'j' defined here}}
 #pragma omp parallel
@@ -117,7 +118,7 @@ int foomain(int argc, char **argv) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp sections lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
   {
     foo();
   }
@@ -155,12 +156,20 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
-  S4 e(4);               // expected-note {{'e' defined here}}
-  S5 g(5);               // expected-note {{'g' defined here}}
-  S3 m;                  // expected-note 2 {{'m' defined here}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
   S6 n(2);
   int i;
   int &j = i; // expected-note {{'j' defined here}}
@@ -256,17 +265,17 @@ int main(int argc, char **argv) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp sections lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp sections lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+#pragma omp sections lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
   {
     foo();
   }
@@ -296,7 +305,7 @@ int main(int argc, char **argv) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections firstprivate(m) lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp sections firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   {
     foo();
   }
diff --git a/test/OpenMP/sections_misc_messages.c b/test/OpenMP/sections_misc_messages.c
index 0297513543a9..da20aa18aa12 100644
--- a/test/OpenMP/sections_misc_messages.c
+++ b/test/OpenMP/sections_misc_messages.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
 
 void foo();
 
diff --git a/test/OpenMP/sections_private_codegen.cpp b/test/OpenMP/sections_private_codegen.cpp
new file mode 100644
index 000000000000..4bad714908d2
--- /dev/null
+++ b/test/OpenMP/sections_private_codegen.cpp
@@ -0,0 +1,192 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[CAP_MAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]* }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp parallel
+#pragma omp sections private(t_var, vec, s_arr, s_arr, var, var)
+  {
+    vec[0] = t_var;
+#pragma omp section
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp sections private(g)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+    // LAMBDA: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    g = 1;
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store double* [[G_PRIVATE_ADDR]], double** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call{{( x86_thiscallcc)?}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_for_static_fini(
+#pragma omp section
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+      // LAMBDA: store volatile double 2.0{{.+}}, double* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* {{.+}})
+#pragma omp parallel
+#pragma omp sections private(g)
+    {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+    // BLOCKS: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    g = 1;
+    // BLOCKS: call void @__kmpc_for_static_init_4(
+    // BLOCKS: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: double* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_for_static_fini(
+#pragma omp section
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile double 2.0{{.+}}, double*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+#pragma omp parallel
+#pragma omp sections private(t_var, vec, s_arr, s_arr, var, var)
+  {
+    {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    }
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_MAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK: call i32 @__kmpc_single(
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_end_single(
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK-NOT: alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret void
+#endif
+
diff --git a/test/OpenMP/sections_private_messages.cpp b/test/OpenMP/sections_private_messages.cpp
index 8b330bf71052..7854b3d8bdd3 100644
--- a/test/OpenMP/sections_private_messages.cpp
+++ b/test/OpenMP/sections_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -123,6 +123,14 @@ int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
@@ -172,7 +180,7 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp sections private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp sections private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   {
     foo();
   }
diff --git a/test/OpenMP/sections_reduction_codegen.cpp b/test/OpenMP/sections_reduction_codegen.cpp
new file mode 100644
index 000000000000..a50f04983d4a
--- /dev/null
+++ b/test/OpenMP/sections_reduction_codegen.cpp
@@ -0,0 +1,470 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+volatile double g;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  operator T() { return T(); }
+  S &operator&(const S &) { return *this; }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { float*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float*, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]* }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [[S_INT_TY]]*, [[S_INT_TY]]*, i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]* }
+// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
+
+template <typename T>
+T tmain() {
+  T t;
+  S<T> test;
+  T t_var = T(), t_var1;
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3), var1;
+#pragma omp parallel
+#pragma omp sections reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1) nowait
+  {
+    vec[0] = t_var;
+#pragma omp section
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp sections reduction(+:g)
+    {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* %{{.+}})
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+
+    // Reduction list for runtime.
+    // LAMBDA: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+    // LAMBDA: store double 0.0{{.+}}, double* [[G_PRIVATE_ADDR]]
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    g = 1;
+    // LAMBDA: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store double* [[G_PRIVATE_ADDR]], double** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_for_static_fini(
+
+    // LAMBDA: [[G_PRIV_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i32 0, i32 0
+    // LAMBDA: [[BITCAST:%.+]] = bitcast double* [[G_PRIVATE_ADDR]] to i8*
+    // LAMBDA: store i8* [[BITCAST]], i8** [[G_PRIV_REF]],
+    // LAMBDA: call i32 @__kmpc_reduce(
+    // LAMBDA: switch i32 %{{.+}}, label %[[REDUCTION_DONE:.+]] [
+    // LAMBDA: i32 1, label %[[CASE1:.+]]
+    // LAMBDA: i32 2, label %[[CASE2:.+]]
+    // LAMBDA: [[CASE1]]
+    // LAMBDA: [[G_VAL:%.+]] = load double, double* [[G]]
+    // LAMBDA: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE_ADDR]]
+    // LAMBDA: [[ADD:%.+]] = fadd double [[G_VAL]], [[G_PRIV_VAL]]
+    // LAMBDA: store double [[ADD]], double* [[G]]
+    // LAMBDA: call void @__kmpc_end_reduce(
+    // LAMBDA: br label %[[REDUCTION_DONE]]
+    // LAMBDA: [[CASE2]]
+    // LAMBDA: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE_ADDR]]
+    // LAMBDA: fadd double
+    // LAMBDA: cmpxchg i64*
+    // LAMBDA: call void @__kmpc_end_reduce(
+    // LAMBDA: br label %[[REDUCTION_DONE]]
+    // LAMBDA: [[REDUCTION_DONE]]
+    // LAMBDA: ret void
+#pragma omp section
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+      // LAMBDA: store volatile double 2.0{{.+}}, double* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp sections reduction(-:g)
+    {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* %{{.+}})
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+
+    // Reduction list for runtime.
+    // BLOCKS: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+    // BLOCKS: store double 0.0{{.+}}, double* [[G_PRIVATE_ADDR]]
+    g = 1;
+    // BLOCKS: call void @__kmpc_for_static_init_4(
+    // BLOCKS: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: double* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_for_static_fini(
+
+    // BLOCKS: [[G_PRIV_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i32 0, i32 0
+    // BLOCKS: [[BITCAST:%.+]] = bitcast double* [[G_PRIVATE_ADDR]] to i8*
+    // BLOCKS: store i8* [[BITCAST]], i8** [[G_PRIV_REF]],
+    // BLOCKS: call i32 @__kmpc_reduce(
+    // BLOCKS: switch i32 %{{.+}}, label %[[REDUCTION_DONE:.+]] [
+    // BLOCKS: i32 1, label %[[CASE1:.+]]
+    // BLOCKS: i32 2, label %[[CASE2:.+]]
+    // BLOCKS: [[CASE1]]
+    // BLOCKS: [[G_VAL:%.+]] = load double, double* [[G]]
+    // BLOCKS: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE_ADDR]]
+    // BLOCKS: [[ADD:%.+]] = fadd double [[G_VAL]], [[G_PRIV_VAL]]
+    // BLOCKS: store double [[ADD]], double* [[G]]
+    // BLOCKS: call void @__kmpc_end_reduce(
+    // BLOCKS: br label %[[REDUCTION_DONE]]
+    // BLOCKS: [[CASE2]]
+    // BLOCKS: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE_ADDR]]
+    // BLOCKS: fadd double
+    // BLOCKS: cmpxchg i64*
+    // BLOCKS: call void @__kmpc_end_reduce(
+    // BLOCKS: br label %[[REDUCTION_DONE]]
+    // BLOCKS: [[REDUCTION_DONE]]
+    // BLOCKS: ret void
+#pragma omp section
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile double 2.0{{.+}}, double*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  float t_var = 0, t_var1;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3), var1;
+#pragma omp parallel
+#pragma omp sections reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1)
+  {
+    {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    vec[1] = t_var1;
+    s_arr[1] = var1;
+    }
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_MAIN_TY]]* %{{.+}})
+// CHECK-NOT: alloca float,
+// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK-NOT: alloca float,
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_single(
+
+// CHECK-DAG: getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK-DAG: getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK-DAG: getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK-DAG: getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+
+// CHECK-NOT: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-NOT: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+
+// CHECK: call void @__kmpc_end_single(
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[SINGLE_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_TMAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca i{{[0-9]+}},
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// For + reduction operation initial value of private variable is 0.
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// CHECK: [[VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_PTR_REF:%.+]],
+// For & reduction operation initial value of private variable is ones in all bits.
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[VAR_PRIV]])
+
+// CHECK: [[VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[VAR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_PTR_REF:%.+]],
+// For && reduction operation initial value of private variable is 1.0.
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[VAR1_PRIV]])
+
+// CHECK: [[T_VAR1_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} {{[0-9]+}}
+// CHECK: [[T_VAR1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR1_PTR_REF]],
+// For min reduction operation initial value of private variable is largest repesentable value.
+// CHECK: store i{{[0-9]+}} 2147483647, i{{[0-9]+}}* [[T_VAR1_PRIV]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 0
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i32 0, i32 3
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce_nowait(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: [[UP:%.+]] = add nsw i{{[0-9]+}} [[T_VAR_VAL]], [[T_VAR_PRIV_VAL]]
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR_REF]],
+
+// var = var.operator &(var_reduction);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
+// CHECK: [[T_VAR1_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_PRIV]],
+// CHECK: [[CMP:%.+]] = icmp slt i{{[0-9]+}} [[T_VAR1_VAL]], [[T_VAR1_PRIV_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi i32
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR1_REF]],
+
+// __kmpc_end_reduce_nowait(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]]
+// CHECK: atomicrmw add i32* [[T_VAR_REF]], i32 [[T_VAR_PRIV_VAL]] monotonic
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_PRIV]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: [[T_VAR1_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_PRIV]]
+// CHECK: atomicrmw min i32* [[T_VAR1_REF]], i32 [[T_VAR1_PRIV_VAL]] monotonic
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_lhs = (i{{[0-9]+}}*)lhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to i{{[0-9]+}}*
+// t_var_rhs = (i{{[0-9]+}}*)rhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i32 0, i32 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to i{{[0-9]+}}*
+
+// var_lhs = (S<i{{[0-9]+}}>*)lhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_INT_TY]]*
+// var_rhs = (S<i{{[0-9]+}}>*)rhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_INT_TY]]*
+
+// var1_lhs = (S<i{{[0-9]+}}>*)lhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_INT_TY]]*
+// var1_rhs = (S<i{{[0-9]+}}>*)rhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_INT_TY]]*
+
+// t_var1_lhs = (i{{[0-9]+}}*)lhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to i{{[0-9]+}}*
+// t_var1_rhs = (i{{[0-9]+}}*)rhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i32 0, i32 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to i{{[0-9]+}}*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: [[T_VAR_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_LHS]],
+// CHECK: [[T_VAR_RHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_RHS]],
+// CHECK: [[UP:%.+]] = add nsw i{{[0-9]+}} [[T_VAR_LHS_VAL]], [[T_VAR_RHS_VAL]]
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR_LHS]],
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
+// CHECK: [[VAR1_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br i1 [[VAR1_BOOL]], label %[[TRUE:.+]], label %[[END2:.+]]
+// CHECK: [[TRUE]]
+// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_RHS]])
+// CHECK: [[VAR1_REDUCTION_BOOL:%.+]] = icmp ne i{{[0-9]+}} [[TO_INT]], 0
+// CHECK: br label %[[END2]]
+// CHECK: [[END2]]
+// CHECK: [[COND_LVALUE:%.+]] = phi i1 [ false, %{{.+}} ], [ [[VAR1_REDUCTION_BOOL]], %[[TRUE]] ]
+// CHECK: [[CONV:%.+]] = zext i1 [[COND_LVALUE]] to i32
+// CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
+// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
+// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
+// CHECK: [[T_VAR1_RHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_RHS]],
+// CHECK: [[CMP:%.+]] = icmp slt i{{[0-9]+}} [[T_VAR1_LHS_VAL]], [[T_VAR1_RHS_VAL]]
+// CHECK: br i1 [[CMP]]
+// CHECK: [[UP:%.+]] = phi i32
+// CHECK: store i{{[0-9]+}} [[UP]], i{{[0-9]+}}* [[T_VAR1_LHS]],
+// CHECK: ret void
+
+#endif
+
diff --git a/test/OpenMP/sections_reduction_messages.cpp b/test/OpenMP/sections_reduction_messages.cpp
index 8c4bdcc2e8fc..a0f56d5a1e2a 100644
--- a/test/OpenMP/sections_reduction_messages.cpp
+++ b/test/OpenMP/sections_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
 
 void foo() {
 }
@@ -11,7 +11,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;               // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i]; // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];       // expected-note 2 {{'q' defined here}}
-  T fl;                    // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp parallel
 #pragma omp sections reduction // expected-error {{expected '(' after 'reduction'}}
   {
@@ -111,12 +111,12 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp sections reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp sections reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   {
     foo();
   }
@@ -141,7 +141,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp sections reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   {
     foo();
   }
@@ -171,7 +171,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp sections reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   {
     foo();
   }
@@ -191,7 +191,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp sections reduction(+ : o) // expected-error {{no viable overloaded '='}}
   {
     foo();
   }
@@ -235,18 +235,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;           // expected-note 2 {{'j' defined here}}
   S3 &p = k;            // expected-note 2 {{'p' defined here}}
   const int &r = da[i]; // expected-note {{'r' defined here}}
   int &q = qa[i];       // expected-note {{'q' defined here}}
-  float fl;             // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp parallel
 #pragma omp sections reduction // expected-error {{expected '(' after 'reduction'}}
   {
@@ -313,7 +321,7 @@ int main(int argc, char **argv) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp sections reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   {
     foo();
   }
@@ -343,7 +351,7 @@ int main(int argc, char **argv) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp sections reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   {
     foo();
   }
@@ -358,17 +366,17 @@ int main(int argc, char **argv) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp sections reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+#pragma omp sections reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp sections reduction(+ : o) // expected-error {{no viable overloaded '='}}
   {
     foo();
   }
diff --git a/test/OpenMP/simd_aligned_messages.cpp b/test/OpenMP/simd_aligned_messages.cpp
index dfa6b541c909..408cc2e360b5 100644
--- a/test/OpenMP/simd_aligned_messages.cpp
+++ b/test/OpenMP/simd_aligned_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
 
 struct B {
   static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
diff --git a/test/OpenMP/simd_ast_print.cpp b/test/OpenMP/simd_ast_print.cpp
index 46286123750d..069862b6dfbc 100644
--- a/test/OpenMP/simd_ast_print.cpp
+++ b/test/OpenMP/simd_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/simd_codegen.cpp b/test/OpenMP/simd_codegen.cpp
index b8073c2948e0..586aaa5732f6 100644
--- a/test/OpenMP/simd_codegen.cpp
+++ b/test/OpenMP/simd_codegen.cpp
@@ -1,23 +1,27 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
 //
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
 
+long long get_val() { return 0; }
+double *g_ptr;
+
 // CHECK-LABEL: define {{.*void}} @{{.*}}simple{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void simple(float *a, float *b, float *c, float *d) {
   #pragma omp simd
 // CHECK: store i32 0, i32* [[OMP_IV:%[^,]+]]
 
-// CHECK: [[IV:%.+]] = load i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID:[0-9]+]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID:[0-9]+]]
 // CHECK-NEXT: [[CMP:%.+]] = icmp slt i32 [[IV]], 6
 // CHECK-NEXT: br i1 [[CMP]], label %[[SIMPLE_LOOP1_BODY:.+]], label %[[SIMPLE_LOOP1_END:[^,]+]]
   for (int i = 3; i < 32; i += 5) {
 // CHECK: [[SIMPLE_LOOP1_BODY]]
 // Start of body: calculate i from IV:
-// CHECK: [[IV1_1:%.+]] = load i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID]]
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID]]
 // CHECK: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 5
 // CHECK-NEXT: [[CALC_I_2:%.+]] = add nsw i32 3, [[CALC_I_1]]
 // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID]]
@@ -25,71 +29,127 @@ void simple(float *a, float *b, float *c, float *d) {
 // End of body: store into a[i]:
 // CHECK: store float [[RESULT:%.+]], float* {{%.+}}{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID]]
     a[i] = b[i] * c[i] * d[i];
-// CHECK: [[IV1_2:%.+]] = load i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID]]
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID]]
 // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
 // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP1_ID]]
 // br label %{{.+}}, !llvm.loop !{{.+}}
   }
 // CHECK: [[SIMPLE_LOOP1_END]]
 
-  #pragma omp simd
+  long long k = get_val();
+
+  #pragma omp simd linear(k : 3)
+// CHECK: [[K0:%.+]] = call {{.*}}i64 @{{.*}}get_val
+// CHECK-NEXT: store i64 [[K0]], i64* [[K_VAR:%[^,]+]]
+// CHECK: [[K0LOAD:%.+]] = load i64, i64* [[K_VAR]]
+// CHECK-NEXT: store i64 [[K0LOAD]], i64* [[LIN0:%[^,]+]]
 // CHECK: store i32 0, i32* [[OMP_IV2:%[^,]+]]
 
-// CHECK: [[IV2:%.+]] = load i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID:[0-9]+]]
+// CHECK: [[IV2:%.+]] = load i32, i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID:[0-9]+]]
 // CHECK-NEXT: [[CMP2:%.+]] = icmp slt i32 [[IV2]], 9
 // CHECK-NEXT: br i1 [[CMP2]], label %[[SIMPLE_LOOP2_BODY:.+]], label %[[SIMPLE_LOOP2_END:[^,]+]]
   for (int i = 10; i > 1; i--) {
 // CHECK: [[SIMPLE_LOOP2_BODY]]
 // Start of body: calculate i from IV:
-// CHECK: [[IV2_0:%.+]] = load i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
+// CHECK: [[IV2_0:%.+]] = load i32, i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
 // FIXME: It is interesting, why the following "mul 1" was not constant folded?
 // CHECK-NEXT: [[IV2_1:%.+]] = mul nsw i32 [[IV2_0]], 1
 // CHECK-NEXT: [[LC_I_1:%.+]] = sub nsw i32 10, [[IV2_1]]
 // CHECK-NEXT: store i32 [[LC_I_1]], i32* {{.+}}, !llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
-    a[i]++;
-// CHECK: [[IV2_2:%.+]] = load i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
+//
+// CHECK-NEXT: [[LIN0_1:%.+]] = load i64, i64* [[LIN0]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
+// CHECK-NEXT: [[IV2_2:%.+]] = load i32, i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
+// CHECK-NEXT: [[LIN_MUL1:%.+]] = mul nsw i32 [[IV2_2]], 3
+// CHECK-NEXT: [[LIN_EXT1:%.+]] = sext i32 [[LIN_MUL1]] to i64
+// CHECK-NEXT: [[LIN_ADD1:%.+]] = add nsw i64 [[LIN0_1]], [[LIN_EXT1]]
+// Update of the privatized version of linear variable!
+// CHECK-NEXT: store i64 [[LIN_ADD1]], i64* [[K_PRIVATIZED:%[^,]+]]
+    a[k]++;
+    k = k + 3;
+// CHECK: [[IV2_2:%.+]] = load i32, i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
 // CHECK-NEXT: [[ADD2_2:%.+]] = add nsw i32 [[IV2_2]], 1
 // CHECK-NEXT: store i32 [[ADD2_2]], i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
 // br label {{.+}}, !llvm.loop ![[SIMPLE_LOOP2_ID]]
   }
 // CHECK: [[SIMPLE_LOOP2_END]]
+//
+// Update linear vars after loop, as the loop was operating on a private version.
+// CHECK: [[LIN0_2:%.+]] = load i64, i64* [[LIN0]]
+// CHECK-NEXT: [[LIN_ADD2:%.+]] = add nsw i64 [[LIN0_2]], 27
+// CHECK-NEXT: store i64 [[LIN_ADD2]], i64* [[K_VAR]]
+//
+
+  int lin = 12;
+  #pragma omp simd linear(lin : get_val()), linear(g_ptr)
+
+// Init linear private var.
+// CHECK: store i32 12, i32* [[LIN_VAR:%[^,]+]]
+// CHECK: [[LIN_LOAD:%.+]] = load i32, i32* [[LIN_VAR]]
+// CHECK-NEXT: store i32 [[LIN_LOAD]], i32* [[LIN_START:%[^,]+]]
+// CHECK: [[GLIN_LOAD:%.+]] = load double*, double** [[GLIN_VAR:@[^,]+]]
+// CHECK-NEXT: store double* [[GLIN_LOAD]], double** [[GLIN_START:%[^,]+]]
 
-  #pragma omp simd
 // CHECK: store i64 0, i64* [[OMP_IV3:%[^,]+]]
 
-// CHECK: [[IV3:%.+]] = load i64* [[OMP_IV3]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID:[0-9]+]]
+// Remember linear step.
+// CHECK: [[CALL_VAL:%.+]] = invoke
+// CHECK: store i64 [[CALL_VAL]], i64* [[LIN_STEP:%[^,]+]]
+
+// CHECK: [[IV3:%.+]] = load i64, i64* [[OMP_IV3]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID:[0-9]+]]
 // CHECK-NEXT: [[CMP3:%.+]] = icmp ult i64 [[IV3]], 4
 // CHECK-NEXT: br i1 [[CMP3]], label %[[SIMPLE_LOOP3_BODY:.+]], label %[[SIMPLE_LOOP3_END:[^,]+]]
   for (unsigned long long it = 2000; it >= 600; it-=400) {
 // CHECK: [[SIMPLE_LOOP3_BODY]]
 // Start of body: calculate it from IV:
-// CHECK: [[IV3_0:%.+]] = load i64* [[OMP_IV3]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+// CHECK: [[IV3_0:%.+]] = load i64, i64* [[OMP_IV3]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
 // CHECK-NEXT: [[LC_IT_1:%.+]] = mul i64 [[IV3_0]], 400
 // CHECK-NEXT: [[LC_IT_2:%.+]] = sub i64 2000, [[LC_IT_1]]
 // CHECK-NEXT: store i64 [[LC_IT_2]], i64* {{.+}}, !llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
-    a[it]++;
-// CHECK: [[IV3_2:%.+]] = load i64* [[OMP_IV3]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+//
+// Linear start and step are used to calculate current value of the linear variable.
+// CHECK: [[LINSTART:.+]] = load i32, i32* [[LIN_START]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+// CHECK: [[LINSTEP:.+]] = load i64, i64* [[LIN_STEP]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+// CHECK-NOT: store i32 {{.+}}, i32* [[LIN_VAR]],{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+// CHECK: [[GLINSTART:.+]] = load double*, double** [[GLIN_START]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+// CHECK-NEXT: [[IV3_1:%.+]] = load i64, i64* [[OMP_IV3]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+// CHECK-NEXT: [[MUL:%.+]] = mul i64 [[IV3_1]], 1
+// CHECK: [[GEP:%.+]] = getelementptr{{.*}}[[GLINSTART]]
+// CHECK-NEXT: store double* [[GEP]], double** [[G_PTR_CUR:%[^,]+]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+    *g_ptr++ = 0.0;
+// CHECK: [[GEP_VAL:%.+]] = load double{{.*}}[[G_PTR_CUR]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+// CHECK: store double{{.*}}[[GEP_VAL]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+    a[it + lin]++;
+// CHECK: [[FLT_INC:%.+]] = fadd float
+// CHECK-NEXT: store float [[FLT_INC]],{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
+// CHECK: [[IV3_2:%.+]] = load i64, i64* [[OMP_IV3]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
 // CHECK-NEXT: [[ADD3_2:%.+]] = add i64 [[IV3_2]], 1
 // CHECK-NEXT: store i64 [[ADD3_2]], i64* [[OMP_IV3]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP3_ID]]
   }
 // CHECK: [[SIMPLE_LOOP3_END]]
+//
+// Linear start and step are used to calculate final value of the linear variables.
+// CHECK: [[LINSTART:.+]] = load i32, i32* [[LIN_START]]
+// CHECK: [[LINSTEP:.+]] = load i64, i64* [[LIN_STEP]]
+// CHECK: store i32 {{.+}}, i32* [[LIN_VAR]],
+// CHECK: [[GLINSTART:.+]] = load double*, double** [[GLIN_START]]
+// CHECK: store double* {{.*}}[[GLIN_VAR]]
 
   #pragma omp simd
 // CHECK: store i32 0, i32* [[OMP_IV4:%[^,]+]]
 
-// CHECK: [[IV4:%.+]] = load i32* [[OMP_IV4]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP4_ID:[0-9]+]]
+// CHECK: [[IV4:%.+]] = load i32, i32* [[OMP_IV4]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP4_ID:[0-9]+]]
 // CHECK-NEXT: [[CMP4:%.+]] = icmp slt i32 [[IV4]], 4
 // CHECK-NEXT: br i1 [[CMP4]], label %[[SIMPLE_LOOP4_BODY:.+]], label %[[SIMPLE_LOOP4_END:[^,]+]]
   for (short it = 6; it <= 20; it-=-4) {
 // CHECK: [[SIMPLE_LOOP4_BODY]]
 // Start of body: calculate it from IV:
-// CHECK: [[IV4_0:%.+]] = load i32* [[OMP_IV4]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP4_ID]]
+// CHECK: [[IV4_0:%.+]] = load i32, i32* [[OMP_IV4]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP4_ID]]
 // CHECK-NEXT: [[LC_IT_1:%.+]] = mul nsw i32 [[IV4_0]], 4
 // CHECK-NEXT: [[LC_IT_2:%.+]] = add nsw i32 6, [[LC_IT_1]]
 // CHECK-NEXT: [[LC_IT_3:%.+]] = trunc i32 [[LC_IT_2]] to i16
 // CHECK-NEXT: store i16 [[LC_IT_3]], i16* {{.+}}, !llvm.mem.parallel_loop_access ![[SIMPLE_LOOP4_ID]]
 
-// CHECK: [[IV4_2:%.+]] = load i32* [[OMP_IV4]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP4_ID]]
+// CHECK: [[IV4_2:%.+]] = load i32, i32* [[OMP_IV4]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP4_ID]]
 // CHECK-NEXT: [[ADD4_2:%.+]] = add nsw i32 [[IV4_2]], 1
 // CHECK-NEXT: store i32 [[ADD4_2]], i32* [[OMP_IV4]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP4_ID]]
   }
@@ -98,46 +158,28 @@ void simple(float *a, float *b, float *c, float *d) {
   #pragma omp simd
 // CHECK: store i32 0, i32* [[OMP_IV5:%[^,]+]]
 
-// CHECK: [[IV5:%.+]] = load i32* [[OMP_IV5]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP5_ID:[0-9]+]]
+// CHECK: [[IV5:%.+]] = load i32, i32* [[OMP_IV5]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP5_ID:[0-9]+]]
 // CHECK-NEXT: [[CMP5:%.+]] = icmp slt i32 [[IV5]], 26
 // CHECK-NEXT: br i1 [[CMP5]], label %[[SIMPLE_LOOP5_BODY:.+]], label %[[SIMPLE_LOOP5_END:[^,]+]]
   for (unsigned char it = 'z'; it >= 'a'; it+=-1) {
 // CHECK: [[SIMPLE_LOOP5_BODY]]
 // Start of body: calculate it from IV:
-// CHECK: [[IV5_0:%.+]] = load i32* [[OMP_IV5]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP5_ID]]
+// CHECK: [[IV5_0:%.+]] = load i32, i32* [[OMP_IV5]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP5_ID]]
 // CHECK-NEXT: [[IV5_1:%.+]] = mul nsw i32 [[IV5_0]], 1
 // CHECK-NEXT: [[LC_IT_1:%.+]] = sub nsw i32 122, [[IV5_1]]
 // CHECK-NEXT: [[LC_IT_2:%.+]] = trunc i32 [[LC_IT_1]] to i8
 // CHECK-NEXT: store i8 [[LC_IT_2]], i8* {{.+}}, !llvm.mem.parallel_loop_access ![[SIMPLE_LOOP5_ID]]
 
-// CHECK: [[IV5_2:%.+]] = load i32* [[OMP_IV5]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP5_ID]]
+// CHECK: [[IV5_2:%.+]] = load i32, i32* [[OMP_IV5]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP5_ID]]
 // CHECK-NEXT: [[ADD5_2:%.+]] = add nsw i32 [[IV5_2]], 1
 // CHECK-NEXT: store i32 [[ADD5_2]], i32* [[OMP_IV5]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP5_ID]]
   }
 // CHECK: [[SIMPLE_LOOP5_END]]
 
+// CHECK-NOT: mul i32 %{{.+}}, 10
   #pragma omp simd
-// FIXME: I think we would get wrong result using 'unsigned' in the loop below.
-// So we'll need to add zero trip test for 'unsigned' counters.
-//
-// CHECK: store i32 0, i32* [[OMP_IV6:%[^,]+]]
-
-// CHECK: [[IV6:%.+]] = load i32* [[OMP_IV6]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP6_ID:[0-9]+]]
-// CHECK-NEXT: [[CMP6:%.+]] = icmp slt i32 [[IV6]], -8
-// CHECK-NEXT: br i1 [[CMP6]], label %[[SIMPLE_LOOP6_BODY:.+]], label %[[SIMPLE_LOOP6_END:[^,]+]]
-  for (int i=100; i<10; i+=10) {
-// CHECK: [[SIMPLE_LOOP6_BODY]]
-// Start of body: calculate i from IV:
-// CHECK: [[IV6_0:%.+]] = load i32* [[OMP_IV6]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP6_ID]]
-// CHECK-NEXT: [[LC_IT_1:%.+]] = mul nsw i32 [[IV6_0]], 10
-// CHECK-NEXT: [[LC_IT_2:%.+]] = add nsw i32 100, [[LC_IT_1]]
-// CHECK-NEXT: store i32 [[LC_IT_2]], i32* {{.+}}, !llvm.mem.parallel_loop_access ![[SIMPLE_LOOP6_ID]]
-
-// CHECK: [[IV6_2:%.+]] = load i32* [[OMP_IV6]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP6_ID]]
-// CHECK-NEXT: [[ADD6_2:%.+]] = add nsw i32 [[IV6_2]], 1
-// CHECK-NEXT: store i32 [[ADD6_2]], i32* [[OMP_IV6]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP6_ID]]
+  for (unsigned i=100; i<10; i+=10) {
   }
-// CHECK: [[SIMPLE_LOOP6_END]]
 
   int A;
   #pragma omp simd lastprivate(A)
@@ -145,37 +187,32 @@ void simple(float *a, float *b, float *c, float *d) {
 // Test checks that one iteration is separated in presence of lastprivate.
 //
 // CHECK: store i64 0, i64* [[OMP_IV7:%[^,]+]]
-// CHECK: br i1 true, label %[[SIMPLE_IF7_THEN:.+]], label %[[SIMPLE_IF7_END:[^,]+]]
-// CHECK: [[SIMPLE_IF7_THEN]]
 // CHECK: br label %[[SIMD_LOOP7_COND:[^,]+]]
 // CHECK: [[SIMD_LOOP7_COND]]
-// CHECK-NEXT: [[IV7:%.+]] = load i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID:[0-9]+]]
+// CHECK-NEXT: [[IV7:%.+]] = load i64, i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID:[0-9]+]]
 // CHECK-NEXT: [[CMP7:%.+]] = icmp slt i64 [[IV7]], 6
 // CHECK-NEXT: br i1 [[CMP7]], label %[[SIMPLE_LOOP7_BODY:.+]], label %[[SIMPLE_LOOP7_END:[^,]+]]
   for (long long i = -10; i < 10; i += 3) {
 // CHECK: [[SIMPLE_LOOP7_BODY]]
 // Start of body: calculate i from IV:
-// CHECK: [[IV7_0:%.+]] = load i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID]]
+// CHECK: [[IV7_0:%.+]] = load i64, i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID]]
 // CHECK-NEXT: [[LC_IT_1:%.+]] = mul nsw i64 [[IV7_0]], 3
 // CHECK-NEXT: [[LC_IT_2:%.+]] = add nsw i64 -10, [[LC_IT_1]]
 // CHECK-NEXT: store i64 [[LC_IT_2]], i64* {{.+}}, !llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID]]
     A = i;
-// CHECK: [[IV7_2:%.+]] = load i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID]]
+// CHECK: [[IV7_2:%.+]] = load i64, i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID]]
 // CHECK-NEXT: [[ADD7_2:%.+]] = add nsw i64 [[IV7_2]], 1
 // CHECK-NEXT: store i64 [[ADD7_2]], i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID]]
   }
 // CHECK: [[SIMPLE_LOOP7_END]]
 // Separated last iteration.
-// CHECK: [[IV7_4:%.+]] = load i64* [[OMP_IV7]]
+// CHECK: [[IV7_4:%.+]] = load i64, i64* [[OMP_IV7]]
 // CHECK-NEXT: [[LC_FIN_1:%.+]] = mul nsw i64 [[IV7_4]], 3
 // CHECK-NEXT: [[LC_FIN_2:%.+]] = add nsw i64 -10, [[LC_FIN_1]]
 // CHECK-NEXT: store i64 [[LC_FIN_2]], i64* [[ADDR_I:%[^,]+]]
-// CHECK: [[LOAD_I:%.+]] = load i64* [[ADDR_I]]
+// CHECK: [[LOAD_I:%.+]] = load i64, i64* [[ADDR_I]]
 // CHECK-NEXT: [[CONV_I:%.+]] = trunc i64 [[LOAD_I]] to i32
 //
-// CHECK: br label %[[SIMPLE_IF7_END]]
-// CHECK: [[SIMPLE_IF7_END]]
-//
 
 // CHECK: ret void
 }
@@ -197,24 +234,24 @@ int templ1(T a, T *z) {
 // CHECK-LABEL: define {{.*i32}} @{{.*}}templ1{{.*}}(float {{.+}}, float* {{.+}})
 // CHECK: store i64 0, i64* [[T1_OMP_IV:[^,]+]]
 // ...
-// CHECK: [[IV:%.+]] = load i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID:[0-9]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID:[0-9]+]]
 // CHECK-NEXT: [[CMP1:%.+]] = icmp slt i64 [[IV]], 16
 // CHECK-NEXT: br i1 [[CMP1]], label %[[T1_BODY:.+]], label %[[T1_END:[^,]+]]
 // CHECK: [[T1_BODY]]
 // Loop counters i and j updates:
-// CHECK: [[IV1:%.+]] = load i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
+// CHECK: [[IV1:%.+]] = load i64, i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
 // CHECK-NEXT: [[I_1:%.+]] = sdiv i64 [[IV1]], 4
 // CHECK-NEXT: [[I_1_MUL1:%.+]] = mul nsw i64 [[I_1]], 1
 // CHECK-NEXT: [[I_1_ADD0:%.+]] = add nsw i64 0, [[I_1_MUL1]]
 // CHECK-NEXT: [[I_2:%.+]] = trunc i64 [[I_1_ADD0]] to i32
 // CHECK-NEXT: store i32 [[I_2]], i32* {{%.+}}{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
-// CHECK: [[IV2:%.+]] = load i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
+// CHECK: [[IV2:%.+]] = load i64, i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
 // CHECK-NEXT: [[J_1:%.+]] = srem i64 [[IV2]], 4
 // CHECK-NEXT: [[J_2:%.+]] = mul nsw i64 [[J_1]], 2
 // CHECK-NEXT: [[J_2_ADD0:%.+]] = add nsw i64 0, [[J_2]]
 // CHECK-NEXT: store i64 [[J_2_ADD0]], i64* {{%.+}}{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
 // simd.for.inc:
-// CHECK: [[IV3:%.+]] = load i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
+// CHECK: [[IV3:%.+]] = load i64, i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
 // CHECK-NEXT: [[INC:%.+]] = add nsw i64 [[IV3]], 1
 // CHECK-NEXT: store i64 [[INC]], i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
 // CHECK-NEXT: br label {{%.+}}
@@ -261,34 +298,34 @@ void iter_simple(IterDouble ia, IterDouble ib, IterDouble ic) {
 //
 // CHECK: store i32 0, i32* [[IT_OMP_IV:%[^,]+]]
 // Calculate number of iterations before the loop body.
-// CHECK: [[DIFF1:%.+]] = call {{.*}}i32 @{{.*}}IterDouble{{.*}}
-// CHECK-NEXT: [[DIFF2:%.+]] = sub nsw i32 [[DIFF1]], 1
+// CHECK: [[DIFF1:%.+]] = invoke {{.*}}i32 @{{.*}}IterDouble{{.*}}
+// CHECK: [[DIFF2:%.+]] = sub nsw i32 [[DIFF1]], 1
 // CHECK-NEXT: [[DIFF3:%.+]] = add nsw i32 [[DIFF2]], 1
 // CHECK-NEXT: [[DIFF4:%.+]] = sdiv i32 [[DIFF3]], 1
 // CHECK-NEXT: [[DIFF5:%.+]] = sub nsw i32 [[DIFF4]], 1
 // CHECK-NEXT: store i32 [[DIFF5]], i32* [[OMP_LAST_IT:%[^,]+]]{{.+}}
   #pragma omp simd
 
-// CHECK: [[IV:%.+]] = load i32* [[IT_OMP_IV]]{{.+}} !llvm.mem.parallel_loop_access ![[ITER_LOOP_ID:[0-9]+]]
-// CHECK-NEXT: [[LAST_IT:%.+]] = load i32* [[OMP_LAST_IT]]{{.+}}!llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[IT_OMP_IV]]{{.+}} !llvm.mem.parallel_loop_access ![[ITER_LOOP_ID:[0-9]+]]
+// CHECK-NEXT: [[LAST_IT:%.+]] = load i32, i32* [[OMP_LAST_IT]]{{.+}}!llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
 // CHECK-NEXT: [[NUM_IT:%.+]] = add nsw i32 [[LAST_IT]], 1
 // CHECK-NEXT: [[CMP:%.+]] = icmp slt i32 [[IV]], [[NUM_IT]]
 // CHECK-NEXT: br i1 [[CMP]], label %[[IT_BODY:[^,]+]], label %[[IT_END:[^,]+]]
   for (IterDouble i = ia; i < ib; ++i) {
 // CHECK: [[IT_BODY]]
 // Start of body: calculate i from index:
-// CHECK: [[IV1:%.+]] = load i32* [[IT_OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
+// CHECK: [[IV1:%.+]] = load i32, i32* [[IT_OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
 // Call of operator+ (i, IV).
-// CHECK: {{%.+}} = call {{.+}} @{{.*}}IterDouble{{.*}}!llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
+// CHECK: {{%.+}} = invoke {{.+}} @{{.*}}IterDouble{{.*}}
 // ... loop body ...
    *i = *ic * 0.5;
 // Float multiply and save result.
 // CHECK: [[MULR:%.+]] = fmul double {{%.+}}, 5.000000e-01
-// CHECK-NEXT: call {{.+}} @{{.*}}IterDouble{{.*}}
+// CHECK-NEXT: invoke {{.+}} @{{.*}}IterDouble{{.*}}
 // CHECK: store double [[MULR:%.+]], double* [[RESULT_ADDR:%.+]], !llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
    ++ic;
 //
-// CHECK: [[IV2:%.+]] = load i32* [[IT_OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
+// CHECK: [[IV2:%.+]] = load i32, i32* [[IT_OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
 // CHECK-NEXT: [[ADD2:%.+]] = add nsw i32 [[IV2]], 1
 // CHECK-NEXT: store i32 [[ADD2]], i32* [[IT_OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[ITER_LOOP_ID]]
 // br label %{{.*}}, !llvm.loop ![[ITER_LOOP_ID]]
@@ -308,7 +345,7 @@ void collapsed(float *a, float *b, float *c, float *d) {
 //
   #pragma omp simd collapse(4)
 
-// CHECK: [[IV:%.+]] = load i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID:[0-9]+]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID:[0-9]+]]
 // CHECK-NEXT: [[CMP:%.+]] = icmp ult i32 [[IV]], 120
 // CHECK-NEXT: br i1 [[CMP]], label %[[COLL1_BODY:[^,]+]], label %[[COLL1_END:[^,]+]]
   for (i = 1; i < 3; i++) // 2 iterations
@@ -318,25 +355,25 @@ void collapsed(float *a, float *b, float *c, float *d) {
         {
 // CHECK: [[COLL1_BODY]]
 // Start of body: calculate i from index:
-// CHECK: [[IV1:%.+]] = load i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK: [[IV1:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
 // Calculation of the loop counters values.
 // CHECK: [[CALC_I_1:%.+]] = udiv i32 [[IV1]], 60
 // CHECK-NEXT: [[CALC_I_1_MUL1:%.+]] = mul i32 [[CALC_I_1]], 1
 // CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 1, [[CALC_I_1_MUL1]]
 // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
-// CHECK: [[IV1_2:%.+]] = load i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
 // CHECK-NEXT: [[CALC_J_1:%.+]] = udiv i32 [[IV1_2]], 20
 // CHECK-NEXT: [[CALC_J_2:%.+]] = urem i32 [[CALC_J_1]], 3
 // CHECK-NEXT: [[CALC_J_2_MUL1:%.+]] = mul i32 [[CALC_J_2]], 1
 // CHECK-NEXT: [[CALC_J_3:%.+]] = add i32 2, [[CALC_J_2_MUL1]]
 // CHECK-NEXT: store i32 [[CALC_J_3]], i32* [[LC_J:.+]]
-// CHECK: [[IV1_3:%.+]] = load i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK: [[IV1_3:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
 // CHECK-NEXT: [[CALC_K_1:%.+]] = udiv i32 [[IV1_3]], 5
 // CHECK-NEXT: [[CALC_K_2:%.+]] = urem i32 [[CALC_K_1]], 4
 // CHECK-NEXT: [[CALC_K_2_MUL1:%.+]] = mul i32 [[CALC_K_2]], 1
 // CHECK-NEXT: [[CALC_K_3:%.+]] = add i32 3, [[CALC_K_2_MUL1]]
 // CHECK-NEXT: store i32 [[CALC_K_3]], i32* [[LC_K:.+]]
-// CHECK: [[IV1_4:%.+]] = load i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK: [[IV1_4:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
 // CHECK-NEXT: [[CALC_L_1:%.+]] = urem i32 [[IV1_4]], 5
 // CHECK-NEXT: [[CALC_L_1_MUL1:%.+]] = mul i32 [[CALC_L_1]], 1
 // CHECK-NEXT: [[CALC_L_2:%.+]] = add i32 4, [[CALC_L_1_MUL1]]
@@ -347,7 +384,7 @@ void collapsed(float *a, float *b, float *c, float *d) {
 // CHECK: store float [[RESULT:%.+]], float* [[RESULT_ADDR:%.+]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
     float res = b[j] * c[k];
     a[i] = res * d[l];
-// CHECK: [[IV2:%.+]] = load i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK: [[IV2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
 // CHECK-NEXT: [[ADD2:%.+]] = add i32 [[IV2]], 1
 // CHECK-NEXT: store i32 [[ADD2]], i32* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
 // br label %{{[^,]+}}, !llvm.loop ![[COLL1_LOOP_ID]]
@@ -361,18 +398,22 @@ void collapsed(float *a, float *b, float *c, float *d) {
 }
 
 extern char foo();
+extern double globalfloat;
 
 // CHECK-LABEL: define {{.*void}} @{{.*}}widened{{.*}}
 void widened(float *a, float *b, float *c, float *d) {
   int i; // outer loop counter
   short j; // inner loop counter
+  globalfloat = 1.0;
+  int localint = 1;
+// CHECK: store double {{.+}}, double* [[GLOBALFLOAT:@.+]]
 // Counter is widened to 64 bits.
 // CHECK: store i64 0, i64* [[OMP_IV:[^,]+]]
 //
-  #pragma omp simd collapse(2)
+  #pragma omp simd collapse(2) private(globalfloat, localint)
 
-// CHECK: [[IV:%.+]] = load i64* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID:[0-9]+]]
-// CHECK-NEXT: [[LI:%.+]] = load i64* [[OMP_LI:%[^,]+]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID:[0-9]+]]
+// CHECK-NEXT: [[LI:%.+]] = load i64, i64* [[OMP_LI:%[^,]+]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
 // CHECK-NEXT: [[NUMIT:%.+]] = add nsw i64 [[LI]], 1
 // CHECK-NEXT: [[CMP:%.+]] = icmp slt i64 [[IV]], [[NUMIT]]
 // CHECK-NEXT: br i1 [[CMP]], label %[[WIDE1_BODY:[^,]+]], label %[[WIDE1_END:[^,]+]]
@@ -381,27 +422,57 @@ void widened(float *a, float *b, float *c, float *d) {
   {
 // CHECK: [[WIDE1_BODY]]
 // Start of body: calculate i from index:
-// CHECK: [[IV1:%.+]] = load i64* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
+// CHECK: [[IV1:%.+]] = load i64, i64* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
 // Calculation of the loop counters values...
 // CHECK: store i32 {{[^,]+}}, i32* [[LC_I:.+]]
-// CHECK: [[IV1_2:%.+]] = load i64* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
 // CHECK: store i16 {{[^,]+}}, i16* [[LC_J:.+]]
 // ... loop body ...
-// End of body: store into a[i]:
-// CHECK: store float [[RESULT:%.+]], float* [[RESULT_ADDR:%.+]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
+//
+// Here we expect store into private double var, not global
+// CHECK-NOT: store double {{.+}}, double* [[GLOBALFLOAT]]
+    globalfloat = (float)j/i;
     float res = b[j] * c[j];
+// Store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* [[RESULT_ADDR:%.+]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
     a[i] = res * d[i];
-// CHECK: [[IV2:%.+]] = load i64* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
+// Then there's a store into private var localint:
+// CHECK: store i32 {{.+}}, i32* [[LOCALINT:%[^,]+]]{{.+}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
+    localint = (int)j;
+// CHECK: [[IV2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
 // CHECK-NEXT: [[ADD2:%.+]] = add nsw i64 [[IV2]], 1
 // CHECK-NEXT: store i64 [[ADD2]], i64* [[OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[WIDE1_LOOP_ID]]
+//
 // br label %{{[^,]+}}, !llvm.loop ![[WIDE1_LOOP_ID]]
 // CHECK: [[WIDE1_END]]
   }
 // i,j are updated.
 // CHECK: store i32 3, i32* [[I:%[^,]+]]
 // CHECK: store i16
+//
+// Here we expect store into original localint, not its privatized version.
+// CHECK-NOT: store i32 {{.+}}, i32* [[LOCALINT]]
+  localint = (int)j;
 // CHECK: ret void
 }
 
+// TERM_DEBUG-LABEL: bar
+int bar() {return 0;};
+
+// TERM_DEBUG-LABEL: parallel_simd
+void parallel_simd(float *a) {
+#pragma omp parallel
+#pragma omp simd
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     invoke i32 {{.*}}bar{{.*}}()
+  // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     [[TERM_LPAD]]
+  // TERM_DEBUG:     call void @__clang_call_terminate
+  // TERM_DEBUG:     unreachable
+  for (unsigned i = 131071; i <= 2147483647; i += 127)
+    a[i] += bar();
+}
+// TERM_DEBUG: !{{[0-9]+}} = !DILocation(line: [[@LINE-11]],
 #endif // HEADER
 
diff --git a/test/OpenMP/simd_collapse_messages.cpp b/test/OpenMP/simd_collapse_messages.cpp
index 56523b33726c..56ff201d112d 100644
--- a/test/OpenMP/simd_collapse_messages.cpp
+++ b/test/OpenMP/simd_collapse_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/simd_lastprivate_messages.cpp b/test/OpenMP/simd_lastprivate_messages.cpp
index 55f60583506d..ca26db0d1447 100644
--- a/test/OpenMP/simd_lastprivate_messages.cpp
+++ b/test/OpenMP/simd_lastprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -15,15 +15,16 @@ class S2 {
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
+  const S2 &operator=(const S2 &) const;
   static float S2s; // expected-note {{static data member is predetermined as shared}}
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
 const S2 b;
 const S2 ba[5];
-class S3 { // expected-note {{'S3' declared here}}
+class S3 {
   int a;
-  S3 &operator=(const S3 &s3);
+  S3 &operator=(const S3 &s3); // expected-note {{implicitly declared private here}}
 
 public:
   S3() : a(0) {}
@@ -32,17 +33,17 @@ public:
 const S3 c;         // expected-note {{global variable is predetermined as shared}}
 const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
 extern const int f; // expected-note {{global variable is predetermined as shared}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4();          // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note {{'S5' declared here}}
+class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
 
 public:
   S5(const S5 &s5) : a(s5.a) {}
@@ -52,6 +53,14 @@ public:
 S3 h;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 template <class I, class C>
 int foomain(I argc, C **argv) {
   I e(4);
@@ -91,7 +100,7 @@ int foomain(I argc, C **argv) {
 #pragma omp simd lastprivate(e, g)
   for (int k = 0; k < argc; ++k)
     ++k;
-#pragma omp simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+#pragma omp simd lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
   for (int k = 0; k < argc; ++k)
     ++k;
 #pragma omp simd firstprivate(i) // expected-error {{unexpected OpenMP clause 'firstprivate' in directive '#pragma omp simd'}}
@@ -121,9 +130,9 @@ int foomain(I argc, C **argv) {
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
-  S4 e(4);               // expected-note {{'e' defined here}}
-  S5 g(5);               // expected-note {{'g' defined here}}
-  S3 m;                  // expected-note {{'m' defined here}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
   int i;
   int &j = i;                // expected-note {{'j' defined here}}
 #pragma omp simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
@@ -181,10 +190,10 @@ int main(int argc, char **argv) {
 #pragma omp simd firstprivate(g) // expected-error {{unexpected OpenMP clause 'firstprivate' in directive '#pragma omp simd'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp simd lastprivate(e, g) // expected-error 2 {{lastprivate variable must have an accessible, unambiguous default constructor}}
+#pragma omp simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp simd lastprivate(m) // expected-error {{lastprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
diff --git a/test/OpenMP/simd_linear_messages.cpp b/test/OpenMP/simd_linear_messages.cpp
index b8b783107993..78f905fd9460 100644
--- a/test/OpenMP/simd_linear_messages.cpp
+++ b/test/OpenMP/simd_linear_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 namespace X {
   int x;
@@ -148,6 +148,14 @@ template<class I, class C> int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   double darr[100];
   // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
@@ -177,7 +185,7 @@ int main(int argc, char **argv) {
   for (int k = 0; k < argc; ++k) ++k;
   // expected-error@+2 {{linear variable with incomplete type 'S1'}}
   // expected-error@+1 {{const-qualified variable cannot be linear}}
-  #pragma omp simd linear (a, b) 
+  #pragma omp simd linear(a, b)
   for (int k = 0; k < argc; ++k) ++k;
   #pragma omp simd linear (argv[1]) // expected-error {{expected variable name}}
   for (int k = 0; k < argc; ++k) ++k;
@@ -185,7 +193,7 @@ int main(int argc, char **argv) {
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
   #pragma omp simd linear(e, g)
   for (int k = 0; k < argc; ++k) ++k;
-  #pragma omp simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  #pragma omp simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
   for (int k = 0; k < argc; ++k) ++k;
   #pragma omp parallel
   {
diff --git a/test/OpenMP/simd_loop_messages.cpp b/test/OpenMP/simd_loop_messages.cpp
index ce64842d68a9..fe8e70a9b8a0 100644
--- a/test/OpenMP/simd_loop_messages.cpp
+++ b/test/OpenMP/simd_loop_messages.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
 
 static int sii;
-#pragma omp threadprivate(sii) // expected-note {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
 static int globalii;
 
 int test_iteration_spaces() {
@@ -252,7 +252,6 @@ int test_iteration_spaces() {
 
   #pragma omp parallel
   {
-    // expected-error@+2 {{loop iteration variable in the associated loop of 'omp simd' directive may not be threadprivate or thread local, predetermined as linear}}
     #pragma omp simd
     for (sii = 0; sii < 10; sii+=1)
       c[sii] = a[sii];
@@ -260,7 +259,6 @@ int test_iteration_spaces() {
 
   #pragma omp parallel
   {
-    // expected-error@+2 {{loop iteration variable in the associated loop of 'omp simd' directive may not be a variable with global storage without being explicitly marked as linear}}
     #pragma omp simd
     for (globalii = 0; globalii < 10; globalii+=1)
       c[globalii] = a[globalii];
@@ -268,7 +266,6 @@ int test_iteration_spaces() {
 
   #pragma omp parallel
   {
-// expected-error@+3 {{loop iteration variable in the associated loop of 'omp simd' directive may not be a variable with global storage without being explicitly marked as lastprivate}}
 #pragma omp simd collapse(2)
     for (ii = 0; ii < 10; ii += 1)
     for (globalii = 0; globalii < 10; globalii += 1)
diff --git a/test/OpenMP/simd_metadata.c b/test/OpenMP/simd_metadata.c
index 4552669c80b7..2a95fef160f4 100644
--- a/test/OpenMP/simd_metadata.c
+++ b/test/OpenMP/simd_metadata.c
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 -fopenmp=libiomp5 -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=X86
+// RUN: %clang_cc1 -fopenmp -triple x86_64-unknown-unknown -target-feature +avx -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=X86-AVX
+// RUN: %clang_cc1 -fopenmp -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=PPC
+// RUN: %clang_cc1 -fopenmp -triple powerpc64-unknown-unknown -target-abi elfv1-qpx -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=PPC-QPX
 
 void h1(float *c, float *a, double b[], int size)
 {
@@ -11,11 +13,21 @@ void h1(float *c, float *a, double b[], int size)
 // CHECK-NEXT:    [[C_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[C_MASKEDPTR]], 0
 // CHECK-NEXT:    call void @llvm.assume(i1 [[C_MASKCOND]])
 // CHECK:         [[A_PTRINT:%.+]] = ptrtoint
-// CHECK-NEXT:    [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
+
+// X86-NEXT:     [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
+// X86-AVX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 31
+// PPC-NEXT:     [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
+// PPC-QPX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
+
 // CHECK-NEXT:    [[A_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[A_MASKEDPTR]], 0
 // CHECK-NEXT:    call void @llvm.assume(i1 [[A_MASKCOND]])
 // CHECK:         [[B_PTRINT:%.+]] = ptrtoint
-// CHECK-NEXT:    [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
+
+// X86-NEXT:      [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
+// X86-AVX-NEXT:  [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31
+// PPC-NEXT:      [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
+// PPC-QPX-NEXT:  [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31
+
 // CHECK-NEXT:    [[B_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[B_MASKEDPTR]], 0
 // CHECK-NEXT:    call void @llvm.assume(i1 [[B_MASKCOND]])
   for (int i = 0; i < size; ++i) {
diff --git a/test/OpenMP/simd_misc_messages.c b/test/OpenMP/simd_misc_messages.c
index f94bb3870ca2..0b4dd7f3b533 100644
--- a/test/OpenMP/simd_misc_messages.c
+++ b/test/OpenMP/simd_misc_messages.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
 
 // expected-error@+1 {{unexpected OpenMP directive '#pragma omp simd'}}
 #pragma omp simd
diff --git a/test/OpenMP/simd_private_messages.cpp b/test/OpenMP/simd_private_messages.cpp
index 56922e888b99..adef373b6a4e 100644
--- a/test/OpenMP/simd_private_messages.cpp
+++ b/test/OpenMP/simd_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -85,6 +85,14 @@ template<class I, class C> int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
@@ -112,7 +120,7 @@ int main(int argc, char **argv) {
   for (int k = 0; k < argc; ++k) ++k;
   #pragma omp simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   for (int k = 0; k < argc; ++k) ++k;
-  #pragma omp simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  #pragma omp simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   for (int k = 0; k < argc; ++k) ++k;
   #pragma omp simd shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp simd'}}
   for (int k = 0; k < argc; ++k) ++k;
diff --git a/test/OpenMP/simd_reduction_messages.cpp b/test/OpenMP/simd_reduction_messages.cpp
index 347a5c1a4105..aeb2b23bb0ca 100644
--- a/test/OpenMP/simd_reduction_messages.cpp
+++ b/test/OpenMP/simd_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -11,7 +11,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;                       // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
-  T fl;                            // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp simd reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
     foo();
@@ -96,10 +96,10 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name}}
@@ -114,7 +114,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified variable cannot be reduction}}
@@ -132,7 +132,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp simd reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}} expected-error {{a reduction variable with array type 'const float [5]'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
@@ -144,7 +144,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -180,18 +180,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;                      // expected-note 2 {{'j' defined here}}
   S3 &p = k;                       // expected-note 2 {{'p' defined here}}
   const int &r = da[i];            // expected-note {{'r' defined here}}
   int &q = qa[i];                  // expected-note {{'q' defined here}}
-  float fl;                        // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp simd reduction // expected-error {{expected '(' after 'reduction'}}
   for (int i = 0; i < 10; ++i)
     foo();
@@ -231,7 +239,7 @@ int main(int argc, char **argv) {
 #pragma omp simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp simd reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}}
@@ -249,7 +257,7 @@ int main(int argc, char **argv) {
 #pragma omp simd reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
@@ -258,13 +266,13 @@ int main(int argc, char **argv) {
 #pragma omp simd reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+#pragma omp simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
diff --git a/test/OpenMP/simd_safelen_messages.cpp b/test/OpenMP/simd_safelen_messages.cpp
index 0e7e80dd976c..b7300c337532 100644
--- a/test/OpenMP/simd_safelen_messages.cpp
+++ b/test/OpenMP/simd_safelen_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/single_ast_print.cpp b/test/OpenMP/single_ast_print.cpp
index 65a007e271c4..8eb35174a77f 100644
--- a/test/OpenMP/single_ast_print.cpp
+++ b/test/OpenMP/single_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -14,10 +14,16 @@ T tmain(T argc) {
   static T a;
 // CHECK: static T a;
 #pragma omp parallel private(g)
-#pragma omp single private(argc, b), firstprivate(c, d), nowait copyprivate(g)
+#pragma omp single private(argc, b), firstprivate(c, d), nowait
   foo();
   // CHECK-NEXT: #pragma omp parallel private(g)
-  // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(c,d) nowait copyprivate(g)
+  // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(c,d) nowait
+  // CHECK-NEXT: foo();
+#pragma omp parallel private(g)
+#pragma omp single private(argc, b), firstprivate(c, d), copyprivate(g)
+  foo();
+  // CHECK-NEXT: #pragma omp parallel private(g)
+  // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(c,d) copyprivate(g)
   // CHECK-NEXT: foo();
   return T();
 }
@@ -27,10 +33,16 @@ int main(int argc, char **argv) {
   static int a;
 // CHECK: static int a;
 #pragma omp parallel private(g)
-#pragma omp single private(argc, b), firstprivate(argv, c), nowait copyprivate(g)
+#pragma omp single private(argc, b), firstprivate(argv, c), nowait
+  foo();
+  // CHECK-NEXT: #pragma omp parallel private(g)
+  // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(argv,c) nowait
+  // CHECK-NEXT: foo();
+#pragma omp parallel private(g)
+#pragma omp single private(argc, b), firstprivate(c, d), copyprivate(g)
   foo();
   // CHECK-NEXT: #pragma omp parallel private(g)
-  // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(argv,c) nowait copyprivate(g)
+  // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(c,d) copyprivate(g)
   // CHECK-NEXT: foo();
   return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
 }
diff --git a/test/OpenMP/single_codegen.cpp b/test/OpenMP/single_codegen.cpp
new file mode 100644
index 000000000000..0593b2aa7513
--- /dev/null
+++ b/test/OpenMP/single_codegen.cpp
@@ -0,0 +1,189 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+class TestClass {
+public:
+  int a;
+  TestClass() : a(0) {}
+  TestClass(const TestClass &C) : a(C.a) {}
+  TestClass &operator=(const TestClass &) { return *this;}
+  ~TestClass(){};
+};
+
+// CHECK-DAG:   [[TEST_CLASS_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK:       [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK:       [[IMPLICIT_BARRIER_SINGLE_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
+
+// CHECK:       define void [[FOO:@.+]]()
+
+TestClass tc;
+TestClass tc2[2];
+#pragma omp threadprivate(tc, tc2)
+
+void foo() {}
+
+// CHECK-LABEL: @main
+// TERM_DEBUG-LABEL: @main
+int main() {
+  // CHECK-DAG: [[A_ADDR:%.+]] = alloca i8
+  // CHECK-DAG: [[A2_ADDR:%.+]] = alloca [2 x i8]
+  // CHECK-DAG: [[C_ADDR:%.+]] = alloca [[TEST_CLASS_TY]]
+  char a;
+  char a2[2];
+  TestClass c;
+
+// CHECK:       [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:@.+]])
+// CHECK-DAG:   [[DID_IT:%.+]] = alloca i32,
+// CHECK-DAG:   [[COPY_LIST:%.+]] = alloca [5 x i8*],
+
+// CHECK:       [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT:  [[IS_SINGLE:%.+]] = icmp ne i32 [[RES]], 0
+// CHECK-NEXT:  br i1 [[IS_SINGLE]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]]
+// CHECK:       [[THEN]]
+// CHECK-NEXT:  store i8 2, i8* [[A_ADDR]]
+// CHECK-NEXT:  call void @__kmpc_end_single([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT:  br label {{%?}}[[EXIT]]
+// CHECK:       [[EXIT]]
+// CHECK-NOT:   call {{.+}} @__kmpc_cancel_barrier
+#pragma omp single nowait
+  a = 2;
+// CHECK:       [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT:  [[IS_SINGLE:%.+]] = icmp ne i32 [[RES]], 0
+// CHECK-NEXT:  br i1 [[IS_SINGLE]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]]
+// CHECK:       [[THEN]]
+// CHECK-NEXT:  store i8 2, i8* [[A_ADDR]]
+// CHECK-NEXT:  call void @__kmpc_end_single([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT:  br label {{%?}}[[EXIT]]
+// CHECK:       [[EXIT]]
+// CHECK:       call{{.*}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[IMPLICIT_BARRIER_SINGLE_LOC]], i32 [[GTID]])
+#pragma omp single
+  a = 2;
+// CHECK:       store i32 0, i32* [[DID_IT]]
+// CHECK:       [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT:  [[IS_SINGLE:%.+]] = icmp ne i32 [[RES]], 0
+// CHECK-NEXT:  br i1 [[IS_SINGLE]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]]
+// CHECK:       [[THEN]]
+// CHECK-NEXT:  invoke void [[FOO]]()
+// CHECK:       to label {{%?}}[[CONT:.+]] unwind
+// CHECK:       [[CONT]]
+// CHECK:       store i32 1, i32* [[DID_IT]]
+// CHECK:       call void @__kmpc_end_single([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK-NEXT:  br label {{%?}}[[EXIT]]
+// CHECK:       [[EXIT]]
+// CHECK:       [[A_PTR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[COPY_LIST]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK:       store i8* [[A_ADDR]], i8** [[A_PTR_REF]],
+// CHECK:       [[C_PTR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[COPY_LIST]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK:       [[C_PTR_REF_VOID_PTR:%.+]] = bitcast [[TEST_CLASS_TY]]* [[C_ADDR]] to i8*
+// CHECK:       store i8* [[C_PTR_REF_VOID_PTR]], i8** [[C_PTR_REF]],
+// CHECK:       [[TC_PTR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[COPY_LIST]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK:       [[TC_THREADPRIVATE_ADDR_VOID_PTR:%.+]] = call{{.*}} i8* @__kmpc_threadprivate_cached
+// CHECK:       [[TC_THREADPRIVATE_ADDR:%.+]] = bitcast i8* [[TC_THREADPRIVATE_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
+// CHECK:       [[TC_PTR_REF_VOID_PTR:%.+]] = bitcast [[TEST_CLASS_TY]]* [[TC_THREADPRIVATE_ADDR]] to i8*
+// CHECK:       store i8* [[TC_PTR_REF_VOID_PTR]], i8** [[TC_PTR_REF]],
+// CHECK:       [[A2_PTR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[COPY_LIST]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK:       [[BITCAST:%.+]] = bitcast [2 x i8]* [[A2_ADDR]] to i8*
+// CHECK:       store i8* [[BITCAST]], i8** [[A2_PTR_REF]],
+// CHECK:       [[TC2_PTR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[COPY_LIST]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK:       [[TC2_THREADPRIVATE_ADDR_VOID_PTR:%.+]] = call{{.*}} i8* @__kmpc_threadprivate_cached
+// CHECK:       [[TC2_THREADPRIVATE_ADDR:%.+]] = bitcast i8* [[TC2_THREADPRIVATE_ADDR_VOID_PTR]] to [2 x [[TEST_CLASS_TY]]]*
+// CHECK:       [[TC2_PTR_REF_VOID_PTR:%.+]] = bitcast [2 x [[TEST_CLASS_TY]]]* [[TC2_THREADPRIVATE_ADDR]] to i8*
+// CHECK:       store i8* [[TC2_PTR_REF_VOID_PTR]], i8** [[TC2_PTR_REF]],
+// CHECK:       [[COPY_LIST_VOID_PTR:%.+]] = bitcast [5 x i8*]* [[COPY_LIST]] to i8*
+// CHECK:       [[DID_IT_VAL:%.+]] = load i32, i32* [[DID_IT]],
+// CHECK:       call void @__kmpc_copyprivate([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i64 40, i8* [[COPY_LIST_VOID_PTR]], void (i8*, i8*)* [[COPY_FUNC:@.+]], i32 [[DID_IT_VAL]])
+// CHECK-NOT:   call {{.+}} @__kmpc_cancel_barrier
+#pragma omp single copyprivate(a, c, tc, a2, tc2)
+  foo();
+// CHECK-NOT:   call i32 @__kmpc_single
+// CHECK-NOT:   call void @__kmpc_end_single
+  return a;
+}
+
+// CHECK: void [[COPY_FUNC]](i8*, i8*)
+// CHECK: store i8* %0, i8** [[DST_ADDR_REF:%.+]],
+// CHECK: store i8* %1, i8** [[SRC_ADDR_REF:%.+]],
+// CHECK: [[DST_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[DST_ADDR_REF]],
+// CHECK: [[DST_ADDR:%.+]] = bitcast i8* [[DST_ADDR_VOID_PTR]] to [5 x i8*]*
+// CHECK: [[SRC_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[SRC_ADDR_REF]],
+// CHECK: [[SRC_ADDR:%.+]] = bitcast i8* [[SRC_ADDR_VOID_PTR]] to [5 x i8*]*
+// CHECK: [[DST_A_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DST_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[DST_A_ADDR:%.+]] = load i8*, i8** [[DST_A_ADDR_REF]],
+// CHECK: [[SRC_A_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[SRC_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SRC_A_ADDR:%.+]] = load i8*, i8** [[SRC_A_ADDR_REF]],
+// CHECK: [[SRC_A_VAL:%.+]] = load i8, i8* [[SRC_A_ADDR]],
+// CHECK: store i8 [[SRC_A_VAL]], i8* [[DST_A_ADDR]],
+// CHECK: [[DST_C_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DST_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[DST_C_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[DST_C_ADDR_REF]],
+// CHECK: [[DST_C_ADDR:%.+]] = bitcast i8* [[DST_C_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
+// CHECK: [[SRC_C_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[SRC_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[SRC_C_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[SRC_C_ADDR_REF]],
+// CHECK: [[SRC_C_ADDR:%.+]] = bitcast i8* [[SRC_C_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
+// CHECK: call{{.*}} [[TEST_CLASS_TY_ASSIGN:@.+]]([[TEST_CLASS_TY]]* [[DST_C_ADDR]], [[TEST_CLASS_TY]]* {{.*}}[[SRC_C_ADDR]])
+// CHECK: [[DST_TC_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DST_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[DST_TC_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[DST_TC_ADDR_REF]],
+// CHECK: [[DST_TC_ADDR:%.+]] = bitcast i8* [[DST_TC_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
+// CHECK: [[SRC_TC_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[SRC_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[SRC_TC_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[SRC_TC_ADDR_REF]],
+// CHECK: [[SRC_TC_ADDR:%.+]] = bitcast i8* [[SRC_TC_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
+// CHECK: call{{.*}} [[TEST_CLASS_TY_ASSIGN]]([[TEST_CLASS_TY]]* [[DST_TC_ADDR]], [[TEST_CLASS_TY]]* {{.*}}[[SRC_TC_ADDR]])
+// CHECK: [[DST_A2_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DST_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[DST_A2_ADDR:%.+]] = load i8*, i8** [[DST_A2_ADDR_REF]],
+// CHECK: [[SRC_A2_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[SRC_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[SRC_A2_ADDR:%.+]] = load i8*, i8** [[SRC_A2_ADDR_REF]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[DST_A2_ADDR]], i8* [[SRC_A2_ADDR]], i{{[0-9]+}} 2, i{{[0-9]+}} 1, i1 false)
+// CHECK: [[DST_TC2_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DST_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: [[DST_TC2_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[DST_TC2_ADDR_REF]],
+// CHECK: [[DST_TC2_ADDR:%.+]] = bitcast i8* [[DST_TC2_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
+// CHECK: [[SRC_TC2_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[SRC_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: [[SRC_TC2_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[SRC_TC2_ADDR_REF]],
+// CHECK: [[SRC_TC2_ADDR:%.+]] = bitcast i8* [[SRC_TC2_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
+// CHECK: br i1
+// CHECK: call{{.*}} [[TEST_CLASS_TY_ASSIGN]]([[TEST_CLASS_TY]]* %{{.+}}, [[TEST_CLASS_TY]]* {{.*}})
+// CHECK: br i1
+// CHECK: ret void
+
+// CHECK-LABEL:      parallel_single
+// TERM_DEBUG-LABEL: parallel_single
+void parallel_single() {
+#pragma omp parallel
+#pragma omp single
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call i32 @__kmpc_single({{.+}}), !dbg [[DBG_LOC_START:![0-9]+]]
+  // TERM_DEBUG:     invoke void {{.*}}foo{{.*}}()
+  // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_end_single({{.+}}), !dbg [[DBG_LOC_END:![0-9]+]]
+  // TERM_DEBUG:     [[TERM_LPAD]]
+  // TERM_DEBUG:     call void @__clang_call_terminate
+  // TERM_DEBUG:     unreachable
+  foo();
+}
+// TERM_DEBUG-DAG: [[DBG_LOC_START]] = !DILocation(line: [[@LINE-12]],
+// TERM_DEBUG-DAG: [[DBG_LOC_END]] = !DILocation(line: [[@LINE-3]],
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St &operator=(const St &) { return *this; };
+  ~St() {}
+};
+
+void array_func(int n, int a[n], St s[2]) {
+// ARRAY: call void @__kmpc_copyprivate(%ident_t* @{{.+}}, i32 %{{.+}}, i64 16, i8* %{{.+}}, void (i8*, i8*)* [[CPY:@.+]], i32 %{{.+}})
+#pragma omp single copyprivate(a, s)
+  ;
+}
+// ARRAY: define internal void [[CPY]]
+// ARRAY: store i32* %{{.+}}, i32** %{{.+}},
+// ARRAY: store %struct.St* %{{.+}}, %struct.St** %{{.+}},
+#endif
diff --git a/test/OpenMP/single_copyprivate_messages.cpp b/test/OpenMP/single_copyprivate_messages.cpp
index 7bb145c6d325..4714753f494e 100644
--- a/test/OpenMP/single_copyprivate_messages.cpp
+++ b/test/OpenMP/single_copyprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -18,18 +18,18 @@ public:
   S3() : a(0) {}
   S3 &operator=(S3 &s3) { return *this; }
 };
-class S4 { // expected-note 2 {{'S4' declared here}}
+class S4 {
   int a;
   S4();
-  S4 &operator=(const S4 &s4);
+  S4 &operator=(const S4 &s4); // expected-note 3 {{implicitly declared private here}}
 
 public:
   S4(int v) : a(v) {}
 };
-class S5 { // expected-note 2 {{'S5' declared here}}
+class S5 {
   int a;
   S5() : a(0) {}
-  S5 &operator=(const S5 &s5) { return *this; }
+  S5 &operator=(const S5 &s5) { return *this; } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S5(int v) : a(v) {}
@@ -37,13 +37,14 @@ public:
 
 S2 k;
 S3 h;
-S4 l(3); // expected-note 2 {{'l' defined here}}
-S5 m(4); // expected-note 2 {{'m' defined here}}
+S4 l(3);
+S5 m(4);
 #pragma omp threadprivate(h, k, l, m)
 
 template <class T, class C>
 T tmain(T argc, C **argv) {
   T i;
+  static T TA;
 #pragma omp parallel
 #pragma omp single copyprivate // expected-error {{expected '(' after 'copyprivate'}}
 #pragma omp parallel
@@ -57,7 +58,7 @@ T tmain(T argc, C **argv) {
 #pragma omp parallel
 #pragma omp single copyprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
 #pragma omp parallel
-#pragma omp single copyprivate(l) // expected-error {{copyprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp single copyprivate(l) // expected-error 2 {{'operator=' is a private member of 'S4'}}
 #pragma omp parallel
 #pragma omp single copyprivate(S1) // expected-error {{'S1' does not refer to a value}}
 #pragma omp parallel
@@ -65,7 +66,7 @@ T tmain(T argc, C **argv) {
 #pragma omp parallel // expected-note {{implicitly determined as shared}}
 #pragma omp single copyprivate(i) // expected-error {{copyprivate variable must be threadprivate or private in the enclosing context}}
 #pragma omp parallel
-#pragma omp single copyprivate(m) // expected-error {{copyprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp single copyprivate(m) // expected-error 2 {{'operator=' is a private member of 'S5'}}
   foo();
 #pragma omp parallel private(i)
   {
@@ -95,12 +96,31 @@ T tmain(T argc, C **argv) {
 #pragma omp parallel
 #pragma omp single firstprivate(i) copyprivate(i) // expected-error {{firstprivate variable cannot be copyprivate}} expected-note {{defined as firstprivate}}
   foo();
+#pragma omp parallel private(TA)
+  {
+#pragma omp single copyprivate(TA)
+    TA = 99;
+  }
 
   return T();
 }
 
+void bar(S4 a[2], int n, int b[n]) {
+#pragma omp single copyprivate(a, b)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   int i;
+  static int intA;
 #pragma omp parallel
 #pragma omp single copyprivate // expected-error {{expected '(' after 'copyprivate'}}
 #pragma omp parallel
@@ -114,7 +134,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel
 #pragma omp single copyprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
 #pragma omp parallel
-#pragma omp single copyprivate(l) // expected-error {{copyprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp single copyprivate(l, B::x) // expected-error {{'operator=' is a private member of 'S4'}}
 #pragma omp parallel
 #pragma omp single copyprivate(S1) // expected-error {{'S1' does not refer to a value}}
 #pragma omp parallel
@@ -122,7 +142,7 @@ int main(int argc, char **argv) {
 #pragma omp parallel // expected-note {{implicitly determined as shared}}
 #pragma omp single copyprivate(i) // expected-error {{copyprivate variable must be threadprivate or private in the enclosing context}}
 #pragma omp parallel
-#pragma omp single copyprivate(m) // expected-error {{copyprivate variable must have an accessible, unambiguous copy assignment operator}}
+#pragma omp single copyprivate(m) // expected-error {{'operator=' is a private member of 'S5'}}
   foo();
 #pragma omp parallel private(i)
   {
@@ -152,6 +172,13 @@ int main(int argc, char **argv) {
 #pragma omp parallel
 #pragma omp single firstprivate(i) copyprivate(i) // expected-error {{firstprivate variable cannot be copyprivate}} expected-note {{defined as firstprivate}}
   foo();
+#pragma omp single copyprivate(i) nowait // expected-error {{the 'copyprivate' clause must not be used with the 'nowait' clause}} expected-note {{'nowait' clause is here}}
+  foo();
+#pragma omp parallel private(intA)
+  {
+#pragma omp single copyprivate(intA)
+    intA = 99;
+  }
 
   return tmain(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
 }
diff --git a/test/OpenMP/single_firstprivate_codegen.cpp b/test/OpenMP/single_firstprivate_codegen.cpp
new file mode 100644
index 000000000000..e30c00df19be
--- /dev/null
+++ b/test/OpenMP/single_firstprivate_codegen.cpp
@@ -0,0 +1,251 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {}
+  ~St() {}
+};
+
+volatile int g = 1212;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp parallel
+#pragma omp single firstprivate(t_var, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+// CHECK: [[TEST:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> test;
+// CHECK-DAG: [[T_VAR:@.+]] = global i{{[0-9]+}} 333,
+int t_var = 333;
+// CHECK-DAG: [[VEC:@.+]] = global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+int vec[] = {1, 2};
+// CHECK-DAG: [[S_ARR:@.+]] = global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+S<float> s_arr[] = {1, 2};
+// CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> var(3);
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
+
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: ([[S_FLOAT_TY]]*)* [[S_FLOAT_TY_DESTR:@[^ ]+]] {{[^,]+}}, {{.+}}([[S_FLOAT_TY]]* [[TEST]]
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+// LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+// LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp single firstprivate(g)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: call i32 @__kmpc_single(
+    // LAMBDA: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G]]
+    // LAMBDA: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    g = 1;
+    // LAMBDA: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_end_single(
+    // LAMBDA: call i32 @__kmpc_cancel_barrier(
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+// BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+// BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp single firstprivate(g)
+   {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // BLOCKS: call i32 @__kmpc_single(
+    // BLOCKS: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G]]
+    // BLOCKS: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    g = 1;
+    // BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_end_single(
+    // BLOCKS: call i32 @__kmpc_cancel_barrier(
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp single firstprivate(t_var, vec, s_arr, var) nowait
+  {
+    {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    }
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+
+// CHECK: call i32 @__kmpc_single(
+// firstprivate t_var(t_var)
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// firstprivate vec(vec)
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+
+// firstprivate s_arr(s_arr)
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: getelementptr inbounds ([2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+
+// firstprivate var(var)
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+
+// ~(firstprivate var), ~(firstprivate s_arr)
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_end_single(
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call i32 @__kmpc_single(
+// firstprivate t_var(t_var)
+// CHECK: [[T_VAR_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_PTR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+
+// firstprivate vec(vec)
+// CHECK: [[VEC_PTR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PTR_REF:%.+]],
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+
+// firstprivate s_arr(s_arr)
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+
+// firstprivate var(var)
+// CHECK: [[VAR_REF_PTR:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_REF_PTR]],
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+
+// ~(firstprivate var), ~(firstprivate s_arr)
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+
+// CHECK: call void @__kmpc_end_single(
+
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[SINGLE_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: ret void
+#endif
+
diff --git a/test/OpenMP/single_firstprivate_messages.cpp b/test/OpenMP/single_firstprivate_messages.cpp
index 9f6f16083544..6f21a42208f4 100644
--- a/test/OpenMP/single_firstprivate_messages.cpp
+++ b/test/OpenMP/single_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -132,6 +132,14 @@ int foomain(int argc, char **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -203,7 +211,7 @@ int main(int argc, char **argv) {
 #pragma omp single firstprivate(m) // OK
   foo();
 #pragma omp parallel
-#pragma omp single firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+#pragma omp single firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
   foo();
 #pragma omp parallel
 #pragma omp single private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
diff --git a/test/OpenMP/single_misc_messages.c b/test/OpenMP/single_misc_messages.c
index 7c10ca0a2887..2c922ddfd4f0 100644
--- a/test/OpenMP/single_misc_messages.c
+++ b/test/OpenMP/single_misc_messages.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fopenmp=libiomp5 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
 
 void foo();
 
diff --git a/test/OpenMP/single_private_codegen.cpp b/test/OpenMP/single_private_codegen.cpp
new file mode 100644
index 000000000000..4f78d082b82b
--- /dev/null
+++ b/test/OpenMP/single_private_codegen.cpp
@@ -0,0 +1,182 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[CAP_MAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]* }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK: [[CAP_TMAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp parallel
+#pragma omp single private(t_var, vec, s_arr, s_arr, var, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* %{{.+}})
+#pragma omp parallel
+#pragma omp single private(g)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+    // LAMBDA: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    g = 1;
+    // LAMBDA: call i32 @__kmpc_single(
+    // LAMBDA: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store double* [[G_PRIVATE_ADDR]], double** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: call{{( x86_thiscallcc)?}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    // LAMBDA: call void @__kmpc_end_single(
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+      // LAMBDA: store volatile double 2.0{{.+}}, double* [[G_REF]]
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i8* {{.+}})
+#pragma omp parallel
+#pragma omp single private(g)
+  {
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* %{{.+}}, i32* %{{.+}}, %{{.+}}* [[ARG:%.+]])
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+    // BLOCKS: store %{{.+}}* [[ARG]], %{{.+}}** [[ARG_REF:%.+]],
+    g = 1;
+    // BLOCKS: call i32 @__kmpc_single(
+    // BLOCKS: store volatile double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: double* [[G_PRIVATE_ADDR]]
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: call void @__kmpc_end_single(
+    ^{
+      // BLOCKS: define {{.+}} void {{@.+}}(i8*
+      g = 2;
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: store volatile double 2.0{{.+}}, double*
+      // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+      // BLOCKS: ret
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+#pragma omp parallel
+#pragma omp single private(t_var, vec, s_arr, s_arr, var, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: %{{.+}} = bitcast [[CAP_MAIN_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_MAIN_TY]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_MAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK: call i32 @__kmpc_single(
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_end_single(
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[CAP_TMAIN_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* [[GTID_ADDR:%.+]], i{{[0-9]+}}* %{{.+}}, [[CAP_TMAIN_TY]]* %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK-NOT: alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK: call i32 @__kmpc_single(
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: call void @__kmpc_end_single(
+// CHECK: ret void
+#endif
+
diff --git a/test/OpenMP/single_private_messages.cpp b/test/OpenMP/single_private_messages.cpp
index 8bdc48f1a506..0e04e0f5aff1 100644
--- a/test/OpenMP/single_private_messages.cpp
+++ b/test/OpenMP/single_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -91,6 +91,14 @@ int foomain(I argc, C **argv) {
   return 0;
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
@@ -118,7 +126,7 @@ int main(int argc, char **argv) {
   foo();
 #pragma omp single private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   foo();
-#pragma omp single private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp single private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   foo();
 #pragma omp single shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp single'}}
   foo();
diff --git a/test/OpenMP/target_ast_print.cpp b/test/OpenMP/target_ast_print.cpp
index c57c04955f8d..6955cf31112e 100644
--- a/test/OpenMP/target_ast_print.cpp
+++ b/test/OpenMP/target_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/target_if_messages.cpp b/test/OpenMP/target_if_messages.cpp
index 50a9ed2a202f..5193b64ff720 100644
--- a/test/OpenMP/target_if_messages.cpp
+++ b/test/OpenMP/target_if_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
diff --git a/test/OpenMP/target_messages.cpp b/test/OpenMP/target_messages.cpp
index fb492281e2b4..ebac51ae0d32 100644
--- a/test/OpenMP/target_messages.cpp
+++ b/test/OpenMP/target_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -o - %s
 
 void foo() {
 }
diff --git a/test/OpenMP/task_ast_print.cpp b/test/OpenMP/task_ast_print.cpp
index 2b43c0b0896d..55407c10baae 100644
--- a/test/OpenMP/task_ast_print.cpp
+++ b/test/OpenMP/task_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/task_codegen.cpp b/test/OpenMP/task_codegen.cpp
new file mode 100644
index 000000000000..1c28fbc0568e
--- /dev/null
+++ b/test/OpenMP/task_codegen.cpp
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[STRUCT_SHAREDS:%.+]] = type { i8*, [[STRUCT_S:%.+]]* }
+// CHECK-DAG: [[KMP_TASK_T:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+struct S {
+  int a;
+  S() : a(0) {}
+  S(const S &s) : a(s.a) {}
+  ~S() {}
+};
+int a;
+// CHECK-LABEL : @main
+int main() {
+// CHECK: [[B:%.+]] = alloca i8
+// CHECK: [[S:%.+]] = alloca [[STRUCT_S]]
+  char b;
+  S s;
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T]]* @{{.+}})
+// CHECK: [[B_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS]], [[STRUCT_SHAREDS]]* [[CAPTURES:%.+]], i32 0, i32 0
+// CHECK: store i8* [[B]], i8** [[B_REF]]
+// CHECK: [[S_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS]], [[STRUCT_SHAREDS]]* [[CAPTURES]], i32 0, i32 1
+// CHECK: store [[STRUCT_S]]* [[S]], [[STRUCT_S]]** [[S_REF]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 32, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
+// CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS]]* [[CAPTURES]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 16, i32 8, i1 false)
+// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 3
+// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+#pragma omp task shared(a, b, s)
+  {
+    a = 15;
+    b = a;
+    s.a = 10;
+  }
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
+// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+#pragma omp task untied
+  {
+    a = 1;
+  }
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY3:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
+// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+#pragma omp task final(true)
+  {
+    a = 2;
+  }
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY4:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.*}}, i32 0, i32 3
+// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+  const bool flag = false;
+#pragma omp task final(flag)
+  {
+    a = 3;
+  }
+// CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]]
+// CHECK: [[CMP:%.+]] = icmp ne i8 [[B_VAL]], 0
+// CHECK: [[FINAL:%.+]] = select i1 [[CMP]], i32 2, i32 0
+// CHECK: [[FLAGS:%.+]] = or i32 [[FINAL]], 1
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 [[FLAGS]], i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY5:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
+// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+#pragma omp task final(b)
+  {
+    a = 4;
+  }
+  return a;
+}
+// CHECK: define internal i32 [[TASK_ENTRY1]](i32, [[KMP_TASK_T]]{{.*}}*)
+// CHECK: store i32 15, i32* [[A_PTR:@.+]]
+// CHECK: [[A_VAL:%.+]] = load i32, i32* [[A_PTR]]
+// CHECK: [[A_VAL_I8:%.+]] = trunc i32 [[A_VAL]] to i8
+// CHECK: store i8 [[A_VAL_I8]], i8* %{{.+}}
+// CHECK: store i32 10, i32* %{{.+}}
+
+// CHECK: define internal i32 [[TASK_ENTRY2]](i32, [[KMP_TASK_T]]{{.*}}*)
+// CHECK: store i32 1, i32* [[A_PTR:@.+]]
+
+// CHECK: define internal i32 [[TASK_ENTRY3]](i32, [[KMP_TASK_T]]{{.*}}*)
+// CHECK: store i32 2, i32* [[A_PTR:@.+]]
+
+// CHECK: define internal i32 [[TASK_ENTRY4]](i32, [[KMP_TASK_T]]{{.*}}*)
+// CHECK: store i32 3, i32* [[A_PTR:@.+]]
+
+// CHECK: define internal i32 [[TASK_ENTRY5]](i32, [[KMP_TASK_T]]{{.*}}*)
+// CHECK: store i32 4, i32* [[A_PTR:@.+]]
+#endif
+
diff --git a/test/OpenMP/task_default_messages.cpp b/test/OpenMP/task_default_messages.cpp
index 8da6b1aeea17..b3df295a8fa2 100644
--- a/test/OpenMP/task_default_messages.cpp
+++ b/test/OpenMP/task_default_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/task_final_messages.cpp b/test/OpenMP/task_final_messages.cpp
index 0b52e6aca150..a217ea2ef30e 100644
--- a/test/OpenMP/task_final_messages.cpp
+++ b/test/OpenMP/task_final_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
diff --git a/test/OpenMP/task_firstprivate_codegen.cpp b/test/OpenMP/task_firstprivate_codegen.cpp
new file mode 100644
index 000000000000..e1358d9ecc6a
--- /dev/null
+++ b/test/OpenMP/task_firstprivate_codegen.cpp
@@ -0,0 +1,433 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S(const S &s, T t = T()) : f(s.f + t) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]* }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [[S_DOUBLE_TY]], [2 x [[S_DOUBLE_TY]]], i32, [2 x i32]
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> ttt;
+  S<T> test(ttt);
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp task firstprivate(t_var, vec, s_arr, s_arr, var, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 40, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_ADDR_REF]]
+// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_REF]]
+// LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+// LAMBDA: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
+// LAMBDA: ret
+#pragma omp task firstprivate(g)
+  {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store volatile double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}*)
+    g = 1;
+    // LAMBDA: store volatile double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 40, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_REF:%.+]] = load double*, double** [[G_ADDR_REF]]
+  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_REF]]
+  // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+  // BLOCKS: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
+  // BLOCKS: ret
+#pragma omp task firstprivate(g)
+  {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store volatile double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}*)
+    g = 1;
+    // BLOCKS: store volatile double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      g = 2;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test(ttt);
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp task firstprivate(var, t_var, s_arr, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 72, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_MAIN_TY]]*
+
+// Constructors for s_arr and var.
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VAR_REF:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[VAR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}[[VAR_REF]],
+
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 2
+// CHECK: load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[T_VAR_ADDR_REF]],
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_REF]],
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]],
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+
+// Start task.
+// CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]*)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]])
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]*)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 56, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_TMAIN_TY]]*
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[T_VAR_ADDR_REF]],
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_REF]],
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]],
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+
+// Start task.
+// CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]*)
+
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]*)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) {}
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call i32 @__kmpc_omp_task(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp task firstprivate(a, s)
+  ;
+}
+#endif
+
diff --git a/test/OpenMP/task_firstprivate_messages.cpp b/test/OpenMP/task_firstprivate_messages.cpp
index 6c5ccfca57bf..0126d47c50f9 100644
--- a/test/OpenMP/task_firstprivate_messages.cpp
+++ b/test/OpenMP/task_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
@@ -51,6 +51,19 @@ public:
 S3 h;
 #pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
 
+void bar(int n, int b[n]) {
+#pragma omp task firstprivate(b)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -74,7 +87,7 @@ int main(int argc, char **argv) {
 #pragma omp task firstprivate(S2::S2s)
 #pragma omp task firstprivate(S2::S2sc)
 #pragma omp task firstprivate(e, g)          // expected-error 2 {{calling a private constructor of class 'S4'}} expected-error 2 {{calling a private constructor of class 'S5'}}
-#pragma omp task firstprivate(h)             // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+#pragma omp task firstprivate(h, B::x)       // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
 #pragma omp task private(i), firstprivate(i) // expected-error {{private variable cannot be firstprivate}} expected-note{{defined as private}}
   foo();
 #pragma omp task shared(i)
diff --git a/test/OpenMP/task_if_codegen.cpp b/test/OpenMP/task_if_codegen.cpp
new file mode 100644
index 000000000000..d4fd6bb1ebb3
--- /dev/null
+++ b/test/OpenMP/task_if_codegen.cpp
@@ -0,0 +1,133 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void fn1();
+void fn2();
+void fn3();
+void fn4();
+void fn5();
+void fn6();
+
+int Arg;
+
+// CHECK-LABEL: define void @{{.+}}gtid_test
+void gtid_test() {
+// CHECK:  call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 1, {{.+}}* [[GTID_TEST_REGION1:@.+]] to void
+#pragma omp parallel
+#pragma omp task if (false)
+  gtid_test();
+// CHECK: ret void
+}
+
+// CHECK: define internal void [[GTID_TEST_REGION1]](i32* [[GTID_PARAM:%.+]], i
+// CHECK: store i32* [[GTID_PARAM]], i32** [[GTID_ADDR_REF:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_REF]]
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(
+// CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
+// CHECK: call void @__kmpc_omp_task_begin_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: call i32 [[GTID_TEST_REGION2:@.+]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
+// CHECK: call void @__kmpc_omp_task_complete_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: ret void
+
+// CHECK: define internal i32 [[GTID_TEST_REGION2]](
+// CHECK: call void @{{.+}}gtid_test
+// CHECK: ret i32
+
+template <typename T>
+int tmain(T Arg) {
+#pragma omp task if (true)
+  fn1();
+#pragma omp task if (false)
+  fn2();
+#pragma omp task if (Arg)
+  fn3();
+  return 0;
+}
+
+// CHECK-LABEL: @main
+int main() {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN4:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+#pragma omp task if (true)
+  fn4();
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(
+// CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
+// CHECK: call void @__kmpc_omp_task_begin_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: call i32 [[CAP_FN5:@.+]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
+// CHECK: call void @__kmpc_omp_task_complete_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+#pragma omp task if (false)
+  fn5();
+
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN6:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
+// CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
+// CHECK: [[OMP_THEN]]
+// CHECK: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: br label %[[OMP_END:.+]]
+// CHECK: [[OMP_ELSE]]
+// CHECK: call void @__kmpc_omp_task_begin_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: call i32 [[CAP_FN6:@.+]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
+// CHECK: call void @__kmpc_omp_task_complete_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: br label %[[OMP_END]]
+// CHECK: [[OMP_END]]
+#pragma omp task if (Arg)
+  fn6();
+  // CHECK: = call {{.*}}i{{.+}} @{{.+}}tmain
+  return tmain(Arg);
+}
+
+// CHECK: define internal i32 [[CAP_FN4]]
+// CHECK: call void @{{.+}}fn4
+// CHECK: ret i32
+
+// CHECK: define internal i32 [[CAP_FN5]]
+// CHECK: call void @{{.+}}fn5
+// CHECK: ret i32
+
+// CHECK: define internal i32 [[CAP_FN6]]
+// CHECK: call void @{{.+}}fn6
+// CHECK: ret i32
+
+// CHECK-LABEL: define {{.+}} @{{.+}}tmain
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32,  %{{[^*]+}}*)* [[CAP_FN1:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(
+// CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
+// CHECK: call void @__kmpc_omp_task_begin_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: call i32 [[CAP_FN2:@.+]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
+// CHECK: call void @__kmpc_omp_task_complete_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN3:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
+// CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
+// CHECK: [[OMP_THEN]]
+// CHECK: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: br label %[[OMP_END:.+]]
+// CHECK: [[OMP_ELSE]]
+// CHECK: call void @__kmpc_omp_task_begin_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: call i32 [[CAP_FN3:@.+]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
+// CHECK: call void @__kmpc_omp_task_complete_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
+// CHECK: br label %[[OMP_END]]
+// CHECK: [[OMP_END]]
+
+// CHECK: define internal i32 [[CAP_FN1]]
+// CHECK: call void @{{.+}}fn1
+// CHECK: ret i32
+
+// CHECK: define internal i32 [[CAP_FN2]]
+// CHECK: call void @{{.+}}fn2
+// CHECK: ret i32
+
+// CHECK: define internal i32 [[CAP_FN3]]
+// CHECK: call void @{{.+}}fn3
+// CHECK: ret i32
+
+#endif
diff --git a/test/OpenMP/task_if_messages.cpp b/test/OpenMP/task_if_messages.cpp
index 51900c0be2e5..ae6bb13093d9 100644
--- a/test/OpenMP/task_if_messages.cpp
+++ b/test/OpenMP/task_if_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
diff --git a/test/OpenMP/task_messages.cpp b/test/OpenMP/task_messages.cpp
index b02b43c2b395..ec67998afdce 100644
--- a/test/OpenMP/task_messages.cpp
+++ b/test/OpenMP/task_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 -o - %s
 
 void foo() {
 }
diff --git a/test/OpenMP/task_private_codegen.cpp b/test/OpenMP/task_private_codegen.cpp
new file mode 100644
index 000000000000..3a9ed7c58d63
--- /dev/null
+++ b/test/OpenMP/task_private_codegen.cpp
@@ -0,0 +1,388 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+
+// It doesn't pass on win32. Investigating.
+// REQUIRES: shell
+
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]* }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [[S_DOUBLE_TY]], [2 x [[S_DOUBLE_TY]]], i32, [2 x i32]
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp task private(t_var, vec, s_arr, s_arr, var, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 40, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// LAMBDA: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
+// LAMBDA: ret
+#pragma omp task private(g)
+  {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store volatile double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}*)
+    g = 1;
+    // LAMBDA: store volatile double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 40, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
+  // BLOCKS: ret
+#pragma omp task private(g)
+  {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store volatile double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}*)
+    g = 1;
+    // BLOCKS: store volatile double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      g = 2;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp task private(var, t_var, s_arr, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 72, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+
+// Start task.
+// CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]*)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]])
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]*)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 56, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+
+// Start task.
+// CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]*)
+
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]*)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St &operator=(const St &) { return *this; };
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call i32 @__kmpc_omp_task(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp task private(a, s)
+  ;
+}
+#endif
+
diff --git a/test/OpenMP/task_private_messages.cpp b/test/OpenMP/task_private_messages.cpp
index 9a3bb757681c..c2c1f519d0c1 100644
--- a/test/OpenMP/task_private_messages.cpp
+++ b/test/OpenMP/task_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
@@ -14,7 +14,7 @@ class S2 {
 
 public:
   S2() : a(0) {}
-  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static float S2s;
 };
 const S2 b;
 const S2 ba[5];
@@ -45,6 +45,19 @@ public:
 int threadvar;
 #pragma omp threadprivate(threadvar) // expected-note {{defined as threadprivate or thread local}}
 
+void bar(int n, int b[n]) {
+#pragma omp task private(b)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
@@ -65,9 +78,9 @@ int main(int argc, char **argv) {
 #pragma omp task private(ba)
 #pragma omp task private(ca)           // expected-error {{shared variable cannot be private}}
 #pragma omp task private(da)           // expected-error {{shared variable cannot be private}}
-#pragma omp task private(S2::S2s)      // expected-error {{shared variable cannot be private}}
+#pragma omp task private(S2::S2s)
 #pragma omp task private(e, g)         // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
-#pragma omp task private(threadvar)    // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp task private(threadvar, B::x)    // expected-error 2 {{threadprivate or thread local variable cannot be private}}
 #pragma omp task shared(i), private(i) // expected-error {{shared variable cannot be private}} expected-note {{defined as shared}}
   foo();
 #pragma omp task firstprivate(i) private(i) // expected-error {{firstprivate variable cannot be private}} expected-note {{defined as firstprivate}}
diff --git a/test/OpenMP/task_shared_messages.cpp b/test/OpenMP/task_shared_messages.cpp
index 747923721b9a..bf3b8ba28373 100644
--- a/test/OpenMP/task_shared_messages.cpp
+++ b/test/OpenMP/task_shared_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 void foo() {
 }
@@ -48,6 +48,14 @@ public:
 S3 h;
 #pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -83,7 +91,7 @@ int main(int argc, char **argv) {
   foo();
 #pragma omp task shared(e, g)
   foo();
-#pragma omp task shared(h)             // expected-error {{threadprivate or thread local variable cannot be shared}}
+#pragma omp task shared(h, B::x)             // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
   foo();
 #pragma omp task private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
   foo();
diff --git a/test/OpenMP/taskwait_ast_print.cpp b/test/OpenMP/taskwait_ast_print.cpp
index 994262242eb7..b5b0f0dd8d31 100644
--- a/test/OpenMP/taskwait_ast_print.cpp
+++ b/test/OpenMP/taskwait_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/taskwait_codegen.cpp b/test/OpenMP/taskwait_codegen.cpp
new file mode 100644
index 000000000000..85f20e80561f
--- /dev/null
+++ b/test/OpenMP/taskwait_codegen.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+template <class T>
+T tmain(T argc) {
+  static T a;
+#pragma omp taskwait
+  return a + argc;
+}
+int main(int argc, char **argv) {
+#pragma omp taskwait
+  return tmain(argc);
+}
+
+// CHECK-LABEL: @main
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// CHECK: call i32 @__kmpc_omp_taskwait(%{{.+}}* @{{.+}}, i32 [[GTID]])
+
+// CHECK-LABEL: tmain
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%{{.+}}* @{{.+}})
+// CHECK: call i32 @__kmpc_omp_taskwait(%{{.+}}* @{{.+}}, i32 [[GTID]])
+
+
+#endif
diff --git a/test/OpenMP/taskwait_messages.cpp b/test/OpenMP/taskwait_messages.cpp
index f325c7328372..084051354045 100644
--- a/test/OpenMP/taskwait_messages.cpp
+++ b/test/OpenMP/taskwait_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 template <class T>
 T tmain(T argc) {
diff --git a/test/OpenMP/taskyield_ast_print.cpp b/test/OpenMP/taskyield_ast_print.cpp
index 4c3ca47cba45..87679092d18c 100644
--- a/test/OpenMP/taskyield_ast_print.cpp
+++ b/test/OpenMP/taskyield_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/taskyield_codegen.cpp b/test/OpenMP/taskyield_codegen.cpp
new file mode 100644
index 000000000000..6815a0348e1c
--- /dev/null
+++ b/test/OpenMP/taskyield_codegen.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
+
+void foo() {}
+
+template <class T>
+T tmain(T argc) {
+  static T a;
+#pragma omp taskyield
+  return a + argc;
+}
+
+// CHECK-LABEL: @main
+int main(int argc, char **argv) {
+  static int a;
+#pragma omp taskyield
+  // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T]]* @{{.+}})
+  // CHECK: call i32 @__kmpc_omp_taskyield([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0)
+  // CHECK: call {{.+}} [[TMAIN_INT:@.+]](i{{[0-9][0-9]}}
+  // CHECK: call {{.+}} [[TMAIN_CHAR:@.+]](i{{[0-9]}}
+  return tmain(argc) + tmain(argv[0][0]) + a;
+}
+
+// CHECK: define {{.+}} [[TMAIN_INT]](
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T]]* @{{.+}})
+// CHECK: call i32 @__kmpc_omp_taskyield([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0)
+
+// CHECK: define {{.+}} [[TMAIN_CHAR]](
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T]]* @{{.+}})
+// CHECK: call i32 @__kmpc_omp_taskyield([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0)
+
+#endif
diff --git a/test/OpenMP/taskyield_messages.cpp b/test/OpenMP/taskyield_messages.cpp
index 7c355595d7ac..23c0b5339012 100644
--- a/test/OpenMP/taskyield_messages.cpp
+++ b/test/OpenMP/taskyield_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
 
 template <class T>
 T tmain(T argc) {
diff --git a/test/OpenMP/teams_ast_print.cpp b/test/OpenMP/teams_ast_print.cpp
index 394ec73a841c..6cea914c8626 100644
--- a/test/OpenMP/teams_ast_print.cpp
+++ b/test/OpenMP/teams_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/test/OpenMP/teams_default_messages.cpp b/test/OpenMP/teams_default_messages.cpp
index 4f5a267150f9..83909ccfe051 100644
--- a/test/OpenMP/teams_default_messages.cpp
+++ b/test/OpenMP/teams_default_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -o - %s
 
 void foo();
 
diff --git a/test/OpenMP/teams_firstprivate_messages.cpp b/test/OpenMP/teams_firstprivate_messages.cpp
index 3d4a21999eea..ac5197749bb4 100644
--- a/test/OpenMP/teams_firstprivate_messages.cpp
+++ b/test/OpenMP/teams_firstprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -49,6 +49,14 @@ public:
 S3 h;
 #pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = {0};
@@ -105,7 +113,7 @@ int main(int argc, char **argv) {
 #pragma omp teams firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   foo();
 #pragma omp target
-#pragma omp teams firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+#pragma omp teams firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
   foo();
 #pragma omp target
 #pragma omp teams private(i), firstprivate(i) // expected-error {{private variable cannot be firstprivate}} expected-note{{defined as private}}
diff --git a/test/OpenMP/teams_messages.cpp b/test/OpenMP/teams_messages.cpp
index 56ed548a0377..3a42a0e8dd6a 100644
--- a/test/OpenMP/teams_messages.cpp
+++ b/test/OpenMP/teams_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -o - %s
 
 void foo() {
 }
diff --git a/test/OpenMP/teams_private_messages.cpp b/test/OpenMP/teams_private_messages.cpp
index 16ecb74ea0cb..f50060ff29b3 100644
--- a/test/OpenMP/teams_private_messages.cpp
+++ b/test/OpenMP/teams_private_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -13,7 +13,7 @@ class S2 {
   mutable int a;
 public:
   S2():a(0) { }
-  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static float S2s;
 };
 const S2 b;
 const S2 ba[5];
@@ -41,6 +41,14 @@ public:
 int threadvar;
 #pragma omp threadprivate(threadvar) // expected-note {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5; // expected-note {{constant variable is predetermined as shared}}
   const int da[5] = { 0 }; // expected-note {{constant variable is predetermined as shared}}
@@ -88,13 +96,13 @@ int main(int argc, char **argv) {
   #pragma omp teams private(da) // expected-error {{shared variable cannot be private}}
   foo();
   #pragma omp target
-  #pragma omp teams private(S2::S2s) // expected-error {{shared variable cannot be private}}
+  #pragma omp teams private(S2::S2s)
   foo();
   #pragma omp target
   #pragma omp teams private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
   foo();
   #pragma omp target
-  #pragma omp teams private(threadvar) // expected-error {{threadprivate or thread local variable cannot be private}}
+  #pragma omp teams private(threadvar, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
   foo();
   #pragma omp target
   #pragma omp teams shared(i), private(i) // expected-error {{shared variable cannot be private}} expected-note {{defined as shared}}
diff --git a/test/OpenMP/teams_reduction_messages.cpp b/test/OpenMP/teams_reduction_messages.cpp
index afedfc3e46fd..93c4b141591f 100644
--- a/test/OpenMP/teams_reduction_messages.cpp
+++ b/test/OpenMP/teams_reduction_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -o - %s
 
 void foo() {
 }
@@ -11,12 +11,12 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+=(const S2 &arg) { return (*this); }
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
   S2(S2 &s2) : a(s2.a) {}
-  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static float S2s;
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
@@ -28,17 +28,17 @@ class S3 {
 public:
   S3() : a(0) {}
   S3(const S3 &s3) : a(s3.a) {}
-  S3 operator+=(const S3 &arg1) { return arg1; }
+  S3 operator+(const S3 &arg1) { return arg1; }
 };
-int operator+=(const S3 &arg1, const S3 &arg2) { return 5; }
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
 S3 c;               // expected-note 2 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
-class S4 {          // expected-note {{'S4' declared here}}
+class S4 {
   int a;
-  S4();
+  S4(); // expected-note {{implicitly declared private here}}
   S4(const S4 &s4);
-  S4 &operator+=(const S4 &arg) { return (*this); }
+  S4 &operator+(const S4 &arg) { return (*this); }
 
 public:
   S4(int v) : a(v) {}
@@ -46,26 +46,26 @@ public:
 S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
 class S5 {
   int a;
-  S5() : a(0) {}
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
   S5(const S5 &s5) : a(s5.a) {}
-  S5 &operator+=(const S5 &arg);
+  S5 &operator+(const S5 &arg);
 
 public:
   S5(int v) : a(v) {}
 };
-class S6 {
+class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
   int a;
 
 public:
   S6() : a(6) {}
   operator int() { return 6; }
-} o; // expected-note 2 {{'o' defined here}}
+} o;
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
 
 template <class T>       // expected-note {{declared here}}
-T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
+T tmain(T argc) {
   const T d = T();       // expected-note 4 {{'d' defined here}}
   const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
   T qa[5] = {T()};
@@ -74,7 +74,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   S3 &p = k;                   // expected-note 2 {{'p' defined here}}
   const T &r = da[(int)i];     // expected-note 2 {{'r' defined here}}
   T &q = qa[(int)i];           // expected-note 2 {{'q' defined here}}
-  T fl;                        // expected-note {{'fl' defined here}}
+  T fl;
 #pragma omp target
 #pragma omp teams reduction // expected-error {{expected '(' after 'reduction'}}
   foo();
@@ -97,10 +97,10 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp teams reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp teams reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp teams reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name}}
@@ -115,7 +115,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp teams reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}}
+#pragma omp teams reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified variable cannot be reduction}}
@@ -133,10 +133,10 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp teams reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}} expected-error {{a reduction variable with array type 'const float [5]'}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp teams reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp teams reduction(&& : S2::S2s)
   foo();
 #pragma omp target
 #pragma omp teams reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
@@ -145,7 +145,7 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
 #pragma omp teams reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp teams reduction(+ : o) // expected-error {{no viable overloaded '='}}
   foo();
 #pragma omp target
 #pragma omp teams private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -183,18 +183,26 @@ T tmain(T argc) {        // expected-note 2 {{'argc' defined here}}
   return T();
 }
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;       // expected-note 2 {{'d' defined here}}
   const int da[5] = {0}; // expected-note {{'da' defined here}}
   int qa[5] = {0};
-  S4 e(4); // expected-note {{'e' defined here}}
-  S5 g(5); // expected-note {{'g' defined here}}
+  S4 e(4);
+  S5 g(5);
   int i;
   int &j = i;                  // expected-note 2 {{'j' defined here}}
   S3 &p = k;                   // expected-note 2 {{'p' defined here}}
   const int &r = da[i];        // expected-note {{'r' defined here}}
   int &q = qa[i];              // expected-note {{'q' defined here}}
-  float fl;                    // expected-note {{'fl' defined here}}
+  float fl;
 #pragma omp target
 #pragma omp teams reduction // expected-error {{expected '(' after 'reduction'}}
   foo();
@@ -235,7 +243,7 @@ int main(int argc, char **argv) {
 #pragma omp teams reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}}
+#pragma omp teams reduction(+ : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(min : a, b, c, d, f) // expected-error {{reduction variable with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}}
@@ -253,22 +261,22 @@ int main(int argc, char **argv) {
 #pragma omp teams reduction(- : da) // expected-error {{a reduction variable with array type 'const int [5]'}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(^ : fl) // expected-error {{variable of type 'float' is not valid for specified reduction operation}}
+#pragma omp teams reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+#pragma omp teams reduction(&& : S2::S2s)
   foo();
 #pragma omp target
 #pragma omp teams reduction(&& : S2::S2sc) // expected-error {{const-qualified variable cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(& : e, g) // expected-error {{reduction variable must have an accessible, unambiguous default constructor}} expected-error {{variable of type 'S5' is not valid for specified reduction operation}}
+#pragma omp teams reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+#pragma omp teams reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : o) // expected-error {{variable of type 'class S6' is not valid for specified reduction operation}}
+#pragma omp teams reduction(+ : o) // expected-error {{no viable overloaded '='}}
   foo();
 #pragma omp target
 #pragma omp teams private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
diff --git a/test/OpenMP/teams_shared_messages.cpp b/test/OpenMP/teams_shared_messages.cpp
index ce2f917e207c..e14ba1e32f47 100644
--- a/test/OpenMP/teams_shared_messages.cpp
+++ b/test/OpenMP/teams_shared_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 %s
+// RUN: %clang_cc1 -verify -fopenmp %s
 
 void foo() {
 }
@@ -44,6 +44,14 @@ public:
 S3 h;
 #pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
 
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
 int main(int argc, char **argv) {
   const int d = 5;
   const int da[5] = { 0 };
@@ -94,7 +102,7 @@ int main(int argc, char **argv) {
   #pragma omp teams shared(e, g)
   foo();
   #pragma omp target
-  #pragma omp teams shared(h) // expected-error {{threadprivate or thread local variable cannot be shared}}
+  #pragma omp teams shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
   foo();
   #pragma omp target
   #pragma omp teams private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
diff --git a/test/OpenMP/threadprivate_ast_print.cpp b/test/OpenMP/threadprivate_ast_print.cpp
index 4d0d40e213f2..be2a79c52cbc 100644
--- a/test/OpenMP/threadprivate_ast_print.cpp
+++ b/test/OpenMP/threadprivate_ast_print.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -22,6 +22,8 @@ int a, b;
 // CHECK: int a;
 // CHECK: int b;
 #pragma omp threadprivate(a)
+#pragma omp threadprivate(a)
+// CHECK-NEXT: #pragma omp threadprivate(a)
 // CHECK-NEXT: #pragma omp threadprivate(a)
 #pragma omp threadprivate(d, b)
 // CHECK-NEXT: #pragma omp threadprivate(d,b)
diff --git a/test/OpenMP/threadprivate_codegen.cpp b/test/OpenMP/threadprivate_codegen.cpp
index 98b7917714b8..eea49443f8c9 100644
--- a/test/OpenMP/threadprivate_codegen.cpp
+++ b/test/OpenMP/threadprivate_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -DBODY -triple x86_64-unknown-unknown -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp=libiomp5 -DBODY -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK-DEBUG %s
+// RUN: %clang_cc1 -verify -fopenmp -DBODY -triple x86_64-unknown-unknown -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -DBODY -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK-DEBUG %s
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -103,7 +103,7 @@ struct S5 {
 
 // CHECK-DAG:  [[GS1:@.+]] = internal global [[S1]] zeroinitializer
 // CHECK-DAG:  [[GS1]].cache. = common global i8** null
-// CHECK-DAG:  [[DEFAULT_LOC:@.+]] = private unnamed_addr constant [[IDENT]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8]* {{@.+}}, i32 0, i32 0) }
+// CHECK-DAG:  [[DEFAULT_LOC:@.+]] = private unnamed_addr constant [[IDENT]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* {{@.+}}, i32 0, i32 0) }
 // CHECK-DAG:  [[GS2:@.+]] = internal global [[S2]] zeroinitializer
 // CHECK-DAG:  [[ARR_X:@.+]] = global [2 x [3 x [[S1]]]] zeroinitializer
 // CHECK-DAG:  [[ARR_X]].cache. = common global i8** null
@@ -133,25 +133,25 @@ struct S5 {
 // CHECK-DEBUG-DAG: [[ST_FLOAT_ST:@.+]] = linkonce_odr global float 2.300000e+01
 // CHECK-DEBUG-DAG: [[ST_S4_ST:@.+]] = linkonce_odr global %struct.S4 zeroinitializer
 // CHECK-DEBUG-DAG: [[LOC1:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;162;9;;\00"
-// CHECK-DEBUG-DAG: [[LOC2:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;216;9;;\00"
-// CHECK-DEBUG-DAG: [[LOC3:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;303;19;;\00"
-// CHECK-DEBUG-DAG: [[LOC4:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;328;9;;\00"
-// CHECK-DEBUG-DAG: [[LOC5:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;341;9;;\00"
-// CHECK-DEBUG-DAG: [[LOC6:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;358;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC7:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;375;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC8:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;401;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC9:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;422;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC10:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;437;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC11:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;454;27;;\00"
-// CHECK-DEBUG-DAG: [[LOC12:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;471;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC13:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;550;9;;\00"
-// CHECK-DEBUG-DAG: [[LOC14:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;567;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC15:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;593;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC16:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;614;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC17:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;629;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC18:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;646;27;;\00"
-// CHECK-DEBUG-DAG: [[LOC19:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;663;10;;\00"
-// CHECK-DEBUG-DAG: [[LOC20:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;275;9;;\00"
+// CHECK-DEBUG-DAG: [[LOC2:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;217;9;;\00"
+// CHECK-DEBUG-DAG: [[LOC3:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;304;19;;\00"
+// CHECK-DEBUG-DAG: [[LOC4:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;329;9;;\00"
+// CHECK-DEBUG-DAG: [[LOC5:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;342;9;;\00"
+// CHECK-DEBUG-DAG: [[LOC6:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;359;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC7:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;376;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC8:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;402;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC9:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;423;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC10:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;438;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC11:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;455;27;;\00"
+// CHECK-DEBUG-DAG: [[LOC12:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;472;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC13:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;551;9;;\00"
+// CHECK-DEBUG-DAG: [[LOC14:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;568;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC15:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;594;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC16:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;615;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC17:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;630;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC18:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;647;27;;\00"
+// CHECK-DEBUG-DAG: [[LOC19:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;foobar;664;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC20:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;276;9;;\00"
 
 struct Static {
   static S3 s;
@@ -160,19 +160,20 @@ struct Static {
 
 static S1 gs1(5);
 #pragma omp threadprivate(gs1)
+#pragma omp threadprivate(gs1)
 // CHECK:      define {{.*}} [[S1_CTOR:@.*]]([[S1]]* {{.*}},
 // CHECK:      define {{.*}} [[S1_DTOR:@.*]]([[S1]]* {{.*}})
 // CHECK:      define internal {{.*}}i8* [[GS1_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[S1]]*
 // CHECK-NEXT: call {{.*}} [[S1_CTOR]]([[S1]]* [[RES]], {{.*}} 5)
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      ret i8* [[ARG]]
 // CHECK-NEXT: }
 // CHECK:      define internal {{.*}}void [[GS1_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[S1]]*
 // CHECK-NEXT: call {{.*}} [[S1_DTOR]]([[S1]]* [[RES]])
 // CHECK-NEXT: ret void
@@ -182,22 +183,22 @@ static S1 gs1(5);
 // CHECK-NEXT: ret void
 // CHECK-NEXT: }
 // CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG:      store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC1]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG:      store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC1]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
 // CHECK-DEBUG:      @__kmpc_global_thread_num
 // CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i8* (i8*)* [[GS1_CTOR:@\.__kmpc_global_ctor_\..*]], i8* (i8*, i8*)* null, void (i8*)* [[GS1_DTOR:@\.__kmpc_global_dtor_\..*]])
 // CHECK-DEBUG:      define internal {{.*}}i8* [[GS1_CTOR]](i8*)
 // CHECK-DEBUG:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK-DEBUG:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[S1]]*
 // CHECK-DEBUG-NEXT: call {{.*}} [[S1_CTOR:@.+]]([[S1]]* [[RES]], {{.*}} 5)
-// CHECK-DEBUG:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG:      ret i8* [[ARG]]
 // CHECK-DEBUG-NEXT: }
 // CHECK-DEBUG:      define {{.*}} [[S1_CTOR]]([[S1]]* {{.*}},
 // CHECK-DEBUG:      define internal {{.*}}void [[GS1_DTOR]](i8*)
 // CHECK-DEBUG:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK-DEBUG:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[S1]]*
 // CHECK-DEBUG-NEXT: call {{.*}} [[S1_DTOR:@.+]]([[S1]]* [[RES]])
 // CHECK-DEBUG-NEXT: ret void
@@ -216,34 +217,34 @@ S1 arr_x[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
 #pragma omp threadprivate(arr_x)
 // CHECK:      define internal {{.*}}i8* [[ARR_X_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[RES:%.*]] = bitcast i8* [[ARG]] to [2 x [3 x [[S1]]]]*
-// CHECK:      [[ARR1:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]]* [[RES]], i{{.*}} 0, i{{.*}} 0
-// CHECK:      [[ARR:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR1]], i{{.*}} 0, i{{.*}} 0
+// CHECK:      [[ARR1:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[RES]], i{{.*}} 0, i{{.*}} 0
+// CHECK:      [[ARR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR1]], i{{.*}} 0, i{{.*}} 0
 // CHECK:      invoke {{.*}} [[S1_CTOR]]([[S1]]* [[ARR]], [[INT]] {{.*}}1)
-// CHECK:      [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]]* [[ARR]], i{{.*}} 1
+// CHECK:      [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR]], i{{.*}} 1
 // CHECK:      invoke {{.*}} [[S1_CTOR]]([[S1]]* [[ARR_ELEMENT]], [[INT]] {{.*}}2)
-// CHECK:      [[ARR_ELEMENT2:%.*]] = getelementptr inbounds [[S1]]* [[ARR_ELEMENT]], i{{.*}} 1
+// CHECK:      [[ARR_ELEMENT2:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_ELEMENT]], i{{.*}} 1
 // CHECK:      invoke {{.*}} [[S1_CTOR]]([[S1]]* [[ARR_ELEMENT2]], [[INT]] {{.*}}3)
-// CHECK:      [[ARR_ELEMENT3:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR1]], i{{.*}} 1
-// CHECK:      [[ARR_:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR_ELEMENT3]], i{{.*}} 0, i{{.*}} 0
+// CHECK:      [[ARR_ELEMENT3:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR1]], i{{.*}} 1
+// CHECK:      [[ARR_:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_ELEMENT3]], i{{.*}} 0, i{{.*}} 0
 // CHECK:      invoke {{.*}} [[S1_CTOR]]([[S1]]* [[ARR_]], [[INT]] {{.*}}4)
-// CHECK:      [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]]* [[ARR_]], i{{.*}} 1
+// CHECK:      [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_]], i{{.*}} 1
 // CHECK:      invoke {{.*}} [[S1_CTOR]]([[S1]]* [[ARR_ELEMENT]], [[INT]] {{.*}}5)
-// CHECK:      [[ARR_ELEMENT2:%.*]] = getelementptr inbounds [[S1]]* [[ARR_ELEMENT]], i{{.*}} 1
+// CHECK:      [[ARR_ELEMENT2:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_ELEMENT]], i{{.*}} 1
 // CHECK:      invoke {{.*}} [[S1_CTOR]]([[S1]]* [[ARR_ELEMENT2]], [[INT]] {{.*}}6)
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      ret i8* [[ARG]]
 // CHECK:      }
 // CHECK:      define internal {{.*}}void [[ARR_X_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[ARR_BEGIN:%.*]] = bitcast i8* [[ARG]] to [[S1]]*
-// CHECK-NEXT: [[ARR_CUR:%.*]] = getelementptr inbounds [[S1]]* [[ARR_BEGIN]], i{{.*}} 6
+// CHECK-NEXT: [[ARR_CUR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_BEGIN]], i{{.*}} 6
 // CHECK-NEXT: br label %[[ARR_LOOP:.*]]
 // CHECK:      {{.*}}[[ARR_LOOP]]{{.*}}
 // CHECK-NEXT: [[ARR_ELEMENTPAST:%.*]] = phi [[S1]]* [ [[ARR_CUR]], {{.*}} ], [ [[ARR_ELEMENT:%.*]], {{.*}} ]
-// CHECK-NEXT: [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]]* [[ARR_ELEMENTPAST]], i{{.*}} -1
+// CHECK-NEXT: [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_ELEMENTPAST]], i{{.*}} -1
 // CHECK-NEXT: invoke {{.*}} [[S1_DTOR]]([[S1]]* [[ARR_ELEMENT]])
 // CHECK:      [[ARR_DONE:%.*]] = icmp eq [[S1]]* [[ARR_ELEMENT]], [[ARR_BEGIN]]
 // CHECK-NEXT: br i1 [[ARR_DONE]], label %[[ARR_EXIT:.*]], label %[[ARR_LOOP]]
@@ -255,8 +256,8 @@ S1 arr_x[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
 // CHECK-NEXT: ret void
 // CHECK-NEXT: }
 // CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG:      store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC2]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG:      store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC2]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
 // CHECK-DEBUG:      @__kmpc_global_thread_num
 // CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i8* (i8*)* [[ARR_X_CTOR:@\.__kmpc_global_ctor_\..*]], i8* (i8*, i8*)* null, void (i8*)* [[ARR_X_DTOR:@\.__kmpc_global_dtor_\..*]])
 // CHECK-DEBUG:      define internal {{.*}}i8* [[ARR_X_CTOR]](i8*)
@@ -307,171 +308,171 @@ int main() {
 // CHECK:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i8* (i8*)* [[SM_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[SM_DTOR:@\.__kmpc_global_dtor_\..+]])
 // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
 // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
-// CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
-// CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+// CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+// CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
 // CHECK-NEXT: invoke {{.*}} [[SMAIN_CTOR:.*]]([[SMAIN]]* [[SM]], [[INT]] {{.*}}[[GS1_A]])
 // CHECK:      call {{.*}}void @__cxa_guard_release
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
 // CHECK-DEBUG-NEXT: [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
 // CHECK-DEBUG:      call {{.*}}i{{.*}} @__cxa_guard_acquire
 // CHECK-DEBUG:      call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
 // CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i8* (i8*)* [[SM_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[SM_DTOR:@\.__kmpc_global_dtor_\..+]])
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
 // CHECK-DEBUG:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
 // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
-// CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
-// CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+// CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+// CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
 // CHECK-DEBUG-NEXT: invoke {{.*}} [[SMAIN_CTOR:.*]]([[SMAIN]]* [[SM]], [[INT]] {{.*}}[[GS1_A]])
 // CHECK-DEBUG:      call {{.*}}void @__cxa_guard_release
 #pragma omp threadprivate(sm)
   // CHECK:      [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[STATIC_S]].cache.)
   // CHECK-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
-  // CHECK-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[STATIC_S_A:%.*]] = load [[INT]]* [[STATIC_S_A_ADDR]]
+  // CHECK-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]], [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[STATIC_S_A:%.*]] = load [[INT]], [[INT]]* [[STATIC_S_A_ADDR]]
   // CHECK-NEXT: store [[INT]] [[STATIC_S_A]], [[INT]]* [[RES_ADDR:[^,]+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC5]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC5]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
-  // CHECK-DEBUG-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[STATIC_S_A:%.*]] = load [[INT]]* [[STATIC_S_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]], [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[STATIC_S_A:%.*]] = load [[INT]], [[INT]]* [[STATIC_S_A_ADDR]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[STATIC_S_A]], [[INT]]* [[RES_ADDR:[^,]+]]
   Res = Static::s.a;
   // CHECK:      [[SM_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[SM]].cache.)
   // CHECK-NEXT: [[SM_ADDR:%.*]] = bitcast i8* [[SM_TEMP_ADDR]] to [[SMAIN]]*
-  // CHECK-NEXT: [[SM_A_ADDR:%.*]] = getelementptr inbounds [[SMAIN]]* [[SM_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[SM_A:%.*]] = load [[INT]]* [[SM_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[SM_A_ADDR:%.*]] = getelementptr inbounds [[SMAIN]], [[SMAIN]]* [[SM_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[SM_A:%.*]] = load [[INT]], [[INT]]* [[SM_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[SM_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC6]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC6]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[SM_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[SM_ADDR:%.*]] = bitcast i8* [[SM_TEMP_ADDR]] to [[SMAIN]]*
-  // CHECK-DEBUG-NEXT: [[SM_A_ADDR:%.*]] = getelementptr inbounds [[SMAIN]]* [[SM_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[SM_A:%.*]] = load [[INT]]* [[SM_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[SM_A_ADDR:%.*]] = getelementptr inbounds [[SMAIN]], [[SMAIN]]* [[SM_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[SM_A:%.*]] = load [[INT]], [[INT]]* [[SM_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[SM_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += sm.a;
   // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
   // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
-  // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS1_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC7]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC7]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
-  // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS1_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += gs1.a;
-  // CHECK:      [[GS2_A:%.*]] = load [[INT]]* getelementptr inbounds ([[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK:      [[GS2_A:%.*]] = load [[INT]], [[INT]]* getelementptr inbounds ([[S2]], [[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS2_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[GS2_A:%.*]] = load [[INT]]* getelementptr inbounds ([[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG:      [[GS2_A:%.*]] = load [[INT]], [[INT]]* getelementptr inbounds ([[S2]], [[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS2_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += gs2.a;
   // CHECK:      [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS3]].cache.)
   // CHECK-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
-  // CHECK-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[GS3_A:%.*]] = load [[INT]]* [[GS3_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]], [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[GS3_A:%.*]] = load [[INT]], [[INT]]* [[GS3_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS3_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC8]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC8]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
-  // CHECK-DEBUG-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[GS3_A:%.*]] = load [[INT]]* [[GS3_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]], [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[GS3_A:%.*]] = load [[INT]], [[INT]]* [[GS3_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS3_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += gs3.a;
   // CHECK:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ARR_X]].cache.)
   // CHECK-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
-  // CHECK-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
-  // CHECK-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
-  // CHECK-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]]* [[ARR_X_1_1_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]], [[INT]]* [[ARR_X_1_1_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC9]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC9]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
-  // CHECK-DEBUG-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
-  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
-  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]]* [[ARR_X_1_1_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]], [[INT]]* [[ARR_X_1_1_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += arr_x[1][1].a;
   // CHECK:      [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_INT_ST]].cache.)
   // CHECK-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
-  // CHECK-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]]* [[ST_INT_ST_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]], [[INT]]* [[ST_INT_ST_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC10]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC10]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
-  // CHECK-DEBUG-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]]* [[ST_INT_ST_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]], [[INT]]* [[ST_INT_ST_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += ST<int>::st;
   // CHECK:      [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_FLOAT_ST]].cache.)
   // CHECK-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
-  // CHECK-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float* [[ST_FLOAT_ST_ADDR]]
+  // CHECK-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float, float* [[ST_FLOAT_ST_ADDR]]
   // CHECK-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC11]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC11]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
-  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float* [[ST_FLOAT_ST_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float, float* [[ST_FLOAT_ST_ADDR]]
   // CHECK-DEBUG-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += static_cast<int>(ST<float>::st);
   // CHECK:      [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_S4_ST]].cache.)
   // CHECK-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
-  // CHECK-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]]* [[ST_S4_ST_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]], [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]], [[INT]]* [[ST_S4_ST_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_S4_ST_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC12]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC12]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
-  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]]* [[ST_S4_ST_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]], [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]], [[INT]]* [[ST_S4_ST_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_S4_ST_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += ST<S4>::st.a;
-  // CHECK:      [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK:      [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: ret [[INT]] [[RES]]
-  // CHECK-DEBUG:      [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG:      [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: ret [[INT]] [[RES]]
   return Res;
 }
@@ -480,20 +481,20 @@ int main() {
 // CHECK:      define internal {{.*}}i8* [[SM_CTOR]](i8*)
 // CHECK:      [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[DEFAULT_LOC]])
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[SMAIN]]*
 // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
 // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
-// CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
-// CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+// CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+// CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
 // CHECK-NEXT: call {{.*}} [[SMAIN_CTOR:@.+]]([[SMAIN]]* [[RES]], [[INT]] {{.*}}[[GS1_A]])
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-NEXT: ret i8* [[ARG]]
 // CHECK-NEXT: }
 // CHECK:      define {{.*}} [[SMAIN_CTOR]]([[SMAIN]]* {{.*}},
 // CHECK:      define internal {{.*}}void [[SM_DTOR]](i8*)
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[SMAIN]]*
 // CHECK-NEXT: call {{.*}} [[SMAIN_DTOR:@.+]]([[SMAIN]]* [[RES]])
 // CHECK-NEXT: ret void
@@ -501,20 +502,20 @@ int main() {
 // CHECK:      define {{.*}} [[SMAIN_DTOR]]([[SMAIN]]* {{.*}})
 // CHECK-DEBUG:      define internal {{.*}}i8* [[SM_CTOR]](i8*)
 // CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
 // CHECK-DEBUG-NEXT: [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
 // CHECK-DEBUG:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK-DEBUG:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[SMAIN]]*
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
 // CHECK-DEBUG:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
 // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
-// CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
-// CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+// CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+// CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
 // CHECK-DEBUG-NEXT: call {{.*}} [[SMAIN_CTOR:@.+]]([[SMAIN]]* [[RES]], [[INT]] {{.*}}[[GS1_A]])
-// CHECK-DEBUG:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG-NEXT: ret i8* [[ARG]]
 // CHECK-DEBUG-NEXT: }
 // CHECK-DEBUG:      define {{.*}} [[SMAIN_CTOR]]([[SMAIN]]* {{.*}},
@@ -534,136 +535,136 @@ int foobar() {
   // CHECK:      [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[DEFAULT_LOC]])
   // CHECK:      [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[STATIC_S]].cache.)
   // CHECK-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
-  // CHECK-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[STATIC_S_A:%.*]] = load [[INT]]* [[STATIC_S_A_ADDR]]
+  // CHECK-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]], [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[STATIC_S_A:%.*]] = load [[INT]], [[INT]]* [[STATIC_S_A_ADDR]]
   // CHECK-NEXT: store [[INT]] [[STATIC_S_A]], [[INT]]* [[RES_ADDR:[^,]+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC13]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC13]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC13]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC13]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
-  // CHECK-DEBUG-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[STATIC_S_A:%.*]] = load [[INT]]* [[STATIC_S_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]], [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[STATIC_S_A:%.*]] = load [[INT]], [[INT]]* [[STATIC_S_A_ADDR]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[STATIC_S_A]], [[INT]]* [[RES_ADDR:[^,]+]]
   Res = Static::s.a;
   // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
   // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
-  // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS1_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC14]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC14]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
-  // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS1_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += gs1.a;
-  // CHECK:      [[GS2_A:%.*]] = load [[INT]]* getelementptr inbounds ([[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK:      [[GS2_A:%.*]] = load [[INT]], [[INT]]* getelementptr inbounds ([[S2]], [[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS2_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[GS2_A:%.*]] = load [[INT]]* getelementptr inbounds ([[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG:      [[GS2_A:%.*]] = load [[INT]], [[INT]]* getelementptr inbounds ([[S2]], [[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS2_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += gs2.a;
   // CHECK:      [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS3]].cache.)
   // CHECK-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
-  // CHECK-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[GS3_A:%.*]] = load [[INT]]* [[GS3_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]], [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[GS3_A:%.*]] = load [[INT]], [[INT]]* [[GS3_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS3_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC15]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC15]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
-  // CHECK-DEBUG-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[GS3_A:%.*]] = load [[INT]]* [[GS3_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]], [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[GS3_A:%.*]] = load [[INT]], [[INT]]* [[GS3_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS3_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += gs3.a;
   // CHECK:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ARR_X]].cache.)
   // CHECK-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
-  // CHECK-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
-  // CHECK-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
-  // CHECK-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]]* [[ARR_X_1_1_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]], [[INT]]* [[ARR_X_1_1_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC16]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC16]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
-  // CHECK-DEBUG-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
-  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
-  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]]* [[ARR_X_1_1_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]], [[INT]]* [[ARR_X_1_1_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += arr_x[1][1].a;
   // CHECK:      [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_INT_ST]].cache.)
   // CHECK-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
-  // CHECK-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]]* [[ST_INT_ST_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]], [[INT]]* [[ST_INT_ST_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC17]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC17]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
-  // CHECK-DEBUG-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]]* [[ST_INT_ST_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]], [[INT]]* [[ST_INT_ST_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += ST<int>::st;
   // CHECK:      [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_FLOAT_ST]].cache.)
   // CHECK-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
-  // CHECK-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float* [[ST_FLOAT_ST_ADDR]]
+  // CHECK-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float, float* [[ST_FLOAT_ST_ADDR]]
   // CHECK-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC18]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC18]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
-  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float* [[ST_FLOAT_ST_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float, float* [[ST_FLOAT_ST_ADDR]]
   // CHECK-DEBUG-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += static_cast<int>(ST<float>::st);
   // CHECK:      [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_S4_ST]].cache.)
   // CHECK-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
-  // CHECK-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]]* [[ST_S4_ST_A_ADDR]]
-  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]], [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]], [[INT]]* [[ST_S4_ST_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_S4_ST_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC19]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC19]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
-  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
-  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]]* [[ST_S4_ST_A_ADDR]]
-  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]], [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]], [[INT]]* [[ST_S4_ST_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_S4_ST_A]]
   // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += ST<S4>::st.a;
-  // CHECK:      [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK:      [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: ret [[INT]] [[RES]]
-  // CHECK-DEBUG:      [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG:      [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-DEBUG-NEXT: ret [[INT]] [[RES]]
   return Res;
 }
@@ -672,24 +673,24 @@ int foobar() {
 // CHECK:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i8* (i8*)* [[ST_S4_ST_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[ST_S4_ST_DTOR:@\.__kmpc_global_dtor_\..+]])
 // CHECK:      define internal {{.*}}i8* [[ST_S4_ST_CTOR]](i8*)
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[S4]]*
 // CHECK-NEXT: call {{.*}} [[S4_CTOR:@.+]]([[S4]]* [[RES]], {{.*}} 23)
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-NEXT: ret i8* [[ARG]]
 // CHECK-NEXT: }
 // CHECK:      define {{.*}} [[S4_CTOR]]([[S4]]* {{.*}},
 // CHECK:      define internal {{.*}}void [[ST_S4_ST_DTOR]](i8*)
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
-// CHECK:      [[ARG:%.+]] = load i8** [[ARG_ADDR]]
+// CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[S4]]*
 // CHECK-NEXT: call {{.*}} [[S4_DTOR:@.+]]([[S4]]* [[RES]])
 // CHECK-NEXT: ret void
 // CHECK-NEXT: }
 // CHECK:      define {{.*}} [[S4_DTOR]]([[S4]]* {{.*}})
 // CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC20]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC20]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
 // CHECK-DEBUG:      @__kmpc_global_thread_num
 // CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i8* (i8*)* [[ST_S4_ST_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[ST_S4_ST_DTOR:@\.__kmpc_global_dtor_\..+]])
 // CHECK-DEBUG:      define internal {{.*}}i8* [[ST_S4_ST_CTOR]](i8*)
diff --git a/test/OpenMP/threadprivate_messages.cpp b/test/OpenMP/threadprivate_messages.cpp
index 27b36fbe024c..70ab5441aba0 100644
--- a/test/OpenMP/threadprivate_messages.cpp
+++ b/test/OpenMP/threadprivate_messages.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp=libiomp5 -ferror-limit 100 %s
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -emit-llvm -o - %s
 
 #pragma omp threadprivate // expected-error {{expected '(' after 'threadprivate'}}
 #pragma omp threadprivate( // expected-error {{expected identifier}} expected-error {{expected ')'}} expected-note {{to match this '('}}
@@ -17,7 +17,7 @@ int a; // expected-note {{'a' defined here}}
 
 #pragma omp threadprivate(a)
 #pragma omp threadprivate(u) // expected-error {{use of undeclared identifier 'u'}}
-#pragma omp threadprivate(d, a) // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'a'}}
+#pragma omp threadprivate(d, a)
 int foo() { // expected-note {{declared here}}
   static int l;
 #pragma omp threadprivate(l)) // expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
@@ -25,21 +25,20 @@ int foo() { // expected-note {{declared here}}
 }
 
 #pragma omp threadprivate (a) (
-// expected-error@-1 {{'#pragma omp threadprivate' must precede all references to variable 'a'}} expected-warning@-1 {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
-#pragma omp threadprivate (a) [ // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'a'}} expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
-#pragma omp threadprivate (a) { // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'a'}} expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
-#pragma omp threadprivate (a) ) // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'a'}} expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
-#pragma omp threadprivate (a) ] // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'a'}} expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
-#pragma omp threadprivate (a) } // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'a'}} expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
+// expected-warning@-1 {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
+#pragma omp threadprivate (a) [ // expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
+#pragma omp threadprivate (a) { // expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
+#pragma omp threadprivate (a) ) // expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
+#pragma omp threadprivate (a) ] // expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
+#pragma omp threadprivate (a) } // expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
 #pragma omp threadprivate a // expected-error {{expected '(' after 'threadprivate'}}
-#pragma omp threadprivate(d // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{'#pragma omp threadprivate' must precede all references to variable 'd'}}
-#pragma omp threadprivate(d)) // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'd'}} expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
+#pragma omp threadprivate(d // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp threadprivate(d)) // expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
 int x, y;
 #pragma omp threadprivate(x)) // expected-warning {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
 #pragma omp threadprivate(y)),
 // expected-warning@-1 {{extra tokens at the end of '#pragma omp threadprivate' are ignored}}
 #pragma omp threadprivate(a,d)
-// expected-error@-1 {{'#pragma omp threadprivate' must precede all references to variable 'a'}} expected-error@-1 {{'#pragma omp threadprivate' must precede all references to variable 'd'}}
 #pragma omp threadprivate(d.a) // expected-error {{expected identifier}}
 #pragma omp threadprivate((float)a) // expected-error {{expected unqualified-id}}
 int foa; // expected-note {{'foa' declared here}}
@@ -73,8 +72,8 @@ namespace ns {
 #pragma omp threadprivate (m)
 }
 #pragma omp threadprivate (m) // expected-error {{use of undeclared identifier 'm'}}
-#pragma omp threadprivate (ns::m) // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'ns::m'}}
-#pragma omp threadprivate (ns:m) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}} expected-error {{'#pragma omp threadprivate' must precede all references to variable 'ns::m'}}
+#pragma omp threadprivate (ns::m)
+#pragma omp threadprivate (ns:m) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
 
 const int h = 12;
 const volatile int i = 10;
@@ -104,10 +103,10 @@ int o; // expected-note {{candidate found by name lookup is 'o'}}
 namespace {
 int o; // expected-note {{candidate found by name lookup is '(anonymous namespace)::o'}}
 #pragma omp threadprivate (o)
-#pragma omp threadprivate (o) // expected-error {{'#pragma omp threadprivate' must precede all references to variable '(anonymous namespace)::o'}}
+#pragma omp threadprivate (o)
 }
 #pragma omp threadprivate (o) // expected-error {{reference to 'o' is ambiguous}}
-#pragma omp threadprivate (::o) // expected-error {{'#pragma omp threadprivate' must precede all references to variable 'o'}}
+#pragma omp threadprivate (::o)
 
 int main(int argc, char **argv) { // expected-note {{'argc' defined here}}
author	Dimitry Andric <dim@FreeBSD.org>	2015-05-27 18:47:56 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2015-05-27 18:47:56 +0000
commit	5e20cdd81c44a443562a09007668ffdf76c455af (patch)
tree	dbbd4047878da71c1a706e26ce05b4e7791b14cc /test/OpenMP
parent	d5f23b0b7528b5c3caed1ba14f897cc4aaa9e3c3 (diff)