Diffstat (limited to 'llvm/lib/Transforms/Instrumentation')
22 files changed, 21688 insertions, 0 deletions
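Everything in the patch below revolves around one shadow mapping: each 8-byte granule of application memory maps to one shadow byte via Shadow = (Mem >> Scale) + Offset (an OR instead of an ADD on targets where the power-of-two offset cannot overlap the shifted address). As a reader's aid, here is a minimal standalone sketch of that translation and of the check the pass inlines around each access; it is not part of the patch, kOffset is an arbitrary example value, and memToShadow/isPoisoned merely mirror the roles of AddressSanitizer::memToShadow() and createSlowPathCmp() defined below.

#include <cstdint>

// kDefaultShadowScale is 3: one shadow byte covers an 8-byte granule.
constexpr int kScale = 3;
constexpr uintptr_t kOffset = 0x7fff8000; // example only; the real pass picks it per target

// Role of AddressSanitizer::memToShadow(): Shadow = (Mem >> Scale) + Offset.
inline int8_t *memToShadow(uintptr_t Addr) {
  return reinterpret_cast<int8_t *>((Addr >> kScale) + kOffset);
}

// Shape of the inlined check for an AccessSize-byte access (1, 2, 4 or 8).
inline bool isPoisoned(uintptr_t Addr, unsigned AccessSize) {
  int8_t ShadowValue = *memToShadow(Addr);
  if (ShadowValue == 0)
    return false; // fast path: the entire granule is addressable
  if (AccessSize >= 8)
    return true; // a full-granule access tolerates no poisoning
  // Slow path (createSlowPathCmp): a shadow value k in 1..7 means only the
  // first k bytes of the granule are addressable; negative values encode
  // redzones and freed memory, hence the signed comparison.
  int8_t LastAccessedByte =
      static_cast<int8_t>((Addr & (8 - 1)) + AccessSize - 1);
  return LastAccessedByte >= ShadowValue;
}

When isPoisoned() would be true, the instrumented binary transfers to the matching __asan_report_* callback declared below, so each check compiles down to one shadow load, a compare, and a rarely-taken branch.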
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp new file mode 100644 index 000000000000..d92ee11c2e1a --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -0,0 +1,3337 @@ +//===- AddressSanitizer.cpp - memory error detector -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of AddressSanitizer, an address sanity checker. +// Details of the algorithm: +// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/AddressSanitizer.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Comdat.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/ASanStackFrameLayout.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iomanip> +#include <limits> +#include <memory> +#include <sstream> +#include <string> +#include <tuple> + +using namespace llvm; + +#define DEBUG_TYPE "asan" + +static const uint64_t kDefaultShadowScale = 3; +static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; +static const uint64_t 
kDefaultShadowOffset64 = 1ULL << 44; +static const uint64_t kDynamicShadowSentinel = + std::numeric_limits<uint64_t>::max(); +static const uint64_t kSmallX86_64ShadowOffsetBase = 0x7FFFFFFF; // < 2G. +static const uint64_t kSmallX86_64ShadowOffsetAlignMask = ~0xFFFULL; +static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000; +static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 44; +static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52; +static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000; +static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37; +static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36; +static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30; +static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46; +static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30; +static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46; +static const uint64_t kNetBSDKasan_ShadowOffset64 = 0xdfff900000000000; +static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40; +static const uint64_t kWindowsShadowOffset32 = 3ULL << 28; +static const uint64_t kEmscriptenShadowOffset = 0; + +static const uint64_t kMyriadShadowScale = 5; +static const uint64_t kMyriadMemoryOffset32 = 0x80000000ULL; +static const uint64_t kMyriadMemorySize32 = 0x20000000ULL; +static const uint64_t kMyriadTagShift = 29; +static const uint64_t kMyriadDDRTag = 4; +static const uint64_t kMyriadCacheBitMask32 = 0x40000000ULL; + +// The shadow memory space is dynamically allocated. +static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel; + +static const size_t kMinStackMallocSize = 1 << 6; // 64B +static const size_t kMaxStackMallocSize = 1 << 16; // 64K +static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; +static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; + +static const char *const kAsanModuleCtorName = "asan.module_ctor"; +static const char *const kAsanModuleDtorName = "asan.module_dtor"; +static const uint64_t kAsanCtorAndDtorPriority = 1; +// On Emscripten, the system needs more than one priority for constructors.
+static const uint64_t kAsanEmscriptenCtorAndDtorPriority = 50; +static const char *const kAsanReportErrorTemplate = "__asan_report_"; +static const char *const kAsanRegisterGlobalsName = "__asan_register_globals"; +static const char *const kAsanUnregisterGlobalsName = + "__asan_unregister_globals"; +static const char *const kAsanRegisterImageGlobalsName = + "__asan_register_image_globals"; +static const char *const kAsanUnregisterImageGlobalsName = + "__asan_unregister_image_globals"; +static const char *const kAsanRegisterElfGlobalsName = + "__asan_register_elf_globals"; +static const char *const kAsanUnregisterElfGlobalsName = + "__asan_unregister_elf_globals"; +static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; +static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; +static const char *const kAsanInitName = "__asan_init"; +static const char *const kAsanVersionCheckNamePrefix = + "__asan_version_mismatch_check_v"; +static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; +static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; +static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; +static const int kMaxAsanStackMallocSizeClass = 10; +static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_"; +static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_"; +static const char *const kAsanGenPrefix = "___asan_gen_"; +static const char *const kODRGenPrefix = "__odr_asan_gen_"; +static const char *const kSanCovGenPrefix = "__sancov_gen_"; +static const char *const kAsanSetShadowPrefix = "__asan_set_shadow_"; +static const char *const kAsanPoisonStackMemoryName = + "__asan_poison_stack_memory"; +static const char *const kAsanUnpoisonStackMemoryName = + "__asan_unpoison_stack_memory"; + +// ASan version script has __asan_* wildcard. Triple underscore prevents a +// linker (gold) warning about attempting to export a local symbol. +static const char *const kAsanGlobalsRegisteredFlagName = + "___asan_globals_registered"; + +static const char *const kAsanOptionDetectUseAfterReturn = + "__asan_option_detect_stack_use_after_return"; + +static const char *const kAsanShadowMemoryDynamicAddress = + "__asan_shadow_memory_dynamic_address"; + +static const char *const kAsanAllocaPoison = "__asan_alloca_poison"; +static const char *const kAsanAllocasUnpoison = "__asan_allocas_unpoison"; + +// Access sizes are powers of two: 1, 2, 4, 8, 16. +static const size_t kNumberOfAccessSizes = 5; + +static const unsigned kAllocaRzSize = 32; + +// Command-line flags. + +static cl::opt<bool> ClEnableKasan( + "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClRecover( + "asan-recover", + cl::desc("Enable recovery mode (continue-after-error)."), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClInsertVersionCheck( + "asan-guard-against-version-mismatch", + cl::desc("Guard against compiler/runtime version mismatch."), + cl::Hidden, cl::init(true)); + +// This flag may need to be replaced with -f[no-]asan-reads.
+static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", + cl::desc("instrument read instructions"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClInstrumentWrites( + "asan-instrument-writes", cl::desc("instrument write instructions"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClInstrumentAtomics( + "asan-instrument-atomics", + cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden, + cl::init(true)); + +static cl::opt<bool> ClAlwaysSlowPath( + "asan-always-slow-path", + cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> ClForceDynamicShadow( + "asan-force-dynamic-shadow", + cl::desc("Load shadow address into a local variable for each function"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> + ClWithIfunc("asan-with-ifunc", + cl::desc("Access dynamic shadow through an ifunc global on " + "platforms that support this"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClWithIfuncSuppressRemat( + "asan-with-ifunc-suppress-remat", + cl::desc("Suppress rematerialization of dynamic shadow address by passing " + "it through inline asm in prologue."), + cl::Hidden, cl::init(true)); + +// This flag limits the number of instructions to be instrumented +// in any given BB. Normally, this should be set to unlimited (INT_MAX), +// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily +// set it to 10000. +static cl::opt<int> ClMaxInsnsToInstrumentPerBB( + "asan-max-ins-per-bb", cl::init(10000), + cl::desc("maximal number of instructions to instrument in any given BB"), + cl::Hidden); + +// This flag may need to be replaced with -f[no]asan-stack. +static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"), + cl::Hidden, cl::init(true)); +static cl::opt<uint32_t> ClMaxInlinePoisoningSize( + "asan-max-inline-poisoning-size", + cl::desc( + "Inline shadow poisoning for blocks up to the given size in bytes."), + cl::Hidden, cl::init(64)); + +static cl::opt<bool> ClUseAfterReturn("asan-use-after-return", + cl::desc("Check stack-use-after-return"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args", + cl::desc("Create redzones for byval " + "arguments (extra copy " + "required)"), cl::Hidden, + cl::init(true)); + +static cl::opt<bool> ClUseAfterScope("asan-use-after-scope", + cl::desc("Check stack-use-after-scope"), + cl::Hidden, cl::init(false)); + +// This flag may need to be replaced with -f[no]asan-globals.
+static cl::opt<bool> ClGlobals("asan-globals", + cl::desc("Handle global objects"), cl::Hidden, + cl::init(true)); + +static cl::opt<bool> ClInitializers("asan-initialization-order", + cl::desc("Handle C++ initializer order"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClInvalidPointerPairs( + "asan-detect-invalid-pointer-pair", + cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> ClInvalidPointerCmp( + "asan-detect-invalid-pointer-cmp", + cl::desc("Instrument <, <=, >, >= with pointer operands"), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> ClInvalidPointerSub( + "asan-detect-invalid-pointer-sub", + cl::desc("Instrument - operations with pointer operands"), cl::Hidden, + cl::init(false)); + +static cl::opt<unsigned> ClRealignStack( + "asan-realign-stack", + cl::desc("Realign stack to the value of this flag (power of two)"), + cl::Hidden, cl::init(32)); + +static cl::opt<int> ClInstrumentationWithCallsThreshold( + "asan-instrumentation-with-call-threshold", + cl::desc( + "If the function being instrumented contains more than " + "this number of memory accesses, use callbacks instead of " + "inline checks (-1 means never use callbacks)."), + cl::Hidden, cl::init(7000)); + +static cl::opt<std::string> ClMemoryAccessCallbackPrefix( + "asan-memory-access-callback-prefix", + cl::desc("Prefix for memory access callbacks"), cl::Hidden, + cl::init("__asan_")); + +static cl::opt<bool> + ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas", + cl::desc("instrument dynamic allocas"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClSkipPromotableAllocas( + "asan-skip-promotable-allocas", + cl::desc("Do not instrument promotable allocas"), cl::Hidden, + cl::init(true)); + +// These flags allow changing the shadow mapping. +// The shadow mapping looks like +// Shadow = (Mem >> scale) + offset + +static cl::opt<int> ClMappingScale("asan-mapping-scale", + cl::desc("scale of asan shadow mapping"), + cl::Hidden, cl::init(0)); + +static cl::opt<uint64_t> + ClMappingOffset("asan-mapping-offset", + cl::desc("offset of asan shadow mapping [EXPERIMENTAL]"), + cl::Hidden, cl::init(0)); + +// Optimization flags. Not user visible, used mostly for testing +// and benchmarking the tool.
+ +static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClOptSameTemp( + "asan-opt-same-temp", cl::desc("Instrument the same temp just once"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClOptGlobals("asan-opt-globals", + cl::desc("Don't instrument scalar globals"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClOptStack( + "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClDynamicAllocaStack( + "asan-stack-dynamic-alloca", + cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden, + cl::init(true)); + +static cl::opt<uint32_t> ClForceExperiment( + "asan-force-experiment", + cl::desc("Force optimization experiment (for testing)"), cl::Hidden, + cl::init(0)); + +static cl::opt<bool> + ClUsePrivateAlias("asan-use-private-alias", + cl::desc("Use private aliases for global variables"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> + ClUseOdrIndicator("asan-use-odr-indicator", + cl::desc("Use odr indicators to improve ODR reporting"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> + ClUseGlobalsGC("asan-globals-live-support", + cl::desc("Use linker features to support dead " + "code stripping of globals"), + cl::Hidden, cl::init(true)); + +// This is on by default even though there is a bug in gold: +// https://sourceware.org/bugzilla/show_bug.cgi?id=19002 +static cl::opt<bool> + ClWithComdat("asan-with-comdat", + cl::desc("Place ASan constructors in comdat sections"), + cl::Hidden, cl::init(true)); + +// Debug flags. + +static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, + cl::init(0)); + +static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"), + cl::Hidden, cl::init(0)); + +static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden, + cl::desc("Debug func")); + +static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"), + cl::Hidden, cl::init(-1)); + +static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"), + cl::Hidden, cl::init(-1)); + +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumOptimizedAccessesToGlobalVar, + "Number of optimized accesses to global vars"); +STATISTIC(NumOptimizedAccessesToStackVar, + "Number of optimized accesses to stack vars"); + +namespace { + +/// This struct defines the shadow mapping using the rule: +/// shadow = (mem >> Scale) ADD-or-OR Offset. 
+/// If InGlobal is true, then +/// extern char __asan_shadow[]; +/// shadow = (mem >> Scale) + &__asan_shadow +struct ShadowMapping { + int Scale; + uint64_t Offset; + bool OrShadowOffset; + bool InGlobal; +}; + +} // end anonymous namespace + +static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, + bool IsKasan) { + bool IsAndroid = TargetTriple.isAndroid(); + bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS(); + bool IsFreeBSD = TargetTriple.isOSFreeBSD(); + bool IsNetBSD = TargetTriple.isOSNetBSD(); + bool IsPS4CPU = TargetTriple.isPS4CPU(); + bool IsLinux = TargetTriple.isOSLinux(); + bool IsPPC64 = TargetTriple.getArch() == Triple::ppc64 || + TargetTriple.getArch() == Triple::ppc64le; + bool IsSystemZ = TargetTriple.getArch() == Triple::systemz; + bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64; + bool IsMIPS32 = TargetTriple.isMIPS32(); + bool IsMIPS64 = TargetTriple.isMIPS64(); + bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb(); + bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64; + bool IsWindows = TargetTriple.isOSWindows(); + bool IsFuchsia = TargetTriple.isOSFuchsia(); + bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad; + bool IsEmscripten = TargetTriple.isOSEmscripten(); + + ShadowMapping Mapping; + + Mapping.Scale = IsMyriad ? kMyriadShadowScale : kDefaultShadowScale; + if (ClMappingScale.getNumOccurrences() > 0) { + Mapping.Scale = ClMappingScale; + } + + if (LongSize == 32) { + if (IsAndroid) + Mapping.Offset = kDynamicShadowSentinel; + else if (IsMIPS32) + Mapping.Offset = kMIPS32_ShadowOffset32; + else if (IsFreeBSD) + Mapping.Offset = kFreeBSD_ShadowOffset32; + else if (IsNetBSD) + Mapping.Offset = kNetBSD_ShadowOffset32; + else if (IsIOS) + Mapping.Offset = kDynamicShadowSentinel; + else if (IsWindows) + Mapping.Offset = kWindowsShadowOffset32; + else if (IsEmscripten) + Mapping.Offset = kEmscriptenShadowOffset; + else if (IsMyriad) { + uint64_t ShadowOffset = (kMyriadMemoryOffset32 + kMyriadMemorySize32 - + (kMyriadMemorySize32 >> Mapping.Scale)); + Mapping.Offset = ShadowOffset - (kMyriadMemoryOffset32 >> Mapping.Scale); + } + else + Mapping.Offset = kDefaultShadowOffset32; + } else { // LongSize == 64 + // Fuchsia is always PIE, which means that the beginning of the address + // space is always available. 
+ if (IsFuchsia) + Mapping.Offset = 0; + else if (IsPPC64) + Mapping.Offset = kPPC64_ShadowOffset64; + else if (IsSystemZ) + Mapping.Offset = kSystemZ_ShadowOffset64; + else if (IsFreeBSD && !IsMIPS64) + Mapping.Offset = kFreeBSD_ShadowOffset64; + else if (IsNetBSD) { + if (IsKasan) + Mapping.Offset = kNetBSDKasan_ShadowOffset64; + else + Mapping.Offset = kNetBSD_ShadowOffset64; + } else if (IsPS4CPU) + Mapping.Offset = kPS4CPU_ShadowOffset64; + else if (IsLinux && IsX86_64) { + if (IsKasan) + Mapping.Offset = kLinuxKasan_ShadowOffset64; + else + Mapping.Offset = (kSmallX86_64ShadowOffsetBase & + (kSmallX86_64ShadowOffsetAlignMask << Mapping.Scale)); + } else if (IsWindows && IsX86_64) { + Mapping.Offset = kWindowsShadowOffset64; + } else if (IsMIPS64) + Mapping.Offset = kMIPS64_ShadowOffset64; + else if (IsIOS) + Mapping.Offset = kDynamicShadowSentinel; + else if (IsAArch64) + Mapping.Offset = kAArch64_ShadowOffset64; + else + Mapping.Offset = kDefaultShadowOffset64; + } + + if (ClForceDynamicShadow) { + Mapping.Offset = kDynamicShadowSentinel; + } + + if (ClMappingOffset.getNumOccurrences() > 0) { + Mapping.Offset = ClMappingOffset; + } + + // OR-ing the shadow offset is more efficient (at least on x86) if the offset + // is a power of two, but on ppc64 we have to use add since the shadow + // offset is not necessarily 1/8-th of the address space. On SystemZ, + // we could OR the constant in a single instruction, but it's more + // efficient to load it once and use indexed addressing. + Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU && + !(Mapping.Offset & (Mapping.Offset - 1)) && + Mapping.Offset != kDynamicShadowSentinel; + bool IsAndroidWithIfuncSupport = + IsAndroid && !TargetTriple.isAndroidVersionLT(21); + Mapping.InGlobal = ClWithIfunc && IsAndroidWithIfuncSupport && IsArmOrThumb; + + return Mapping; +} + +static size_t RedzoneSizeForScale(int MappingScale) { + // Redzone used for stack and globals is at least 32 bytes. + // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. + return std::max(32U, 1U << MappingScale); +} + +static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) { + if (TargetTriple.isOSEmscripten()) { + return kAsanEmscriptenCtorAndDtorPriority; + } else { + return kAsanCtorAndDtorPriority; + } +} + +namespace { + +/// Module analysis for getting various metadata about the module. +class ASanGlobalsMetadataWrapperPass : public ModulePass { +public: + static char ID; + + ASanGlobalsMetadataWrapperPass() : ModulePass(ID) { + initializeASanGlobalsMetadataWrapperPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + GlobalsMD = GlobalsMetadata(M); + return false; + } + + StringRef getPassName() const override { + return "ASanGlobalsMetadataWrapperPass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + GlobalsMetadata &getGlobalsMD() { return GlobalsMD; } + +private: + GlobalsMetadata GlobalsMD; +}; + +char ASanGlobalsMetadataWrapperPass::ID = 0; + +/// AddressSanitizer: instrument the code in a module to find memory bugs. +struct AddressSanitizer { + AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, + bool CompileKernel = false, bool Recover = false, + bool UseAfterScope = false) + : UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) { + this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; + this->CompileKernel = + ClEnableKasan.getNumOccurrences() > 0 ?
ClEnableKasan : CompileKernel; + + C = &(M.getContext()); + LongSize = M.getDataLayout().getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + TargetTriple = Triple(M.getTargetTriple()); + + Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel); + } + + uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const { + uint64_t ArraySize = 1; + if (AI.isArrayAllocation()) { + const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize()); + assert(CI && "non-constant array size"); + ArraySize = CI->getZExtValue(); + } + Type *Ty = AI.getAllocatedType(); + uint64_t SizeInBytes = + AI.getModule()->getDataLayout().getTypeAllocSize(Ty); + return SizeInBytes * ArraySize; + } + + /// Check if we want (and can) handle this alloca. + bool isInterestingAlloca(const AllocaInst &AI); + + /// If it is an interesting memory access, return the PointerOperand + /// and set IsWrite/Alignment. Otherwise return nullptr. + /// MaybeMask is an output parameter for the mask Value, if we're looking at a + /// masked load/store. + Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, + uint64_t *TypeSize, unsigned *Alignment, + Value **MaybeMask = nullptr); + + void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, Instruction *I, + bool UseCalls, const DataLayout &DL); + void instrumentPointerComparisonOrSubtraction(Instruction *I); + void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, + Value *Addr, uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, uint32_t Exp); + void instrumentUnusualSizeOrAlignment(Instruction *I, + Instruction *InsertBefore, Value *Addr, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, + uint32_t Exp); + Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, uint32_t TypeSize); + Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr, + bool IsWrite, size_t AccessSizeIndex, + Value *SizeArgument, uint32_t Exp); + void instrumentMemIntrinsic(MemIntrinsic *MI); + Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); + bool instrumentFunction(Function &F, const TargetLibraryInfo *TLI); + bool maybeInsertAsanInitAtFunctionEntry(Function &F); + void maybeInsertDynamicShadowAtFunctionEntry(Function &F); + void markEscapedLocalAllocas(Function &F); + +private: + friend struct FunctionStackPoisoner; + + void initializeCallbacks(Module &M); + + bool LooksLikeCodeInBug11395(Instruction *I); + bool GlobalIsLinkerInitialized(GlobalVariable *G); + bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr, + uint64_t TypeSize) const; + + /// Helper to clean up per-function state. + struct FunctionStateRAII { + AddressSanitizer *Pass; + + FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) { + assert(Pass->ProcessedAllocas.empty() && + "last pass forgot to clear cache"); + assert(!Pass->LocalDynamicShadow); + } + + ~FunctionStateRAII() { + Pass->LocalDynamicShadow = nullptr; + Pass->ProcessedAllocas.clear(); + } + }; + + LLVMContext *C; + Triple TargetTriple; + int LongSize; + bool CompileKernel; + bool Recover; + bool UseAfterScope; + Type *IntptrTy; + ShadowMapping Mapping; + FunctionCallee AsanHandleNoReturnFunc; + FunctionCallee AsanPtrCmpFunction, AsanPtrSubFunction; + Constant *AsanShadowGlobal; + + // These arrays are indexed by AccessIsWrite, Experiment and log2(AccessSize).
+ FunctionCallee AsanErrorCallback[2][2][kNumberOfAccessSizes]; + FunctionCallee AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes]; + + // These arrays are indexed by AccessIsWrite and Experiment. + FunctionCallee AsanErrorCallbackSized[2][2]; + FunctionCallee AsanMemoryAccessCallbackSized[2][2]; + + FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset; + InlineAsm *EmptyAsm; + Value *LocalDynamicShadow = nullptr; + const GlobalsMetadata &GlobalsMD; + DenseMap<const AllocaInst *, bool> ProcessedAllocas; +}; + +class AddressSanitizerLegacyPass : public FunctionPass { +public: + static char ID; + + explicit AddressSanitizerLegacyPass(bool CompileKernel = false, + bool Recover = false, + bool UseAfterScope = false) + : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover), + UseAfterScope(UseAfterScope) { + initializeAddressSanitizerLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "AddressSanitizerFunctionPass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ASanGlobalsMetadataWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } + + bool runOnFunction(Function &F) override { + GlobalsMetadata &GlobalsMD = + getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover, + UseAfterScope); + return ASan.instrumentFunction(F, TLI); + } + +private: + bool CompileKernel; + bool Recover; + bool UseAfterScope; +}; + +class ModuleAddressSanitizer { +public: + ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, + bool CompileKernel = false, bool Recover = false, + bool UseGlobalsGC = true, bool UseOdrIndicator = false) + : GlobalsMD(*GlobalsMD), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + // Enable aliases as they should have no downside with ODR indicators. + UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias), + UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator), + // Not a typo: ClWithComdat is almost completely pointless without + // ClUseGlobalsGC (because then it only works on modules without + // globals, which are rare); it is a prerequisite for ClUseGlobalsGC; + // and both suffer from gold PR19002 for which the UseGlobalsGC constructor + // argument is designed as a workaround. Therefore, disable both + // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to + // do globals-gc. + UseCtorComdat(UseGlobalsGC && ClWithComdat) { + this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; + this->CompileKernel = + ClEnableKasan.getNumOccurrences() > 0 ?
ClEnableKasan : CompileKernel; + + C = &(M.getContext()); + int LongSize = M.getDataLayout().getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + TargetTriple = Triple(M.getTargetTriple()); + Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel); + } + + bool instrumentModule(Module &); + +private: + void initializeCallbacks(Module &M); + + bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat); + void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M, + ArrayRef<GlobalVariable *> ExtendedGlobals, + ArrayRef<Constant *> MetadataInitializers); + void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M, + ArrayRef<GlobalVariable *> ExtendedGlobals, + ArrayRef<Constant *> MetadataInitializers, + const std::string &UniqueModuleId); + void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M, + ArrayRef<GlobalVariable *> ExtendedGlobals, + ArrayRef<Constant *> MetadataInitializers); + void + InstrumentGlobalsWithMetadataArray(IRBuilder<> &IRB, Module &M, + ArrayRef<GlobalVariable *> ExtendedGlobals, + ArrayRef<Constant *> MetadataInitializers); + + GlobalVariable *CreateMetadataGlobal(Module &M, Constant *Initializer, + StringRef OriginalName); + void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata, + StringRef InternalSuffix); + IRBuilder<> CreateAsanModuleDtor(Module &M); + + bool ShouldInstrumentGlobal(GlobalVariable *G); + bool ShouldUseMachOGlobalsSection() const; + StringRef getGlobalMetadataSection() const; + void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName); + void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName); + size_t MinRedzoneSizeForGlobal() const { + return RedzoneSizeForScale(Mapping.Scale); + } + int GetAsanVersion(const Module &M) const; + + const GlobalsMetadata &GlobalsMD; + bool CompileKernel; + bool Recover; + bool UseGlobalsGC; + bool UsePrivateAlias; + bool UseOdrIndicator; + bool UseCtorComdat; + Type *IntptrTy; + LLVMContext *C; + Triple TargetTriple; + ShadowMapping Mapping; + FunctionCallee AsanPoisonGlobals; + FunctionCallee AsanUnpoisonGlobals; + FunctionCallee AsanRegisterGlobals; + FunctionCallee AsanUnregisterGlobals; + FunctionCallee AsanRegisterImageGlobals; + FunctionCallee AsanUnregisterImageGlobals; + FunctionCallee AsanRegisterElfGlobals; + FunctionCallee AsanUnregisterElfGlobals; + + Function *AsanCtorFunction = nullptr; + Function *AsanDtorFunction = nullptr; +}; + +class ModuleAddressSanitizerLegacyPass : public ModulePass { +public: + static char ID; + + explicit ModuleAddressSanitizerLegacyPass(bool CompileKernel = false, + bool Recover = false, + bool UseGlobalGC = true, + bool UseOdrIndicator = false) + : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover), + UseGlobalGC(UseGlobalGC), UseOdrIndicator(UseOdrIndicator) { + initializeModuleAddressSanitizerLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "ModuleAddressSanitizer"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ASanGlobalsMetadataWrapperPass>(); + } + + bool runOnModule(Module &M) override { + GlobalsMetadata &GlobalsMD = + getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD(); + ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover, + UseGlobalGC, UseOdrIndicator); + return ASanModule.instrumentModule(M); + } + +private: + bool CompileKernel; + bool Recover; + bool UseGlobalGC; + bool UseOdrIndicator; +}; + +// Stack poisoning does not play well with 
exception handling. +// When an exception is thrown, we essentially bypass the code +// that unpoisons the stack. This is why the run-time library has +// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire +// stack in the interceptor. This however does not work inside the +// actual function which catches the exception, most likely because the +// compiler hoists the load of the shadow value somewhere too high. +// This causes asan to report a non-existing bug on 453.povray. +// It sounds like an LLVM bug. +struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { + Function &F; + AddressSanitizer &ASan; + DIBuilder DIB; + LLVMContext *C; + Type *IntptrTy; + Type *IntptrPtrTy; + ShadowMapping Mapping; + + SmallVector<AllocaInst *, 16> AllocaVec; + SmallVector<AllocaInst *, 16> StaticAllocasToMoveUp; + SmallVector<Instruction *, 8> RetVec; + unsigned StackAlignment; + + FunctionCallee AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1], + AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1]; + FunctionCallee AsanSetShadowFunc[0x100] = {}; + FunctionCallee AsanPoisonStackMemoryFunc, AsanUnpoisonStackMemoryFunc; + FunctionCallee AsanAllocaPoisonFunc, AsanAllocasUnpoisonFunc; + + // Stores the place and arguments of a poisoning/unpoisoning call for an alloca. + struct AllocaPoisonCall { + IntrinsicInst *InsBefore; + AllocaInst *AI; + uint64_t Size; + bool DoPoison; + }; + SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec; + SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec; + bool HasUntracedLifetimeIntrinsic = false; + + SmallVector<AllocaInst *, 1> DynamicAllocaVec; + SmallVector<IntrinsicInst *, 1> StackRestoreVec; + AllocaInst *DynamicAllocaLayout = nullptr; + IntrinsicInst *LocalEscapeCall = nullptr; + + // Maps a Value to the AllocaInst from which the Value originated. + using AllocaForValueMapTy = DenseMap<Value *, AllocaInst *>; + AllocaForValueMapTy AllocaForValue; + + bool HasNonEmptyInlineAsm = false; + bool HasReturnsTwiceCall = false; + std::unique_ptr<CallInst> EmptyInlineAsm; + + FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) + : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false), + C(ASan.C), IntptrTy(ASan.IntptrTy), + IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping), + StackAlignment(1 << Mapping.Scale), + EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {} + + bool runOnFunction() { + if (!ClStack) return false; + + if (ClRedzoneByvalArgs) + copyArgsPassedByValToAllocas(); + + // Collect alloca, ret, lifetime instructions etc. + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); + + if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false; + + initializeCallbacks(*F.getParent()); + + if (HasUntracedLifetimeIntrinsic) { + // If there are lifetime intrinsics which couldn't be traced back to an + // alloca, we may not know exactly when a variable enters scope, and + // therefore should "fail safe" by not poisoning them. + StaticAllocaPoisonCallVec.clear(); + DynamicAllocaPoisonCallVec.clear(); + } + + processDynamicAllocas(); + processStaticAllocas(); + + if (ClDebugStack) { + LLVM_DEBUG(dbgs() << F); + } + return true; + } + + // Arguments marked with the "byval" attribute are implicitly copied without + // using an alloca instruction. To produce redzones for those arguments, we + // copy them a second time into memory allocated with an alloca instruction.
+ void copyArgsPassedByValToAllocas(); + + // Finds all Alloca instructions and puts + // poisoned red zones around all of them. + // Then unpoisons everything back before the function returns. + void processStaticAllocas(); + void processDynamicAllocas(); + + void createDynamicAllocasInitStorage(); + + // ----------------------- Visitors. + /// Collect all Ret instructions. + void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); } + + /// Collect all Resume instructions. + void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); } + + /// Collect all CleanupReturnInst instructions. + void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); } + + void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore, + Value *SavedStack) { + IRBuilder<> IRB(InstBefore); + Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy); + // When we insert __asan_allocas_unpoison before @llvm.stackrestore, we + // need to adjust the extracted SP to compute the address of the most recent + // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for + // this purpose. + if (!isa<ReturnInst>(InstBefore)) { + Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, + {IntptrTy}); + + Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {}); + + DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy), + DynamicAreaOffset); + } + + IRB.CreateCall( + AsanAllocasUnpoisonFunc, + {IRB.CreateLoad(IntptrTy, DynamicAllocaLayout), DynamicAreaPtr}); + } + + // Unpoison dynamic alloca redzones. + void unpoisonDynamicAllocas() { + for (auto &Ret : RetVec) + unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout); + + for (auto &StackRestoreInst : StackRestoreVec) + unpoisonDynamicAllocasBeforeInst(StackRestoreInst, + StackRestoreInst->getOperand(0)); + } + + // Deploy and poison redzones around a dynamic alloca call. To do this, we + // should replace this call with another one with changed parameters and + // replace all its uses with the new address, so + // addr = alloca type, old_size, align + // is replaced by + // new_size = (old_size + additional_size) * sizeof(type) + // tmp = alloca i8, new_size, max(align, 32) + // addr = tmp + 32 (first 32 bytes are for the left redzone). + // Additional_size is added to make the new memory allocation contain not only + // the requested memory, but also the left, partial and right redzones. + void handleDynamicAllocaCall(AllocaInst *AI); + + /// Collect Alloca instructions we want (and can) handle. + void visitAllocaInst(AllocaInst &AI) { + if (!ASan.isInterestingAlloca(AI)) { + if (AI.isStaticAlloca()) { + // Skip over allocas that are present *before* the first instrumented + // alloca; we don't want to move those around. + if (AllocaVec.empty()) + return; + + StaticAllocasToMoveUp.push_back(&AI); + } + return; + } + + StackAlignment = std::max(StackAlignment, AI.getAlignment()); + if (!AI.isStaticAlloca()) + DynamicAllocaVec.push_back(&AI); + else + AllocaVec.push_back(&AI); + } + + /// Collect lifetime intrinsic calls to check for use-after-scope + /// errors. + void visitIntrinsicInst(IntrinsicInst &II) { + Intrinsic::ID ID = II.getIntrinsicID(); + if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II); + if (ID == Intrinsic::localescape) LocalEscapeCall = &II; + if (!ASan.UseAfterScope) + return; + if (!II.isLifetimeStartOrEnd()) + return; + // Found a lifetime intrinsic; add ASan instrumentation if necessary.
+ auto *Size = cast<ConstantInt>(II.getArgOperand(0)); + // If the size argument is undefined, don't do anything. + if (Size->isMinusOne()) return; + // Check that size doesn't saturate uint64_t and can + // be stored in IntptrTy. + const uint64_t SizeValue = Size->getValue().getLimitedValue(); + if (SizeValue == ~0ULL || + !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) + return; + // Find the alloca instruction that corresponds to the llvm.lifetime argument. + AllocaInst *AI = + llvm::findAllocaForValue(II.getArgOperand(1), AllocaForValue); + if (!AI) { + HasUntracedLifetimeIntrinsic = true; + return; + } + // We're interested only in allocas we can handle. + if (!ASan.isInterestingAlloca(*AI)) + return; + bool DoPoison = (ID == Intrinsic::lifetime_end); + AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison}; + if (AI->isStaticAlloca()) + StaticAllocaPoisonCallVec.push_back(APC); + else if (ClInstrumentDynamicAllocas) + DynamicAllocaPoisonCallVec.push_back(APC); + } + + void visitCallSite(CallSite CS) { + Instruction *I = CS.getInstruction(); + if (CallInst *CI = dyn_cast<CallInst>(I)) { + HasNonEmptyInlineAsm |= CI->isInlineAsm() && + !CI->isIdenticalTo(EmptyInlineAsm.get()) && + I != ASan.LocalDynamicShadow; + HasReturnsTwiceCall |= CI->canReturnTwice(); + } + } + + // ---------------------- Helpers. + void initializeCallbacks(Module &M); + + // Copies bytes from ShadowBytes into shadow memory for indexes where + // ShadowMask is not zero. If ShadowMask[i] is zero, we assume that + // ShadowBytes[i] is constantly zero and doesn't need to be overwritten. + void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes, + IRBuilder<> &IRB, Value *ShadowBase); + void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes, + size_t Begin, size_t End, IRBuilder<> &IRB, + Value *ShadowBase); + void copyToShadowInline(ArrayRef<uint8_t> ShadowMask, + ArrayRef<uint8_t> ShadowBytes, size_t Begin, + size_t End, IRBuilder<> &IRB, Value *ShadowBase); + + void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison); + + Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L, + bool Dynamic); + PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue, + Instruction *ThenTerm, Value *ValueIfFalse); +}; + +} // end anonymous namespace + +void LocationMetadata::parse(MDNode *MDN) { + assert(MDN->getNumOperands() == 3); + MDString *DIFilename = cast<MDString>(MDN->getOperand(0)); + Filename = DIFilename->getString(); + LineNo = mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue(); + ColumnNo = + mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue(); +} + +// FIXME: It would be cleaner to attach the relevant metadata directly to the +// globals we want to sanitize and read it on each pass over a function, +// instead of reading module-level metadata up front. +GlobalsMetadata::GlobalsMetadata(Module &M) { + NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals"); + if (!Globals) + return; + for (auto MDN : Globals->operands()) { + // Metadata node contains the global and the fields of "Entry". + assert(MDN->getNumOperands() == 5); + auto *V = mdconst::extract_or_null<Constant>(MDN->getOperand(0)); + // The optimizer may optimize away a global entirely. + if (!V) + continue; + auto *StrippedV = V->stripPointerCasts(); + auto *GV = dyn_cast<GlobalVariable>(StrippedV); + if (!GV) + continue; + // We can already have an entry for GV if it was merged with another + // global.
+ Entry &E = Entries[GV]; + if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1))) + E.SourceLoc.parse(Loc); + if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2))) + E.Name = Name->getString(); + ConstantInt *IsDynInit = mdconst::extract<ConstantInt>(MDN->getOperand(3)); + E.IsDynInit |= IsDynInit->isOne(); + ConstantInt *IsBlacklisted = + mdconst::extract<ConstantInt>(MDN->getOperand(4)); + E.IsBlacklisted |= IsBlacklisted->isOne(); + } +} + +AnalysisKey ASanGlobalsMetadataAnalysis::Key; + +GlobalsMetadata ASanGlobalsMetadataAnalysis::run(Module &M, + ModuleAnalysisManager &AM) { + return GlobalsMetadata(M); +} + +AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, bool Recover, + bool UseAfterScope) + : CompileKernel(CompileKernel), Recover(Recover), + UseAfterScope(UseAfterScope) {} + +PreservedAnalyses AddressSanitizerPass::run(Function &F, + AnalysisManager<Function> &AM) { + auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); + auto &MAM = MAMProxy.getManager(); + Module &M = *F.getParent(); + if (auto *R = MAM.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) { + const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F); + AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope); + if (Sanitizer.instrumentFunction(F, TLI)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); + } + + report_fatal_error( + "The ASanGlobalsMetadataAnalysis is required to run before " + "AddressSanitizer can run"); + return PreservedAnalyses::all(); +} + +ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(bool CompileKernel, + bool Recover, + bool UseGlobalGC, + bool UseOdrIndicator) + : CompileKernel(CompileKernel), Recover(Recover), UseGlobalGC(UseGlobalGC), + UseOdrIndicator(UseOdrIndicator) {} + +PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M, + AnalysisManager<Module> &AM) { + GlobalsMetadata &GlobalsMD = AM.getResult<ASanGlobalsMetadataAnalysis>(M); + ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover, + UseGlobalGC, UseOdrIndicator); + if (Sanitizer.instrumentModule(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +INITIALIZE_PASS(ASanGlobalsMetadataWrapperPass, "asan-globals-md", + "Read metadata to mark which globals should be instrumented " + "when running ASan.", + false, true) + +char AddressSanitizerLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN( + AddressSanitizerLegacyPass, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, + false) +INITIALIZE_PASS_DEPENDENCY(ASanGlobalsMetadataWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END( + AddressSanitizerLegacyPass, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, + false) + +FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel, + bool Recover, + bool UseAfterScope) { + assert(!CompileKernel || Recover); + return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope); +} + +char ModuleAddressSanitizerLegacyPass::ID = 0; + +INITIALIZE_PASS( + ModuleAddressSanitizerLegacyPass, "asan-module", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs." 
+ "ModulePass", + false, false) + +ModulePass *llvm::createModuleAddressSanitizerLegacyPassPass( + bool CompileKernel, bool Recover, bool UseGlobalsGC, bool UseOdrIndicator) { + assert(!CompileKernel || Recover); + return new ModuleAddressSanitizerLegacyPass(CompileKernel, Recover, + UseGlobalsGC, UseOdrIndicator); +} + +static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { + size_t Res = countTrailingZeros(TypeSize / 8); + assert(Res < kNumberOfAccessSizes); + return Res; +} + +/// Create a global describing a source location. +static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M, + LocationMetadata MD) { + Constant *LocData[] = { + createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix), + ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo), + ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo), + }; + auto LocStruct = ConstantStruct::getAnon(LocData); + auto GV = new GlobalVariable(M, LocStruct->getType(), true, + GlobalValue::PrivateLinkage, LocStruct, + kAsanGenPrefix); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + return GV; +} + +/// Check if \p G has been created by a trusted compiler pass. +static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) { + // Do not instrument @llvm.global_ctors, @llvm.used, etc. + if (G->getName().startswith("llvm.")) + return true; + + // Do not instrument asan globals. + if (G->getName().startswith(kAsanGenPrefix) || + G->getName().startswith(kSanCovGenPrefix) || + G->getName().startswith(kODRGenPrefix)) + return true; + + // Do not instrument gcov counter arrays. + if (G->getName() == "__llvm_gcov_ctr") + return true; + + return false; +} + +Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { + // Shadow >> scale + Shadow = IRB.CreateLShr(Shadow, Mapping.Scale); + if (Mapping.Offset == 0) return Shadow; + // (Shadow >> scale) | offset + Value *ShadowBase; + if (LocalDynamicShadow) + ShadowBase = LocalDynamicShadow; + else + ShadowBase = ConstantInt::get(IntptrTy, Mapping.Offset); + if (Mapping.OrShadowOffset) + return IRB.CreateOr(Shadow, ShadowBase); + else + return IRB.CreateAdd(Shadow, ShadowBase); +} + +// Instrument memset/memmove/memcpy +void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { + IRBuilder<> IRB(MI); + if (isa<MemTransferInst>(MI)) { + IRB.CreateCall( + isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy, + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); + } else if (isa<MemSetInst>(MI)) { + IRB.CreateCall( + AsanMemset, + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); + } + MI->eraseFromParent(); +} + +/// Check if we want (and can) handle this alloca. +bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { + auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI); + + if (PreviouslySeenAllocaInfo != ProcessedAllocas.end()) + return PreviouslySeenAllocaInfo->getSecond(); + + bool IsInteresting = + (AI.getAllocatedType()->isSized() && + // alloca() may be called with 0 size, ignore it. + ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(AI) > 0) && + // We are only interested in allocas not promotable to registers. + // Promotable allocas are common under -O0. 
+ (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && + // inalloca allocas are not treated as static, and we don't want + // dynamic alloca instrumentation for them either. + !AI.isUsedWithInAlloca() && + // swifterror allocas are register promoted by ISel + !AI.isSwiftError()); + + ProcessedAllocas[&AI] = IsInteresting; + return IsInteresting; +} + +Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, + bool *IsWrite, + uint64_t *TypeSize, + unsigned *Alignment, + Value **MaybeMask) { + // Skip memory accesses inserted by another instrumentation. + if (I->hasMetadata("nosanitize")) return nullptr; + + // Do not instrument the load fetching the dynamic shadow address. + if (LocalDynamicShadow == I) + return nullptr; + + Value *PtrOperand = nullptr; + const DataLayout &DL = I->getModule()->getDataLayout(); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (!ClInstrumentReads) return nullptr; + *IsWrite = false; + *TypeSize = DL.getTypeStoreSizeInBits(LI->getType()); + *Alignment = LI->getAlignment(); + PtrOperand = LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (!ClInstrumentWrites) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()); + *Alignment = SI->getAlignment(); + PtrOperand = SI->getPointerOperand(); + } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + if (!ClInstrumentAtomics) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType()); + *Alignment = 0; + PtrOperand = RMW->getPointerOperand(); + } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { + if (!ClInstrumentAtomics) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType()); + *Alignment = 0; + PtrOperand = XCHG->getPointerOperand(); + } else if (auto CI = dyn_cast<CallInst>(I)) { + auto *F = dyn_cast<Function>(CI->getCalledValue()); + if (F && (F->getName().startswith("llvm.masked.load.") || + F->getName().startswith("llvm.masked.store."))) { + unsigned OpOffset = 0; + if (F->getName().startswith("llvm.masked.store.")) { + if (!ClInstrumentWrites) + return nullptr; + // Masked store has an initial operand for the value. + OpOffset = 1; + *IsWrite = true; + } else { + if (!ClInstrumentReads) + return nullptr; + *IsWrite = false; + } + + auto BasePtr = CI->getOperand(0 + OpOffset); + auto Ty = cast<PointerType>(BasePtr->getType())->getElementType(); + *TypeSize = DL.getTypeStoreSizeInBits(Ty); + if (auto AlignmentConstant = + dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) + *Alignment = (unsigned)AlignmentConstant->getZExtValue(); + else + *Alignment = 1; // No alignment guarantees. We probably got Undef. + if (MaybeMask) + *MaybeMask = CI->getOperand(2 + OpOffset); + PtrOperand = BasePtr; + } + } + + if (PtrOperand) { + // Do not instrument accesses from different address spaces; we cannot deal + // with them. + Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType()); + if (PtrTy->getPointerAddressSpace() != 0) + return nullptr; + + // Ignore swifterror addresses. + // swifterror memory addresses are mem2reg promoted by instruction + // selection. As such they cannot have regular uses like an instrumentation + // function and it makes no sense to track them as memory. + if (PtrOperand->isSwiftError()) + return nullptr; + } + + // Treat memory accesses to promotable allocas as non-interesting since they + // will not cause memory violations.
This greatly speeds up the instrumented + // executable at -O0. + if (ClSkipPromotableAllocas) + if (auto AI = dyn_cast_or_null<AllocaInst>(PtrOperand)) + return isInterestingAlloca(*AI) ? AI : nullptr; + + return PtrOperand; +} + +static bool isPointerOperand(Value *V) { + return V->getType()->isPointerTy() || isa<PtrToIntInst>(V); +} + +// This is a rough heuristic; it may cause both false positives and +// false negatives. The proper implementation requires cooperation with +// the frontend. +static bool isInterestingPointerComparison(Instruction *I) { + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) { + if (!Cmp->isRelational()) + return false; + } else { + return false; + } + return isPointerOperand(I->getOperand(0)) && + isPointerOperand(I->getOperand(1)); +} + +// This is a rough heuristic; it may cause both false positives and +// false negatives. The proper implementation requires cooperation with +// the frontend. +static bool isInterestingPointerSubtraction(Instruction *I) { + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + if (BO->getOpcode() != Instruction::Sub) + return false; + } else { + return false; + } + return isPointerOperand(I->getOperand(0)) && + isPointerOperand(I->getOperand(1)); +} + +bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { + // If a global variable does not have dynamic initialization we don't + // have to instrument it. However, if a global does not have an initializer + // at all, we assume it has a dynamic initializer (in another TU). + // + // FIXME: Metadata should be attached directly to the global instead + // of being added to llvm.asan.globals. + return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit; +} + +void AddressSanitizer::instrumentPointerComparisonOrSubtraction( + Instruction *I) { + IRBuilder<> IRB(I); + FunctionCallee F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction; + Value *Param[2] = {I->getOperand(0), I->getOperand(1)}; + for (Value *&i : Param) { + if (i->getType()->isPointerTy()) + i = IRB.CreatePointerCast(i, IntptrTy); + } + IRB.CreateCall(F, Param); +} + +static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I, + Instruction *InsertBefore, Value *Addr, + unsigned Alignment, unsigned Granularity, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, + uint32_t Exp) { + // Instrument a 1-, 2-, 4-, 8-, or 16-byte access with one check + // if the data is properly aligned.
+ if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || + TypeSize == 128) && + (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8)) + return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite, + nullptr, UseCalls, Exp); + Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize, + IsWrite, nullptr, UseCalls, Exp); +} + +static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, + const DataLayout &DL, Type *IntptrTy, + Value *Mask, Instruction *I, + Value *Addr, unsigned Alignment, + unsigned Granularity, uint32_t TypeSize, + bool IsWrite, Value *SizeArgument, + bool UseCalls, uint32_t Exp) { + auto *VTy = cast<PointerType>(Addr->getType())->getElementType(); + uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType()); + unsigned Num = VTy->getVectorNumElements(); + auto Zero = ConstantInt::get(IntptrTy, 0); + for (unsigned Idx = 0; Idx < Num; ++Idx) { + Value *InstrumentedAddress = nullptr; + Instruction *InsertBefore = I; + if (auto *Vector = dyn_cast<ConstantVector>(Mask)) { + // dyn_cast as we might get UndefValue + if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) { + if (Masked->isZero()) + // Mask is constant false, so no instrumentation needed. + continue; + // If we have a true or undef value, fall through to doInstrumentAddress + // with InsertBefore == I + } + } else { + IRBuilder<> IRB(I); + Value *MaskElem = IRB.CreateExtractElement(Mask, Idx); + Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false); + InsertBefore = ThenTerm; + } + + IRBuilder<> IRB(InsertBefore); + InstrumentedAddress = + IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)}); + doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment, + Granularity, ElemTypeSize, IsWrite, SizeArgument, + UseCalls, Exp); + } +} + +void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, + Instruction *I, bool UseCalls, + const DataLayout &DL) { + bool IsWrite = false; + unsigned Alignment = 0; + uint64_t TypeSize = 0; + Value *MaybeMask = nullptr; + Value *Addr = + isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment, &MaybeMask); + assert(Addr); + + // Optimization experiments. + // The experiments can be used to evaluate potential optimizations that remove + // instrumentation (assess false negatives). Instead of completely removing + // some instrumentation, you set Exp to a non-zero value (mask of optimization + // experiments that want to remove instrumentation of this instruction). + // If Exp is non-zero, this pass will emit special calls into runtime + // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls + // make runtime terminate the program in a special way (with a different + // exit status). Then you run the new compiler on a buggy corpus, collect + // the special terminations (ideally, you don't see them at all -- no false + // negatives) and make the decision on the optimization. + uint32_t Exp = ClForceExperiment; + + if (ClOpt && ClOptGlobals) { + // If initialization order checking is disabled, a simple access to a + // dynamically initialized global is always valid. + GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL)); + if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) && + isSafeAccess(ObjSizeVis, Addr, TypeSize)) { + NumOptimizedAccessesToGlobalVar++; + return; + } + } + + if (ClOpt && ClOptStack) { + // A direct inbounds access to a stack variable is always valid. 
+    if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
+        isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
+      NumOptimizedAccessesToStackVar++;
+      return;
+    }
+  }
+
+  if (IsWrite)
+    NumInstrumentedWrites++;
+  else
+    NumInstrumentedReads++;
+
+  unsigned Granularity = 1 << Mapping.Scale;
+  if (MaybeMask) {
+    instrumentMaskedLoadOrStore(this, DL, IntptrTy, MaybeMask, I, Addr,
+                                Alignment, Granularity, TypeSize, IsWrite,
+                                nullptr, UseCalls, Exp);
+  } else {
+    doInstrumentAddress(this, I, I, Addr, Alignment, Granularity, TypeSize,
+                        IsWrite, nullptr, UseCalls, Exp);
+  }
+}
+
+Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
+                                                 Value *Addr, bool IsWrite,
+                                                 size_t AccessSizeIndex,
+                                                 Value *SizeArgument,
+                                                 uint32_t Exp) {
+  IRBuilder<> IRB(InsertBefore);
+  Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp);
+  CallInst *Call = nullptr;
+  if (SizeArgument) {
+    if (Exp == 0)
+      Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0],
+                            {Addr, SizeArgument});
+    else
+      Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1],
+                            {Addr, SizeArgument, ExpVal});
+  } else {
+    if (Exp == 0)
+      Call =
+          IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr);
+    else
+      Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex],
+                            {Addr, ExpVal});
+  }
+
+  // We don't do Call->setDoesNotReturn() because the BB already has an
+  // UnreachableInst at the end.
+  // This EmptyAsm is required to avoid callback merge.
+  IRB.CreateCall(EmptyAsm, {});
+  return Call;
+}
+
+Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+                                           Value *ShadowValue,
+                                           uint32_t TypeSize) {
+  size_t Granularity = static_cast<size_t>(1) << Mapping.Scale;
+  // Addr & (Granularity - 1)
+  Value *LastAccessedByte =
+      IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+  // (Addr & (Granularity - 1)) + size - 1
+  if (TypeSize / 8 > 1)
+    LastAccessedByte = IRB.CreateAdd(
+        LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
+  // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+  LastAccessedByte =
+      IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
+  // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+  return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
+void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
+                                         Instruction *InsertBefore, Value *Addr,
+                                         uint32_t TypeSize, bool IsWrite,
+                                         Value *SizeArgument, bool UseCalls,
+                                         uint32_t Exp) {
+  bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
+
+  IRBuilder<> IRB(InsertBefore);
+  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+  size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+
+  if (UseCalls) {
+    if (Exp == 0)
+      IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
+                     AddrLong);
+    else
+      IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
+                     {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+    return;
+  }
+
+  if (IsMyriad) {
+    // Strip the cache bit and do range check.
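+    // (On Myriad, only DDR memory is shadowed: addresses whose tag is not
+    // kMyriadDDRTag branch around the shadow check emitted below and execute
+    // the original access unchecked.)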
+    // AddrLong &= ~kMyriadCacheBitMask32
+    AddrLong = IRB.CreateAnd(AddrLong, ~kMyriadCacheBitMask32);
+    // Tag = AddrLong >> kMyriadTagShift
+    Value *Tag = IRB.CreateLShr(AddrLong, kMyriadTagShift);
+    // Tag == kMyriadDDRTag
+    Value *TagCheck =
+        IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
+
+    Instruction *TagCheckTerm =
+        SplitBlockAndInsertIfThen(TagCheck, InsertBefore, false,
+                                  MDBuilder(*C).createBranchWeights(1, 100000));
+    assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
+    IRB.SetInsertPoint(TagCheckTerm);
+    InsertBefore = TagCheckTerm;
+  }
+
+  Type *ShadowTy =
+      IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
+  Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+  Value *ShadowPtr = memToShadow(AddrLong, IRB);
+  Value *CmpVal = Constant::getNullValue(ShadowTy);
+  Value *ShadowValue =
+      IRB.CreateLoad(ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+
+  Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
+  size_t Granularity = 1ULL << Mapping.Scale;
+  Instruction *CrashTerm = nullptr;
+
+  if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
+    // We use branch weights for the slow path check, to indicate that the slow
+    // path is rarely taken. This seems to be the case for SPEC benchmarks.
+    Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+        Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+    assert(cast<BranchInst>(CheckTerm)->isUnconditional());
+    BasicBlock *NextBB = CheckTerm->getSuccessor(0);
+    IRB.SetInsertPoint(CheckTerm);
+    Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
+    if (Recover) {
+      CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false);
+    } else {
+      BasicBlock *CrashBlock =
+          BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
+      CrashTerm = new UnreachableInst(*C, CrashBlock);
+      BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
+      ReplaceInstWithInst(CheckTerm, NewTerm);
+    }
+  } else {
+    CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover);
+  }
+
+  Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite,
+                                         AccessSizeIndex, SizeArgument, Exp);
+  Crash->setDebugLoc(OrigIns->getDebugLoc());
+}
+
+// Instrument unusual size or unusual alignment.
+// We cannot do it with a single check, so we do a 1-byte check for the first
+// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
+// to report the actual access size.
+void AddressSanitizer::instrumentUnusualSizeOrAlignment(
+    Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize,
+    bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) {
+  IRBuilder<> IRB(InsertBefore);
+  Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
+  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+  if (UseCalls) {
+    if (Exp == 0)
+      IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
+                     {AddrLong, Size});
+    else
+      IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1],
+                     {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+  } else {
+    Value *LastByte = IRB.CreateIntToPtr(
+        IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
+        Addr->getType());
+    instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp);
+    instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp);
+  }
+}
+
+void ModuleAddressSanitizer::poisonOneInitializer(Function &GlobalInit,
+                                                  GlobalValue *ModuleName) {
+  // Set up the arguments to our poison/unpoison functions.
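+  // (The poison call runs before any user code in this dynamic initializer,
+  // and a matching unpoison call is inserted before every return, so other
+  // TUs' globals are poisoned exactly for the duration of the initializer.)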
+  IRBuilder<> IRB(&GlobalInit.front(),
+                  GlobalInit.front().getFirstInsertionPt());
+
+  // Add a call to poison all external globals before the given function starts.
+  Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
+  IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
+
+  // Add calls to unpoison all globals before each return instruction.
+  for (auto &BB : GlobalInit.getBasicBlockList())
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+      CallInst::Create(AsanUnpoisonGlobals, "", RI);
+}
+
+void ModuleAddressSanitizer::createInitializerPoisonCalls(
+    Module &M, GlobalValue *ModuleName) {
+  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+  if (!GV)
+    return;
+
+  ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!CA)
+    return;
+
+  for (Use &OP : CA->operands()) {
+    if (isa<ConstantAggregateZero>(OP)) continue;
+    ConstantStruct *CS = cast<ConstantStruct>(OP);
+
+    // Must have a function or null ptr.
+    if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
+      if (F->getName() == kAsanModuleCtorName) continue;
+      auto *Priority = cast<ConstantInt>(CS->getOperand(0));
+      // Don't instrument CTORs that will run before asan.module_ctor.
+      if (Priority->getLimitedValue() <= GetCtorAndDtorPriority(TargetTriple))
+        continue;
+      poisonOneInitializer(*F, ModuleName);
+    }
+  }
+}
+
+bool ModuleAddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
+  Type *Ty = G->getValueType();
+  LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
+
+  // FIXME: Metadata should be attached directly to the global instead
+  // of being added to llvm.asan.globals.
+  if (GlobalsMD.get(G).IsBlacklisted) return false;
+  if (!Ty->isSized()) return false;
+  if (!G->hasInitializer()) return false;
+  if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
+  // Two problems with thread-locals:
+  //   - The address of the main thread's copy can't be computed at link-time.
+  //   - Need to poison all copies, not just the main thread's one.
+  if (G->isThreadLocal()) return false;
+  // For now, just ignore this Global if the alignment is large.
+  if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false;
+
+  // For non-COFF targets, only instrument globals known to be defined by this
+  // TU.
+  // FIXME: We can instrument comdat globals on ELF if we are using the
+  // GC-friendly metadata scheme.
+  if (!TargetTriple.isOSBinFormatCOFF()) {
+    if (!G->hasExactDefinition() || G->hasComdat())
+      return false;
+  } else {
+    // On COFF, don't instrument non-ODR linkages.
+    if (G->isInterposable())
+      return false;
+  }
+
+  // If a comdat is present, it must have a selection kind that implies ODR
+  // semantics: no duplicates, any, or exact match.
+  if (Comdat *C = G->getComdat()) {
+    switch (C->getSelectionKind()) {
+    case Comdat::Any:
+    case Comdat::ExactMatch:
+    case Comdat::NoDuplicates:
+      break;
+    case Comdat::Largest:
+    case Comdat::SameSize:
+      return false;
+    }
+  }
+
+  if (G->hasSection()) {
+    StringRef Section = G->getSection();
+
+    // Globals from llvm.metadata aren't emitted, do not instrument them.
+    if (Section == "llvm.metadata") return false;
+    // Do not instrument globals from special LLVM sections.
+    if (Section.find("__llvm") != StringRef::npos ||
+        Section.find("__LLVM") != StringRef::npos)
+      return false;
+
+    // Do not instrument function pointers to initialization and termination
+    // routines: the dynamic linker will not properly handle redzones.
+ if (Section.startswith(".preinit_array") || + Section.startswith(".init_array") || + Section.startswith(".fini_array")) { + return false; + } + + // On COFF, if the section name contains '$', it is highly likely that the + // user is using section sorting to create an array of globals similar to + // the way initialization callbacks are registered in .init_array and + // .CRT$XCU. The ATL also registers things in .ATL$__[azm]. Adding redzones + // to such globals is counterproductive, because the intent is that they + // will form an array, and out-of-bounds accesses are expected. + // See https://github.com/google/sanitizers/issues/305 + // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx + if (TargetTriple.isOSBinFormatCOFF() && Section.contains('$')) { + LLVM_DEBUG(dbgs() << "Ignoring global in sorted section (contains '$'): " + << *G << "\n"); + return false; + } + + if (TargetTriple.isOSBinFormatMachO()) { + StringRef ParsedSegment, ParsedSection; + unsigned TAA = 0, StubSize = 0; + bool TAAParsed; + std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier( + Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize); + assert(ErrorCode.empty() && "Invalid section specifier."); + + // Ignore the globals from the __OBJC section. The ObjC runtime assumes + // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to + // them. + if (ParsedSegment == "__OBJC" || + (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) { + LLVM_DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n"); + return false; + } + // See https://github.com/google/sanitizers/issues/32 + // Constant CFString instances are compiled in the following way: + // -- the string buffer is emitted into + // __TEXT,__cstring,cstring_literals + // -- the constant NSConstantString structure referencing that buffer + // is placed into __DATA,__cfstring + // Therefore there's no point in placing redzones into __DATA,__cfstring. + // Moreover, it causes the linker to crash on OS X 10.7 + if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") { + LLVM_DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n"); + return false; + } + // The linker merges the contents of cstring_literals and removes the + // trailing zeroes. + if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) { + LLVM_DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n"); + return false; + } + } + } + + return true; +} + +// On Mach-O platforms, we emit global metadata in a separate section of the +// binary in order to allow the linker to properly dead strip. This is only +// supported on recent versions of ld64. 
+bool ModuleAddressSanitizer::ShouldUseMachOGlobalsSection() const { + if (!TargetTriple.isOSBinFormatMachO()) + return false; + + if (TargetTriple.isMacOSX() && !TargetTriple.isMacOSXVersionLT(10, 11)) + return true; + if (TargetTriple.isiOS() /* or tvOS */ && !TargetTriple.isOSVersionLT(9)) + return true; + if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2)) + return true; + + return false; +} + +StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const { + switch (TargetTriple.getObjectFormat()) { + case Triple::COFF: return ".ASAN$GL"; + case Triple::ELF: return "asan_globals"; + case Triple::MachO: return "__DATA,__asan_globals,regular"; + case Triple::Wasm: + case Triple::XCOFF: + report_fatal_error( + "ModuleAddressSanitizer not implemented for object file format."); + case Triple::UnknownObjectFormat: + break; + } + llvm_unreachable("unsupported object format"); +} + +void ModuleAddressSanitizer::initializeCallbacks(Module &M) { + IRBuilder<> IRB(*C); + + // Declare our poisoning and unpoisoning functions. + AsanPoisonGlobals = + M.getOrInsertFunction(kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy); + AsanUnpoisonGlobals = + M.getOrInsertFunction(kAsanUnpoisonGlobalsName, IRB.getVoidTy()); + + // Declare functions that register/unregister globals. + AsanRegisterGlobals = M.getOrInsertFunction( + kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy); + AsanUnregisterGlobals = M.getOrInsertFunction( + kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy); + + // Declare the functions that find globals in a shared object and then invoke + // the (un)register function on them. + AsanRegisterImageGlobals = M.getOrInsertFunction( + kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy); + AsanUnregisterImageGlobals = M.getOrInsertFunction( + kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy); + + AsanRegisterElfGlobals = + M.getOrInsertFunction(kAsanRegisterElfGlobalsName, IRB.getVoidTy(), + IntptrTy, IntptrTy, IntptrTy); + AsanUnregisterElfGlobals = + M.getOrInsertFunction(kAsanUnregisterElfGlobalsName, IRB.getVoidTy(), + IntptrTy, IntptrTy, IntptrTy); +} + +// Put the metadata and the instrumented global in the same group. This ensures +// that the metadata is discarded if the instrumented global is discarded. +void ModuleAddressSanitizer::SetComdatForGlobalMetadata( + GlobalVariable *G, GlobalVariable *Metadata, StringRef InternalSuffix) { + Module &M = *G->getParent(); + Comdat *C = G->getComdat(); + if (!C) { + if (!G->hasName()) { + // If G is unnamed, it must be internal. Give it an artificial name + // so we can put it in a comdat. + assert(G->hasLocalLinkage()); + G->setName(Twine(kAsanGenPrefix) + "_anon_global"); + } + + if (!InternalSuffix.empty() && G->hasLocalLinkage()) { + std::string Name = G->getName(); + Name += InternalSuffix; + C = M.getOrInsertComdat(Name); + } else { + C = M.getOrInsertComdat(G->getName()); + } + + // Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF. Also upgrade private + // linkage to internal linkage so that a symbol table entry is emitted. This + // is necessary in order to create the comdat group. + if (TargetTriple.isOSBinFormatCOFF()) { + C->setSelectionKind(Comdat::NoDuplicates); + if (G->hasPrivateLinkage()) + G->setLinkage(GlobalValue::InternalLinkage); + } + G->setComdat(C); + } + + assert(G->hasComdat()); + Metadata->setComdat(G->getComdat()); +} + +// Create a separate metadata global and put it in the appropriate ASan +// global registration section. 
+GlobalVariable * +ModuleAddressSanitizer::CreateMetadataGlobal(Module &M, Constant *Initializer, + StringRef OriginalName) { + auto Linkage = TargetTriple.isOSBinFormatMachO() + ? GlobalVariable::InternalLinkage + : GlobalVariable::PrivateLinkage; + GlobalVariable *Metadata = new GlobalVariable( + M, Initializer->getType(), false, Linkage, Initializer, + Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName)); + Metadata->setSection(getGlobalMetadataSection()); + return Metadata; +} + +IRBuilder<> ModuleAddressSanitizer::CreateAsanModuleDtor(Module &M) { + AsanDtorFunction = + Function::Create(FunctionType::get(Type::getVoidTy(*C), false), + GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); + BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction); + + return IRBuilder<>(ReturnInst::Create(*C, AsanDtorBB)); +} + +void ModuleAddressSanitizer::InstrumentGlobalsCOFF( + IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals, + ArrayRef<Constant *> MetadataInitializers) { + assert(ExtendedGlobals.size() == MetadataInitializers.size()); + auto &DL = M.getDataLayout(); + + for (size_t i = 0; i < ExtendedGlobals.size(); i++) { + Constant *Initializer = MetadataInitializers[i]; + GlobalVariable *G = ExtendedGlobals[i]; + GlobalVariable *Metadata = + CreateMetadataGlobal(M, Initializer, G->getName()); + + // The MSVC linker always inserts padding when linking incrementally. We + // cope with that by aligning each struct to its size, which must be a power + // of two. + unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType()); + assert(isPowerOf2_32(SizeOfGlobalStruct) && + "global metadata will not be padded appropriately"); + Metadata->setAlignment(assumeAligned(SizeOfGlobalStruct)); + + SetComdatForGlobalMetadata(G, Metadata, ""); + } +} + +void ModuleAddressSanitizer::InstrumentGlobalsELF( + IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals, + ArrayRef<Constant *> MetadataInitializers, + const std::string &UniqueModuleId) { + assert(ExtendedGlobals.size() == MetadataInitializers.size()); + + SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size()); + for (size_t i = 0; i < ExtendedGlobals.size(); i++) { + GlobalVariable *G = ExtendedGlobals[i]; + GlobalVariable *Metadata = + CreateMetadataGlobal(M, MetadataInitializers[i], G->getName()); + MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G)); + Metadata->setMetadata(LLVMContext::MD_associated, MD); + MetadataGlobals[i] = Metadata; + + SetComdatForGlobalMetadata(G, Metadata, UniqueModuleId); + } + + // Update llvm.compiler.used, adding the new metadata globals. This is + // needed so that during LTO these variables stay alive. + if (!MetadataGlobals.empty()) + appendToCompilerUsed(M, MetadataGlobals); + + // RegisteredFlag serves two purposes. First, we can pass it to dladdr() + // to look up the loaded image that contains it. Second, we can store in it + // whether registration has already occurred, to prevent duplicate + // registration. + // + // Common linkage ensures that there is only one global per shared library. + GlobalVariable *RegisteredFlag = new GlobalVariable( + M, IntptrTy, false, GlobalVariable::CommonLinkage, + ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName); + RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility); + + // Create start and stop symbols. 
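+  // (For ELF sections whose names are valid C identifiers -- "asan_globals"
+  // here -- the linker automatically defines __start_<section> and
+  // __stop_<section> symbols delimiting the section, which the runtime can
+  // then walk; extern_weak linkage keeps the references safe even if the
+  // section turns out to be empty.)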
+  GlobalVariable *StartELFMetadata = new GlobalVariable(
+      M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
+      "__start_" + getGlobalMetadataSection());
+  StartELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
+  GlobalVariable *StopELFMetadata = new GlobalVariable(
+      M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
+      "__stop_" + getGlobalMetadataSection());
+  StopELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
+
+  // Create a call to register the globals with the runtime.
+  IRB.CreateCall(AsanRegisterElfGlobals,
+                 {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+                  IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+                  IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+
+  // We also need to unregister globals at the end, e.g., when a shared library
+  // gets closed.
+  IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+  IRB_Dtor.CreateCall(AsanUnregisterElfGlobals,
+                      {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+                       IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+                       IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+}
+
+void ModuleAddressSanitizer::InstrumentGlobalsMachO(
+    IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+    ArrayRef<Constant *> MetadataInitializers) {
+  assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+  // On recent Mach-O platforms, use a structure which binds the liveness of
+  // the global variable to the metadata struct. Keep the list of "Liveness"
+  // GVs created, to be added to llvm.compiler.used.
+  StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
+  SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
+
+  for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+    Constant *Initializer = MetadataInitializers[i];
+    GlobalVariable *G = ExtendedGlobals[i];
+    GlobalVariable *Metadata =
+        CreateMetadataGlobal(M, Initializer, G->getName());
+
+    // On recent Mach-O platforms, we emit the global metadata in a way that
+    // allows the linker to properly strip dead globals.
+    auto LivenessBinder =
+        ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+                            ConstantExpr::getPointerCast(Metadata, IntptrTy));
+    GlobalVariable *Liveness = new GlobalVariable(
+        M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
+        Twine("__asan_binder_") + G->getName());
+    Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
+    LivenessGlobals[i] = Liveness;
+  }
+
+  // Update llvm.compiler.used, adding the new liveness globals. This is
+  // needed so that during LTO these variables stay alive. The alternative
+  // would be to have the linker handle the LTO symbols, but libLTO's
+  // current API does not expose access to the section for each symbol.
+  if (!LivenessGlobals.empty())
+    appendToCompilerUsed(M, LivenessGlobals);
+
+  // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+  // to look up the loaded image that contains it. Second, we can store in it
+  // whether registration has already occurred, to prevent duplicate
+  // registration.
+  //
+  // Common linkage ensures that there is only one global per shared library.
+  GlobalVariable *RegisteredFlag = new GlobalVariable(
+      M, IntptrTy, false, GlobalVariable::CommonLinkage,
+      ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+  RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
+  IRB.CreateCall(AsanRegisterImageGlobals,
+                 {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+
+  // We also need to unregister globals at the end, e.g., when a shared library
+  // gets closed.
+  IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+  IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
+                      {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+}
+
+void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray(
+    IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+    ArrayRef<Constant *> MetadataInitializers) {
+  assert(ExtendedGlobals.size() == MetadataInitializers.size());
+  unsigned N = ExtendedGlobals.size();
+  assert(N > 0);
+
+  // On platforms that don't have a custom metadata section, we emit an array
+  // of global metadata structures.
+  ArrayType *ArrayOfGlobalStructTy =
+      ArrayType::get(MetadataInitializers[0]->getType(), N);
+  auto AllGlobals = new GlobalVariable(
+      M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
+      ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), "");
+  if (Mapping.Scale > 3)
+    AllGlobals->setAlignment(Align(1ULL << Mapping.Scale));
+
+  IRB.CreateCall(AsanRegisterGlobals,
+                 {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+                  ConstantInt::get(IntptrTy, N)});
+
+  // We also need to unregister globals at the end, e.g., when a shared library
+  // gets closed.
+  IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+  IRB_Dtor.CreateCall(AsanUnregisterGlobals,
+                      {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+                       ConstantInt::get(IntptrTy, N)});
+}
+
+// This function replaces all global variables with new variables that have
+// trailing redzones. It also creates a function that poisons
+// redzones and inserts this function into llvm.global_ctors.
+// Sets *CtorComdat to true if the global registration code emitted into the
+// asan constructor is comdat-compatible.
+bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
+                                               bool *CtorComdat) {
+  *CtorComdat = false;
+
+  SmallVector<GlobalVariable *, 16> GlobalsToChange;
+
+  for (auto &G : M.globals()) {
+    if (ShouldInstrumentGlobal(&G)) GlobalsToChange.push_back(&G);
+  }
+
+  size_t n = GlobalsToChange.size();
+  if (n == 0) {
+    *CtorComdat = true;
+    return false;
+  }
+
+  auto &DL = M.getDataLayout();
+
+  // A global is described by a structure
+  //   size_t beg;
+  //   size_t size;
+  //   size_t size_with_redzone;
+  //   const char *name;
+  //   const char *module_name;
+  //   size_t has_dynamic_init;
+  //   void *source_location;
+  //   size_t odr_indicator;
+  // We initialize an array of such structures and pass it to a run-time call.
+  StructType *GlobalStructTy =
+      StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
+                      IntptrTy, IntptrTy, IntptrTy);
+  SmallVector<GlobalVariable *, 16> NewGlobals(n);
+  SmallVector<Constant *, 16> Initializers(n);
+
+  bool HasDynamicallyInitializedGlobals = false;
+
+  // We shouldn't merge the same module names, as this string serves as a
+  // unique module ID at runtime.
+  GlobalVariable *ModuleName = createPrivateGlobalForString(
+      M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
+
+  for (size_t i = 0; i < n; i++) {
+    static const uint64_t kMaxGlobalRedzone = 1 << 18;
+    GlobalVariable *G = GlobalsToChange[i];
+
+    // FIXME: Metadata should be attached directly to the global instead
+    // of being added to llvm.asan.globals.
+    auto MD = GlobalsMD.get(G);
+    StringRef NameForGlobal = G->getName();
+    // Create a string holding the global name (use the global name from the
+    // metadata if it's available, otherwise just write the name of the global
+    // variable).
+    GlobalVariable *Name = createPrivateGlobalForString(
+        M, MD.Name.empty() ? NameForGlobal : MD.Name,
+        /*AllowMerging*/ true, kAsanGenPrefix);
+
+    Type *Ty = G->getValueType();
+    uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+    uint64_t MinRZ = MinRedzoneSizeForGlobal();
+    // MinRZ <= RZ <= kMaxGlobalRedzone,
+    // and we try to make RZ ~ 1/4 of SizeInBytes.
+    uint64_t RZ = std::max(
+        MinRZ, std::min(kMaxGlobalRedzone, (SizeInBytes / MinRZ / 4) * MinRZ));
+    uint64_t RightRedzoneSize = RZ;
+    // Round up to MinRZ.
+    if (SizeInBytes % MinRZ) RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
+    assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
+    Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
+
+    StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+    Constant *NewInitializer = ConstantStruct::get(
+        NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
+
+    // Create a new global variable with enough space for a redzone.
+    GlobalValue::LinkageTypes Linkage = G->getLinkage();
+    if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
+      Linkage = GlobalValue::InternalLinkage;
+    GlobalVariable *NewGlobal =
+        new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer,
+                           "", G, G->getThreadLocalMode());
+    NewGlobal->copyAttributesFrom(G);
+    NewGlobal->setComdat(G->getComdat());
+    NewGlobal->setAlignment(MaybeAlign(MinRZ));
+    // Don't fold globals with redzones. ODR violation detection and redzone
+    // poisoning implicitly create a dependence on the global's address, so it
+    // is no longer valid for it to be marked unnamed_addr.
+    NewGlobal->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
+
+    // Move null-terminated C strings to "__asan_cstring" section on Darwin.
+    if (TargetTriple.isOSBinFormatMachO() && !G->hasSection() &&
+        G->isConstant()) {
+      auto Seq = dyn_cast<ConstantDataSequential>(G->getInitializer());
+      if (Seq && Seq->isCString())
+        NewGlobal->setSection("__TEXT,__asan_cstring,regular");
+    }
+
+    // Transfer the debug info. The payload starts at offset zero so we can
+    // copy the debug info over as is.
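+    // (NewGlobal is a struct { original type, redzone byte array }, so the
+    // original payload indeed lives at offset zero of the new global.)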
+    SmallVector<DIGlobalVariableExpression *, 1> GVs;
+    G->getDebugInfo(GVs);
+    for (auto *GV : GVs)
+      NewGlobal->addDebugInfo(GV);
+
+    Value *Indices2[2];
+    Indices2[0] = IRB.getInt32(0);
+    Indices2[1] = IRB.getInt32(0);
+
+    G->replaceAllUsesWith(
+        ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true));
+    NewGlobal->takeName(G);
+    G->eraseFromParent();
+    NewGlobals[i] = NewGlobal;
+
+    Constant *SourceLoc;
+    if (!MD.SourceLoc.empty()) {
+      auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc);
+      SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy);
+    } else {
+      SourceLoc = ConstantInt::get(IntptrTy, 0);
+    }
+
+    Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
+    GlobalValue *InstrumentedGlobal = NewGlobal;
+
+    bool CanUsePrivateAliases =
+        TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
+        TargetTriple.isOSBinFormatWasm();
+    if (CanUsePrivateAliases && UsePrivateAlias) {
+      // Create a local alias for NewGlobal to avoid crashing on ODR between
+      // instrumented and non-instrumented libraries.
+      InstrumentedGlobal =
+          GlobalAlias::create(GlobalValue::PrivateLinkage, "", NewGlobal);
+    }
+
+    // ODR violations should not happen for local linkage.
+    if (NewGlobal->hasLocalLinkage()) {
+      ODRIndicator = ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1),
+                                               IRB.getInt8PtrTy());
+    } else if (UseOdrIndicator) {
+      // With local aliases, we need to provide another externally visible
+      // symbol __odr_asan_XXX to detect ODR violations.
+      auto *ODRIndicatorSym =
+          new GlobalVariable(M, IRB.getInt8Ty(), false, Linkage,
+                             Constant::getNullValue(IRB.getInt8Ty()),
+                             kODRGenPrefix + NameForGlobal, nullptr,
+                             NewGlobal->getThreadLocalMode());
+
+      // Set meaningful attributes for the indicator symbol.
+      ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
+      ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
+      ODRIndicatorSym->setAlignment(Align::None());
+      ODRIndicator = ODRIndicatorSym;
+    }
+
+    Constant *Initializer = ConstantStruct::get(
+        GlobalStructTy,
+        ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
+        ConstantInt::get(IntptrTy, SizeInBytes),
+        ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
+        ConstantExpr::getPointerCast(Name, IntptrTy),
+        ConstantExpr::getPointerCast(ModuleName, IntptrTy),
+        ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
+        ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
+
+    if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
+
+    LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
+
+    Initializers[i] = Initializer;
+  }
+
+  // Add instrumented globals to the llvm.compiler.used list to prevent LTO
+  // from ConstantMerge'ing them.
+  SmallVector<GlobalValue *, 16> GlobalsToAddToUsedList;
+  for (size_t i = 0; i < n; i++) {
+    GlobalVariable *G = NewGlobals[i];
+    if (G->getName().empty()) continue;
+    GlobalsToAddToUsedList.push_back(G);
+  }
+  appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
+
+  std::string ELFUniqueModuleId =
+      (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ?
getUniqueModuleId(&M)
+                                                        : "";
+
+  if (!ELFUniqueModuleId.empty()) {
+    InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
+    *CtorComdat = true;
+  } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
+    InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
+  } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
+    InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
+  } else {
+    InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
+  }
+
+  // Create calls for poisoning before initializers run and unpoisoning after.
+  if (HasDynamicallyInitializedGlobals)
+    createInitializerPoisonCalls(M, ModuleName);
+
+  LLVM_DEBUG(dbgs() << M);
+  return true;
+}
+
+int ModuleAddressSanitizer::GetAsanVersion(const Module &M) const {
+  int LongSize = M.getDataLayout().getPointerSizeInBits();
+  bool isAndroid = Triple(M.getTargetTriple()).isAndroid();
+  int Version = 8;
+  // 32-bit Android is one version ahead because of the switch to dynamic
+  // shadow.
+  Version += (LongSize == 32 && isAndroid);
+  return Version;
+}
+
+bool ModuleAddressSanitizer::instrumentModule(Module &M) {
+  initializeCallbacks(M);
+
+  if (CompileKernel)
+    return false;
+
+  // Create a module constructor. A destructor is created lazily because not
+  // all platforms, and not all modules, need it.
+  std::string AsanVersion = std::to_string(GetAsanVersion(M));
+  std::string VersionCheckName =
+      ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : "";
+  std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+      M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{},
+      /*InitArgs=*/{}, VersionCheckName);
+
+  bool CtorComdat = true;
+  // TODO(glider): temporarily disabled globals instrumentation for KASan.
+  if (ClGlobals) {
+    IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
+    InstrumentGlobals(IRB, M, &CtorComdat);
+  }
+
+  const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple);
+
+  // Put the constructor and destructor in comdat if both
+  // (1) global instrumentation is not TU-specific, and
+  // (2) the target is ELF.
+  if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+    AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
+    appendToGlobalCtors(M, AsanCtorFunction, Priority, AsanCtorFunction);
+    if (AsanDtorFunction) {
+      AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName));
+      appendToGlobalDtors(M, AsanDtorFunction, Priority, AsanDtorFunction);
+    }
+  } else {
+    appendToGlobalCtors(M, AsanCtorFunction, Priority);
+    if (AsanDtorFunction)
+      appendToGlobalDtors(M, AsanDtorFunction, Priority);
+  }
+
+  return true;
+}
+
+void AddressSanitizer::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
+  // Create __asan_report* callbacks.
+  // IsWrite, TypeSize and Exp are encoded in the function name.
+  for (int Exp = 0; Exp < 2; Exp++) {
+    for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+      const std::string TypeStr = AccessIsWrite ? "store" : "load";
+      const std::string ExpStr = Exp ? "exp_" : "";
+      const std::string EndingStr = Recover ?
"_noabort" : ""; + + SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy}; + SmallVector<Type *, 2> Args1{1, IntptrTy}; + if (Exp) { + Type *ExpType = Type::getInt32Ty(*C); + Args2.push_back(ExpType); + Args1.push_back(ExpType); + } + AsanErrorCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction( + kAsanReportErrorTemplate + ExpStr + TypeStr + "_n" + EndingStr, + FunctionType::get(IRB.getVoidTy(), Args2, false)); + + AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr, + FunctionType::get(IRB.getVoidTy(), Args2, false)); + + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex); + AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] = + M.getOrInsertFunction( + kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr, + FunctionType::get(IRB.getVoidTy(), Args1, false)); + + AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] = + M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr, + FunctionType::get(IRB.getVoidTy(), Args1, false)); + } + } + } + + const std::string MemIntrinCallbackPrefix = + CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix; + AsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove", + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy); + AsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy", + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy); + AsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset", + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt32Ty(), IntptrTy); + + AsanHandleNoReturnFunc = + M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy()); + + AsanPtrCmpFunction = + M.getOrInsertFunction(kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy); + AsanPtrSubFunction = + M.getOrInsertFunction(kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy); + // We insert an empty inline asm after __asan_report* to avoid callback merge. + EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); + if (Mapping.InGlobal) + AsanShadowGlobal = M.getOrInsertGlobal("__asan_shadow", + ArrayType::get(IRB.getInt8Ty(), 0)); +} + +bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { + // For each NSObject descendant having a +load method, this method is invoked + // by the ObjC runtime before any of the static constructors is called. + // Therefore we need to instrument such methods with a call to __asan_init + // at the beginning in order to initialize our runtime before any access to + // the shadow memory. + // We cannot just ignore these methods, because they may call other + // instrumented functions. + if (F.getName().find(" load]") != std::string::npos) { + FunctionCallee AsanInitFunction = + declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {}); + IRBuilder<> IRB(&F.front(), F.front().begin()); + IRB.CreateCall(AsanInitFunction, {}); + return true; + } + return false; +} + +void AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) { + // Generate code only when dynamic addressing is needed. + if (Mapping.Offset != kDynamicShadowSentinel) + return; + + IRBuilder<> IRB(&F.front().front()); + if (Mapping.InGlobal) { + if (ClWithIfuncSuppressRemat) { + // An empty inline asm with input reg == output reg. 
+      // An opaque pointer-to-int cast, basically.
+      InlineAsm *Asm = InlineAsm::get(
+          FunctionType::get(IntptrTy, {AsanShadowGlobal->getType()}, false),
+          StringRef(""), StringRef("=r,0"),
+          /*hasSideEffects=*/false);
+      LocalDynamicShadow =
+          IRB.CreateCall(Asm, {AsanShadowGlobal}, ".asan.shadow");
+    } else {
+      LocalDynamicShadow =
+          IRB.CreatePointerCast(AsanShadowGlobal, IntptrTy, ".asan.shadow");
+    }
+  } else {
+    Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+        kAsanShadowMemoryDynamicAddress, IntptrTy);
+    LocalDynamicShadow = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
+  }
+}
+
+void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
+  // Find the one possible call to llvm.localescape and pre-mark allocas passed
+  // to it as uninteresting. This assumes we haven't started processing allocas
+  // yet. This check is done up front because iterating the use list in
+  // isInterestingAlloca would be algorithmically slower.
+  assert(ProcessedAllocas.empty() && "must process localescape before allocas");
+
+  // Try to get the declaration of llvm.localescape. If it's not in the module,
+  // we can exit early.
+  if (!F.getParent()->getFunction("llvm.localescape")) return;
+
+  // Look for a call to llvm.localescape in the entry block. It can't be in
+  // any other block.
+  for (Instruction &I : F.getEntryBlock()) {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+    if (II && II->getIntrinsicID() == Intrinsic::localescape) {
+      // We found a call. Mark all the allocas passed in as uninteresting.
+      for (Value *Arg : II->arg_operands()) {
+        AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts());
+        assert(AI && AI->isStaticAlloca() &&
+               "non-static alloca arg to localescape");
+        ProcessedAllocas[AI] = false;
+      }
+      break;
+    }
+  }
+}
+
+bool AddressSanitizer::instrumentFunction(Function &F,
+                                          const TargetLibraryInfo *TLI) {
+  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
+  if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
+  if (F.getName().startswith("__asan_")) return false;
+
+  bool FunctionModified = false;
+
+  // If needed, insert __asan_init before checking for SanitizeAddress attr.
+  // This function needs to be called even if the function body is not
+  // instrumented.
+  if (maybeInsertAsanInitAtFunctionEntry(F))
+    FunctionModified = true;
+
+  // Leave if the function doesn't need instrumentation.
+  if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
+
+  LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+
+  initializeCallbacks(*F.getParent());
+
+  FunctionStateRAII CleanupObj(this);
+
+  maybeInsertDynamicShadowAtFunctionEntry(F);
+
+  // We can't instrument allocas used with llvm.localescape. Only static allocas
+  // can be passed to that intrinsic.
+  markEscapedLocalAllocas(F);
+
+  // We want to instrument every address only once per basic block (unless there
+  // are calls between uses).
+  SmallPtrSet<Value *, 16> TempsToInstrument;
+  SmallVector<Instruction *, 16> ToInstrument;
+  SmallVector<Instruction *, 8> NoReturnCalls;
+  SmallVector<BasicBlock *, 16> AllBlocks;
+  SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
+  int NumAllocas = 0;
+  bool IsWrite;
+  unsigned Alignment;
+  uint64_t TypeSize;
+
+  // Fill the set of memory operations to instrument.
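+  // (A single scan over the function: interesting loads/stores and mem
+  // intrinsics go to ToInstrument, pointer comparisons/subtractions to
+  // PointerComparisonsOrSubtracts, and noreturn calls are recorded so the
+  // stack can be unpoisoned before them later on.)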
+  for (auto &BB : F) {
+    AllBlocks.push_back(&BB);
+    TempsToInstrument.clear();
+    int NumInsnsPerBB = 0;
+    for (auto &Inst : BB) {
+      if (LooksLikeCodeInBug11395(&Inst)) return false;
+      Value *MaybeMask = nullptr;
+      if (Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize,
+                                                  &Alignment, &MaybeMask)) {
+        if (ClOpt && ClOptSameTemp) {
+          // If we have a mask, skip instrumentation if we've already
+          // instrumented the full object. But don't add to TempsToInstrument
+          // because we might get another load/store with a different mask.
+          if (MaybeMask) {
+            if (TempsToInstrument.count(Addr))
+              continue; // We've seen this (whole) temp in the current BB.
+          } else {
+            if (!TempsToInstrument.insert(Addr).second)
+              continue; // We've seen this temp in the current BB.
+          }
+        }
+      } else if (((ClInvalidPointerPairs || ClInvalidPointerCmp) &&
+                  isInterestingPointerComparison(&Inst)) ||
+                 ((ClInvalidPointerPairs || ClInvalidPointerSub) &&
+                  isInterestingPointerSubtraction(&Inst))) {
+        PointerComparisonsOrSubtracts.push_back(&Inst);
+        continue;
+      } else if (isa<MemIntrinsic>(Inst)) {
+        // ok, take it.
+      } else {
+        if (isa<AllocaInst>(Inst)) NumAllocas++;
+        CallSite CS(&Inst);
+        if (CS) {
+          // A call inside BB.
+          TempsToInstrument.clear();
+          if (CS.doesNotReturn() && !CS->hasMetadata("nosanitize"))
+            NoReturnCalls.push_back(CS.getInstruction());
+        }
+        if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+          maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+        continue;
+      }
+      ToInstrument.push_back(&Inst);
+      NumInsnsPerBB++;
+      if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
+    }
+  }
+
+  bool UseCalls =
+      (ClInstrumentationWithCallsThreshold >= 0 &&
+       ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  ObjectSizeOpts ObjSizeOpts;
+  ObjSizeOpts.RoundToAlign = true;
+  ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
+
+  // Instrument.
+  int NumInstrumented = 0;
+  for (auto Inst : ToInstrument) {
+    if (ClDebugMin < 0 || ClDebugMax < 0 ||
+        (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
+      if (isInterestingMemoryAccess(Inst, &IsWrite, &TypeSize, &Alignment))
+        instrumentMop(ObjSizeVis, Inst, UseCalls,
+                      F.getParent()->getDataLayout());
+      else
+        instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+    }
+    NumInstrumented++;
+  }
+
+  FunctionStackPoisoner FSP(F, *this);
+  bool ChangedStack = FSP.runOnFunction();
+
+  // We must unpoison the stack before NoReturn calls (throw, _exit, etc).
+  // See e.g. https://github.com/google/sanitizers/issues/37
+  for (auto CI : NoReturnCalls) {
+    IRBuilder<> IRB(CI);
+    IRB.CreateCall(AsanHandleNoReturnFunc, {});
+  }
+
+  for (auto Inst : PointerComparisonsOrSubtracts) {
+    instrumentPointerComparisonOrSubtraction(Inst);
+    NumInstrumented++;
+  }
+
+  if (NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty())
+    FunctionModified = true;
+
+  LLVM_DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
+                    << F << "\n");
+
+  return FunctionModified;
+}
+
+// Workaround for bug 11395: we don't want to instrument the stack in functions
+// with large assembly blobs (32-bit only), otherwise reg alloc may crash.
+// FIXME: remove once bug 11395 is fixed.
+bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
+  if (LongSize != 32) return false;
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->isInlineAsm()) return false;
+  if (CI->getNumArgOperands() <= 5) return false;
+  // We have inline assembly with quite a few arguments.
+  return true;
+}
+
+void FunctionStackPoisoner::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
+  for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
+    std::string Suffix = itostr(i);
+    AsanStackMallocFunc[i] = M.getOrInsertFunction(
+        kAsanStackMallocNameTemplate + Suffix, IntptrTy, IntptrTy);
+    AsanStackFreeFunc[i] =
+        M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
+                              IRB.getVoidTy(), IntptrTy, IntptrTy);
+  }
+  if (ASan.UseAfterScope) {
+    AsanPoisonStackMemoryFunc = M.getOrInsertFunction(
+        kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+    AsanUnpoisonStackMemoryFunc = M.getOrInsertFunction(
+        kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+  }
+
+  for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
+    std::ostringstream Name;
+    Name << kAsanSetShadowPrefix;
+    Name << std::setw(2) << std::setfill('0') << std::hex << Val;
+    AsanSetShadowFunc[Val] =
+        M.getOrInsertFunction(Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy);
+  }
+
+  AsanAllocaPoisonFunc = M.getOrInsertFunction(
+      kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
+  AsanAllocasUnpoisonFunc = M.getOrInsertFunction(
+      kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
+}
+
+void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+                                               ArrayRef<uint8_t> ShadowBytes,
+                                               size_t Begin, size_t End,
+                                               IRBuilder<> &IRB,
+                                               Value *ShadowBase) {
+  if (Begin >= End)
+    return;
+
+  const size_t LargestStoreSizeInBytes =
+      std::min<size_t>(sizeof(uint64_t), ASan.LongSize / 8);
+
+  const bool IsLittleEndian = F.getParent()->getDataLayout().isLittleEndian();
+
+  // Poison the given range in shadow using the largest store size, skipping
+  // the leading and trailing zeros in ShadowMask. Zeros never change, so they
+  // need neither poisoning nor up-poisoning. Still, we don't mind if some of
+  // them end up in the middle of a store.
+  for (size_t i = Begin; i < End;) {
+    if (!ShadowMask[i]) {
+      assert(!ShadowBytes[i]);
+      ++i;
+      continue;
+    }
+
+    size_t StoreSizeInBytes = LargestStoreSizeInBytes;
+    // Fit store size into the range.
+    while (StoreSizeInBytes > End - i)
+      StoreSizeInBytes /= 2;
+
+    // Minimize store size by trimming trailing zeros.
+    for (size_t j = StoreSizeInBytes - 1; j && !ShadowMask[i + j]; --j) {
+      while (j <= StoreSizeInBytes / 2)
+        StoreSizeInBytes /= 2;
+    }
+
+    uint64_t Val = 0;
+    for (size_t j = 0; j < StoreSizeInBytes; j++) {
+      if (IsLittleEndian)
+        Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
+      else
+        Val = (Val << 8) | ShadowBytes[i + j];
+    }
+
+    Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+    Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
+    IRB.CreateAlignedStore(
+        Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()), 1);
+
+    i += StoreSizeInBytes;
+  }
+}
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+                                         ArrayRef<uint8_t> ShadowBytes,
+                                         IRBuilder<> &IRB, Value *ShadowBase) {
+  copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase);
+}
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+                                         ArrayRef<uint8_t> ShadowBytes,
+                                         size_t Begin, size_t End,
+                                         IRBuilder<> &IRB, Value *ShadowBase) {
+  assert(ShadowMask.size() == ShadowBytes.size());
+  size_t Done = Begin;
+  for (size_t i = Begin, j = Begin + 1; i < End; i = j++) {
+    if (!ShadowMask[i]) {
+      assert(!ShadowBytes[i]);
+      continue;
+    }
+    uint8_t Val = ShadowBytes[i];
+    if (!AsanSetShadowFunc[Val])
+      continue;
+
+    // Skip same values.
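+    // (Extend j over the run of identical shadow bytes starting at i; a long
+    // enough run becomes one __asan_set_shadow_xx call below instead of a
+    // sequence of inline stores.)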
+ for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) { + } + + if (j - i >= ClMaxInlinePoisoningSize) { + copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase); + IRB.CreateCall(AsanSetShadowFunc[Val], + {IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)), + ConstantInt::get(IntptrTy, j - i)}); + Done = j; + } + } + + copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase); +} + +// Fake stack allocator (asan_fake_stack.h) has 11 size classes +// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass +static int StackMallocSizeClass(uint64_t LocalStackSize) { + assert(LocalStackSize <= kMaxStackMallocSize); + uint64_t MaxSize = kMinStackMallocSize; + for (int i = 0;; i++, MaxSize *= 2) + if (LocalStackSize <= MaxSize) return i; + llvm_unreachable("impossible LocalStackSize"); +} + +void FunctionStackPoisoner::copyArgsPassedByValToAllocas() { + Instruction *CopyInsertPoint = &F.front().front(); + if (CopyInsertPoint == ASan.LocalDynamicShadow) { + // Insert after the dynamic shadow location is determined + CopyInsertPoint = CopyInsertPoint->getNextNode(); + assert(CopyInsertPoint); + } + IRBuilder<> IRB(CopyInsertPoint); + const DataLayout &DL = F.getParent()->getDataLayout(); + for (Argument &Arg : F.args()) { + if (Arg.hasByValAttr()) { + Type *Ty = Arg.getType()->getPointerElementType(); + unsigned Alignment = Arg.getParamAlignment(); + if (Alignment == 0) + Alignment = DL.getABITypeAlignment(Ty); + + AllocaInst *AI = IRB.CreateAlloca( + Ty, nullptr, + (Arg.hasName() ? Arg.getName() : "Arg" + Twine(Arg.getArgNo())) + + ".byval"); + AI->setAlignment(Align(Alignment)); + Arg.replaceAllUsesWith(AI); + + uint64_t AllocSize = DL.getTypeAllocSize(Ty); + IRB.CreateMemCpy(AI, Alignment, &Arg, Alignment, AllocSize); + } + } +} + +PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond, + Value *ValueIfTrue, + Instruction *ThenTerm, + Value *ValueIfFalse) { + PHINode *PHI = IRB.CreatePHI(IntptrTy, 2); + BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent(); + PHI->addIncoming(ValueIfFalse, CondBlock); + BasicBlock *ThenBlock = ThenTerm->getParent(); + PHI->addIncoming(ValueIfTrue, ThenBlock); + return PHI; +} + +Value *FunctionStackPoisoner::createAllocaForLayout( + IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) { + AllocaInst *Alloca; + if (Dynamic) { + Alloca = IRB.CreateAlloca(IRB.getInt8Ty(), + ConstantInt::get(IRB.getInt64Ty(), L.FrameSize), + "MyAlloca"); + } else { + Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize), + nullptr, "MyAlloca"); + assert(Alloca->isStaticAlloca()); + } + assert((ClRealignStack & (ClRealignStack - 1)) == 0); + size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); + Alloca->setAlignment(MaybeAlign(FrameAlignment)); + return IRB.CreatePointerCast(Alloca, IntptrTy); +} + +void FunctionStackPoisoner::createDynamicAllocasInitStorage() { + BasicBlock &FirstBB = *F.begin(); + IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin())); + DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr); + IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout); + DynamicAllocaLayout->setAlignment(Align(32)); +} + +void FunctionStackPoisoner::processDynamicAllocas() { + if (!ClInstrumentDynamicAllocas || DynamicAllocaVec.empty()) { + assert(DynamicAllocaPoisonCallVec.empty()); + return; + } + + // Insert poison calls for lifetime intrinsics for dynamic allocas. 
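+  // (Each record in DynamicAllocaPoisonCallVec came from a
+  // llvm.lifetime.start/end marker; APC.DoPoison distinguishes the two:
+  // poison at lifetime.end, unpoison at lifetime.start.)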
+  for (const auto &APC : DynamicAllocaPoisonCallVec) {
+    assert(APC.InsBefore);
+    assert(APC.AI);
+    assert(ASan.isInterestingAlloca(*APC.AI));
+    assert(!APC.AI->isStaticAlloca());
+
+    IRBuilder<> IRB(APC.InsBefore);
+    poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
+    // Dynamic allocas will be unpoisoned unconditionally below in
+    // unpoisonDynamicAllocas.
+    // Flag that we need to unpoison static allocas.
+  }
+
+  // Handle dynamic allocas.
+  createDynamicAllocasInitStorage();
+  for (auto &AI : DynamicAllocaVec)
+    handleDynamicAllocaCall(AI);
+  unpoisonDynamicAllocas();
+}
+
+void FunctionStackPoisoner::processStaticAllocas() {
+  if (AllocaVec.empty()) {
+    assert(StaticAllocaPoisonCallVec.empty());
+    return;
+  }
+
+  int StackMallocIdx = -1;
+  DebugLoc EntryDebugLocation;
+  if (auto SP = F.getSubprogram())
+    EntryDebugLocation = DebugLoc::get(SP->getScopeLine(), 0, SP);
+
+  Instruction *InsBefore = AllocaVec[0];
+  IRBuilder<> IRB(InsBefore);
+  IRB.SetCurrentDebugLocation(EntryDebugLocation);
+
+  // Make sure non-instrumented allocas stay in the entry block. Otherwise,
+  // debug info is broken, because only entry-block allocas are treated as
+  // regular stack slots.
+  auto InsBeforeB = InsBefore->getParent();
+  assert(InsBeforeB == &F.getEntryBlock());
+  for (auto *AI : StaticAllocasToMoveUp)
+    if (AI->getParent() == InsBeforeB)
+      AI->moveBefore(InsBefore);
+
+  // If we have a call to llvm.localescape, keep it in the entry block.
+  if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);
+
+  SmallVector<ASanStackVariableDescription, 16> SVD;
+  SVD.reserve(AllocaVec.size());
+  for (AllocaInst *AI : AllocaVec) {
+    ASanStackVariableDescription D = {AI->getName().data(),
+                                      ASan.getAllocaSizeInBytes(*AI),
+                                      0,
+                                      AI->getAlignment(),
+                                      AI,
+                                      0,
+                                      0};
+    SVD.push_back(D);
+  }
+
+  // Minimal header size (left redzone) is 4 pointers,
+  // i.e. 32 bytes on 64-bit platforms and 16 bytes on 32-bit platforms.
+  size_t Granularity = 1ULL << Mapping.Scale;
+  size_t MinHeaderSize = std::max((size_t)ASan.LongSize / 2, Granularity);
+  const ASanStackFrameLayout &L =
+      ComputeASanStackFrameLayout(SVD, Granularity, MinHeaderSize);
+
+  // Build AllocaToSVDMap for ASanStackVariableDescription lookup.
+  DenseMap<const AllocaInst *, ASanStackVariableDescription *> AllocaToSVDMap;
+  for (auto &Desc : SVD)
+    AllocaToSVDMap[Desc.AI] = &Desc;
+
+  // Update SVD with information from lifetime intrinsics.
+  for (const auto &APC : StaticAllocaPoisonCallVec) {
+    assert(APC.InsBefore);
+    assert(APC.AI);
+    assert(ASan.isInterestingAlloca(*APC.AI));
+    assert(APC.AI->isStaticAlloca());
+
+    ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+    Desc.LifetimeSize = Desc.Size;
+    if (const DILocation *FnLoc = EntryDebugLocation.get()) {
+      if (const DILocation *LifetimeLoc = APC.InsBefore->getDebugLoc().get()) {
+        if (LifetimeLoc->getFile() == FnLoc->getFile())
+          if (unsigned Line = LifetimeLoc->getLine())
+            Desc.Line = std::min(Desc.Line ? Desc.Line : Line, Line);
+      }
+    }
+  }
+
+  auto DescriptionString = ComputeASanStackFrameDescription(SVD);
+  LLVM_DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
+  uint64_t LocalStackSize = L.FrameSize;
+  bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
+                       LocalStackSize <= kMaxStackMallocSize;
+  bool DoDynamicAlloca = ClDynamicAllocaStack;
+  // Don't do dynamic alloca or stack malloc if:
+  // 1) There is inline asm: too often it makes assumptions on which registers
+  //    are available.
+  // 2) There is a returns_twice call (typically setjmp), which is
+  //    optimization-hostile, and doesn't play well with the indirect,
+  //    register-relative calculation of local variable addresses that this
+  //    pass introduces.
+  DoDynamicAlloca &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;
+  DoStackMalloc &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;
+
+  Value *StaticAlloca =
+      DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
+
+  Value *FakeStack;
+  Value *LocalStackBase;
+  Value *LocalStackBaseAlloca;
+  uint8_t DIExprFlags = DIExpression::ApplyOffset;
+
+  if (DoStackMalloc) {
+    LocalStackBaseAlloca =
+        IRB.CreateAlloca(IntptrTy, nullptr, "asan_local_stack_base");
+    // void *FakeStack = __asan_option_detect_stack_use_after_return
+    //     ? __asan_stack_malloc_N(LocalStackSize)
+    //     : nullptr;
+    // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
+    Constant *OptionDetectUseAfterReturn = F.getParent()->getOrInsertGlobal(
+        kAsanOptionDetectUseAfterReturn, IRB.getInt32Ty());
+    Value *UseAfterReturnIsEnabled = IRB.CreateICmpNE(
+        IRB.CreateLoad(IRB.getInt32Ty(), OptionDetectUseAfterReturn),
+        Constant::getNullValue(IRB.getInt32Ty()));
+    Instruction *Term =
+        SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false);
+    IRBuilder<> IRBIf(Term);
+    IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
+    StackMallocIdx = StackMallocSizeClass(LocalStackSize);
+    assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+    Value *FakeStackValue =
+        IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx],
+                         ConstantInt::get(IntptrTy, LocalStackSize));
+    IRB.SetInsertPoint(InsBefore);
+    IRB.SetCurrentDebugLocation(EntryDebugLocation);
+    FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
+                          ConstantInt::get(IntptrTy, 0));
+
+    Value *NoFakeStack =
+        IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
+    Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
+    IRBIf.SetInsertPoint(Term);
+    IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
+    Value *AllocaValue =
+        DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
+
+    IRB.SetInsertPoint(InsBefore);
+    IRB.SetCurrentDebugLocation(EntryDebugLocation);
+    LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
+    IRB.SetCurrentDebugLocation(EntryDebugLocation);
+    IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca);
+    DIExprFlags |= DIExpression::DerefBefore;
+  } else {
+    // void *FakeStack = nullptr;
+    // void *LocalStackBase = alloca(LocalStackSize);
+    FakeStack = ConstantInt::get(IntptrTy, 0);
+    LocalStackBase =
+        DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
+    LocalStackBaseAlloca = LocalStackBase;
+  }
+
+  // Replace Alloca instructions with base+offset.
+  for (const auto &Desc : SVD) {
+    AllocaInst *AI = Desc.AI;
+    replaceDbgDeclareForAlloca(AI, LocalStackBaseAlloca, DIB, DIExprFlags,
+                               Desc.Offset);
+    Value *NewAllocaPtr = IRB.CreateIntToPtr(
+        IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
+        AI->getType());
+    AI->replaceAllUsesWith(NewAllocaPtr);
+  }
+
+  // The left-most redzone has enough space for at least 4 pointers.
+  // Write the Magic value to redzone[0].
+  Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
+  IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
+                  BasePlus0);
+  // Write the frame description constant to redzone[1].
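+  // (Together with the magic number stored above and the PC stored below,
+  // this fills the three header words of the left redzone: frame magic,
+  // frame description string, and function address.)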
+ Value *BasePlus1 = IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, + ConstantInt::get(IntptrTy, ASan.LongSize / 8)), + IntptrPtrTy); + GlobalVariable *StackDescriptionGlobal = + createPrivateGlobalForString(*F.getParent(), DescriptionString, + /*AllowMerging*/ true, kAsanGenPrefix); + Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy); + IRB.CreateStore(Description, BasePlus1); + // Write the PC to redzone[2]. + Value *BasePlus2 = IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, + ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)), + IntptrPtrTy); + IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2); + + const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L); + + // Poison the stack red zones at the entry. + Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB); + // As mask we must use most poisoned case: red zones and after scope. + // As bytes we can use either the same or just red zones only. + copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase); + + if (!StaticAllocaPoisonCallVec.empty()) { + const auto &ShadowInScope = GetShadowBytes(SVD, L); + + // Poison static allocas near lifetime intrinsics. + for (const auto &APC : StaticAllocaPoisonCallVec) { + const ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI]; + assert(Desc.Offset % L.Granularity == 0); + size_t Begin = Desc.Offset / L.Granularity; + size_t End = Begin + (APC.Size + L.Granularity - 1) / L.Granularity; + + IRBuilder<> IRB(APC.InsBefore); + copyToShadow(ShadowAfterScope, + APC.DoPoison ? ShadowAfterScope : ShadowInScope, Begin, End, + IRB, ShadowBase); + } + } + + SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0); + SmallVector<uint8_t, 64> ShadowAfterReturn; + + // (Un)poison the stack before all ret instructions. + for (auto Ret : RetVec) { + IRBuilder<> IRBRet(Ret); + // Mark the current frame as retired. + IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic), + BasePlus0); + if (DoStackMalloc) { + assert(StackMallocIdx >= 0); + // if FakeStack != 0 // LocalStackBase == FakeStack + // // In use-after-return mode, poison the whole stack frame. + // if StackMallocIdx <= 4 + // // For small sizes inline the whole thing: + // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize); + // **SavedFlagPtr(FakeStack) = 0 + // else + // __asan_stack_free_N(FakeStack, LocalStackSize) + // else + // <This is not a fake stack; unpoison the redzones> + Value *Cmp = + IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy)); + Instruction *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm); + + IRBuilder<> IRBPoison(ThenTerm); + if (StackMallocIdx <= 4) { + int ClassSize = kMinStackMallocSize << StackMallocIdx; + ShadowAfterReturn.resize(ClassSize / L.Granularity, + kAsanStackUseAfterReturnMagic); + copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison, + ShadowBase); + Value *SavedFlagPtrPtr = IRBPoison.CreateAdd( + FakeStack, + ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8)); + Value *SavedFlagPtr = IRBPoison.CreateLoad( + IntptrTy, IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy)); + IRBPoison.CreateStore( + Constant::getNullValue(IRBPoison.getInt8Ty()), + IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy())); + } else { + // For larger frames call __asan_stack_free_*. 
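// --- Aside: a sketch of the stack-malloc size-class scheme the code above
// relies on, assuming kMinStackMallocSize is 64 bytes (the usual ASan
// default; an assumption, since the constant is defined elsewhere in this
// file). Class N covers frames up to 64 << N bytes, so the
// "StackMallocIdx <= 4" fast path above inlines frames up to 1024 bytes.
#include <cstdint>

static const uint64_t kMinStackMallocSizeSketch = 1ULL << 6; // 64 bytes
static int stackMallocSizeClassSketch(uint64_t LocalStackSize) {
  int Cls = 0;
  while ((kMinStackMallocSizeSketch << Cls) < LocalStackSize)
    ++Cls; // smallest class whose capacity covers the frame
  return Cls;
}
// ---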
+ IRBPoison.CreateCall( + AsanStackFreeFunc[StackMallocIdx], + {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)}); + } + + IRBuilder<> IRBElse(ElseTerm); + copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase); + } else { + copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase); + } + } + + // We are done. Remove the old unused alloca instructions. + for (auto AI : AllocaVec) AI->eraseFromParent(); +} + +void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, + IRBuilder<> &IRB, bool DoPoison) { + // For now just insert the call to ASan runtime. + Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy); + Value *SizeArg = ConstantInt::get(IntptrTy, Size); + IRB.CreateCall( + DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc, + {AddrArg, SizeArg}); +} + +// Handling llvm.lifetime intrinsics for a given %alloca: +// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca. +// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect +// invalid accesses) and unpoison it for llvm.lifetime.start (the memory +// could be poisoned by previous llvm.lifetime.end instruction, as the +// variable may go in and out of scope several times, e.g. in loops). +// (3) if we poisoned at least one %alloca in a function, +// unpoison the whole stack frame at function exit. +void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) { + IRBuilder<> IRB(AI); + + const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment()); + const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1; + + Value *Zero = Constant::getNullValue(IntptrTy); + Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize); + Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask); + + // Since we need to extend alloca with additional memory to locate + // redzones, and OldSize is number of allocated blocks with + // ElementSize size, get allocated memory size in bytes by + // OldSize * ElementSize. + const unsigned ElementSize = + F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType()); + Value *OldSize = + IRB.CreateMul(IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false), + ConstantInt::get(IntptrTy, ElementSize)); + + // PartialSize = OldSize % 32 + Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask); + + // Misalign = kAllocaRzSize - PartialSize; + Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize); + + // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0; + Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize); + Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero); + + // AdditionalChunkSize = Align + PartialPadding + kAllocaRzSize + // Align is added to locate left redzone, PartialPadding for possible + // partial redzone and kAllocaRzSize for right redzone respectively. + Value *AdditionalChunkSize = IRB.CreateAdd( + ConstantInt::get(IntptrTy, Align + kAllocaRzSize), PartialPadding); + + Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize); + + // Insert new alloca with new NewSize and Align params. + AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize); + NewAlloca->setAlignment(MaybeAlign(Align)); + + // NewAddress = Address + Align + Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy), + ConstantInt::get(IntptrTy, Align)); + + // Insert __asan_alloca_poison call for new created alloca. + IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize}); + + // Store the last alloca's address to DynamicAllocaLayout. 
We'll need this + // for unpoisoning stuff. + IRB.CreateStore(IRB.CreatePtrToInt(NewAlloca, IntptrTy), DynamicAllocaLayout); + + Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType()); + + // Replace all uses of AddessReturnedByAlloca with NewAddressPtr. + AI->replaceAllUsesWith(NewAddressPtr); + + // We are done. Erase old alloca from parent. + AI->eraseFromParent(); +} + +// isSafeAccess returns true if Addr is always inbounds with respect to its +// base object. For example, it is a field access or an array access with +// constant inbounds index. +bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, + Value *Addr, uint64_t TypeSize) const { + SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr); + if (!ObjSizeVis.bothKnown(SizeOffset)) return false; + uint64_t Size = SizeOffset.first.getZExtValue(); + int64_t Offset = SizeOffset.second.getSExtValue(); + // Three checks are required to ensure safety: + // . Offset >= 0 (since the offset is given from the base ptr) + // . Size >= Offset (unsigned) + // . Size - Offset >= NeededSize (unsigned) + return Offset >= 0 && Size >= uint64_t(Offset) && + Size - uint64_t(Offset) >= TypeSize / 8; +} diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp new file mode 100644 index 000000000000..ae34be986537 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -0,0 +1,248 @@ +//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/BoundsChecking.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdint> +#include <vector> + +using namespace llvm; + +#define DEBUG_TYPE "bounds-checking" + +static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap", + cl::desc("Use one trap block per function")); + +STATISTIC(ChecksAdded, "Bounds checks added"); +STATISTIC(ChecksSkipped, "Bounds checks skipped"); +STATISTIC(ChecksUnable, "Bounds checks unable to add"); + +using BuilderTy = IRBuilder<TargetFolder>; + +/// Gets the conditions under which memory accessing instructions will overflow. +/// +/// \p Ptr is the pointer that will be read/written, and \p InstVal is either +/// the result from the load or the value being stored. It is used to determine +/// the size of memory block that is touched. +/// +/// Returns the condition under which the access will overflow. 
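// --- Aside: the three safety checks shared by isSafeAccess above and
// getBoundsCheckCond below, restated over plain integers as a standalone
// sketch. All three comparisons are needed: Offset may be negative, and the
// remaining two are unsigned once it is known non-negative.
#include <cstdint>

bool isInBoundsSketch(int64_t Offset, uint64_t ObjectSize,
                      uint64_t NeededSize) {
  return Offset >= 0 &&                               // not before the object
         ObjectSize >= uint64_t(Offset) &&            // offset inside object
         ObjectSize - uint64_t(Offset) >= NeededSize; // room for the access
}
// ---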
+static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
+                                 const DataLayout &DL, TargetLibraryInfo &TLI,
+                                 ObjectSizeOffsetEvaluator &ObjSizeEval,
+                                 BuilderTy &IRB, ScalarEvolution &SE) {
+  uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
+  LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+                    << " bytes\n");
+
+  SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);
+
+  if (!ObjSizeEval.bothKnown(SizeOffset)) {
+    ++ChecksUnable;
+    return nullptr;
+  }
+
+  Value *Size = SizeOffset.first;
+  Value *Offset = SizeOffset.second;
+  ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
+
+  Type *IntTy = DL.getIntPtrType(Ptr->getType());
+  Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+
+  auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size));
+  auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset));
+  auto NeededSizeRange = SE.getUnsignedRange(SE.getSCEV(NeededSizeVal));
+
+  // three checks are required to ensure safety:
+  // . Offset >= 0 (since the offset is given from the base ptr)
+  // . Size >= Offset (unsigned)
+  // . Size - Offset >= NeededSize (unsigned)
+  //
+  // optimization: if Size >= 0 (signed), skip 1st check
+  // FIXME: add NSW/NUW here? -- we don't care if the subtraction overflows
+  Value *ObjSize = IRB.CreateSub(Size, Offset);
+  Value *Cmp2 = SizeRange.getUnsignedMin().uge(OffsetRange.getUnsignedMax())
+                    ? ConstantInt::getFalse(Ptr->getContext())
+                    : IRB.CreateICmpULT(Size, Offset);
+  Value *Cmp3 = SizeRange.sub(OffsetRange)
+                        .getUnsignedMin()
+                        .uge(NeededSizeRange.getUnsignedMax())
+                    ? ConstantInt::getFalse(Ptr->getContext())
+                    : IRB.CreateICmpULT(ObjSize, NeededSizeVal);
+  Value *Or = IRB.CreateOr(Cmp2, Cmp3);
+  if ((!SizeCI || SizeCI->getValue().slt(0)) &&
+      !SizeRange.getSignedMin().isNonNegative()) {
+    Value *Cmp1 = IRB.CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
+    Or = IRB.CreateOr(Cmp1, Or);
+  }
+
+  return Or;
+}
+
+/// Adds run-time bounds checks to memory accessing instructions.
+///
+/// \p Or is the condition that should guard the trap.
+///
+/// \p GetTrapBB is a callable that returns the trap BB to use on failure.
+template <typename GetTrapBBT>
+static void insertBoundsCheck(Value *Or, BuilderTy IRB, GetTrapBBT GetTrapBB) {
+  // check if the comparison is always false
+  ConstantInt *C = dyn_cast_or_null<ConstantInt>(Or);
+  if (C) {
+    ++ChecksSkipped;
+    // If the condition is a constant zero, the access can never overflow;
+    // there is nothing to do.
+    if (!C->getZExtValue())
+      return;
+  }
+  ++ChecksAdded;
+
+  BasicBlock::iterator SplitI = IRB.GetInsertPoint();
+  BasicBlock *OldBB = SplitI->getParent();
+  BasicBlock *Cont = OldBB->splitBasicBlock(SplitI);
+  OldBB->getTerminator()->eraseFromParent();
+
+  if (C) {
+    // The condition folded to a constant non-zero value, i.e. the access
+    // always overflows, so branch to the trap unconditionally.
+    // FIXME: We should really handle this differently to bypass splitting
+    // the block.
+    BranchInst::Create(GetTrapBB(IRB), OldBB);
+    return;
+  }
+
+  // Create the conditional branch.
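// --- Aside: the SCEV folding above, restated in interval terms as a
// standalone sketch (the toy URange type is illustrative, not LLVM's
// ConstantRange). When the minimum possible Size is >= the maximum possible
// Offset, the check "Size < Offset" can never fire and is replaced by
// constant false.
#include <cstdint>

struct URangeSketch { uint64_t Min, Max; }; // inclusive unsigned range

// True when "a < b" is impossible for all a in A, b in B, i.e. the emitted
// comparison would fold to false (mirrors SizeRange vs. OffsetRange above).
bool provablyNotLessSketch(URangeSketch A, URangeSketch B) {
  return A.Min >= B.Max;
}
// ---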
+ BranchInst::Create(GetTrapBB(IRB), Cont, Or, OldBB); +} + +static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, + ScalarEvolution &SE) { + const DataLayout &DL = F.getParent()->getDataLayout(); + ObjectSizeOpts EvalOpts; + EvalOpts.RoundToAlign = true; + ObjectSizeOffsetEvaluator ObjSizeEval(DL, &TLI, F.getContext(), EvalOpts); + + // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory + // touching instructions + SmallVector<std::pair<Instruction *, Value *>, 4> TrapInfo; + for (Instruction &I : instructions(F)) { + Value *Or = nullptr; + BuilderTy IRB(I.getParent(), BasicBlock::iterator(&I), TargetFolder(DL)); + if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { + Or = getBoundsCheckCond(LI->getPointerOperand(), LI, DL, TLI, + ObjSizeEval, IRB, SE); + } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { + Or = getBoundsCheckCond(SI->getPointerOperand(), SI->getValueOperand(), + DL, TLI, ObjSizeEval, IRB, SE); + } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) { + Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getCompareOperand(), + DL, TLI, ObjSizeEval, IRB, SE); + } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) { + Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(), DL, + TLI, ObjSizeEval, IRB, SE); + } + if (Or) + TrapInfo.push_back(std::make_pair(&I, Or)); + } + + // Create a trapping basic block on demand using a callback. Depending on + // flags, this will either create a single block for the entire function or + // will create a fresh block every time it is called. + BasicBlock *TrapBB = nullptr; + auto GetTrapBB = [&TrapBB](BuilderTy &IRB) { + if (TrapBB && SingleTrapBB) + return TrapBB; + + Function *Fn = IRB.GetInsertBlock()->getParent(); + // FIXME: This debug location doesn't make a lot of sense in the + // `SingleTrapBB` case. + auto DebugLoc = IRB.getCurrentDebugLocation(); + IRBuilder<>::InsertPointGuard Guard(IRB); + TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); + IRB.SetInsertPoint(TrapBB); + + auto *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap); + CallInst *TrapCall = IRB.CreateCall(F, {}); + TrapCall->setDoesNotReturn(); + TrapCall->setDoesNotThrow(); + TrapCall->setDebugLoc(DebugLoc); + IRB.CreateUnreachable(); + + return TrapBB; + }; + + // Add the checks. 
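// --- Aside: the GetTrapBB callback above in miniature: a memoizing factory.
// Under -bounds-checking-single-trap one trap block is shared by the whole
// function; otherwise every failing check gets its own block (keeping debug
// locations distinct per check). The ToyBlock type is an illustrative
// stand-in, not an LLVM type.
#include <deque>

struct ToyBlockSketch { int Id; };
static std::deque<ToyBlockSketch> ToyBlocksSketch; // deque: stable addresses

ToyBlockSketch *getTrapBlockSketch(bool SingleTrapBB) {
  static ToyBlockSketch *Cached = nullptr;
  if (SingleTrapBB && Cached)
    return Cached;                      // reuse the one shared block
  ToyBlocksSketch.push_back({static_cast<int>(ToyBlocksSketch.size())});
  Cached = &ToyBlocksSketch.back();     // fresh block per failing check
  return Cached;
}
// ---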
+  for (const auto &Entry : TrapInfo) {
+    Instruction *Inst = Entry.first;
+    BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL));
+    insertBoundsCheck(Entry.second, IRB, GetTrapBB);
+  }
+
+  return !TrapInfo.empty();
+}
+
+PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &AM) {
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+
+  if (!addBoundsChecking(F, TLI, SE))
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
+
+namespace {
+struct BoundsCheckingLegacyPass : public FunctionPass {
+  static char ID;
+
+  BoundsCheckingLegacyPass() : FunctionPass(ID) {
+    initializeBoundsCheckingLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    return addBoundsChecking(F, TLI, SE);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+  }
+};
+} // namespace
+
+char BoundsCheckingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(BoundsCheckingLegacyPass, "bounds-checking",
+                      "Run-time bounds checking", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(BoundsCheckingLegacyPass, "bounds-checking",
+                    "Run-time bounds checking", false, false)
+
+FunctionPass *llvm::createBoundsCheckingLegacyPass() {
+  return new BoundsCheckingLegacyPass();
+}
diff --git a/llvm/lib/Transforms/Instrumentation/CFGMST.h b/llvm/lib/Transforms/Instrumentation/CFGMST.h
new file mode 100644
index 000000000000..8bb6f47c4846
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/CFGMST.h
@@ -0,0 +1,288 @@
+//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Union-find algorithm to compute a Minimum Spanning
+// Tree for a given CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "cfgmst"
+
+namespace llvm {
+
+/// A union-find based Minimum Spanning Tree for CFG
+///
+/// Implements a Union-find algorithm to compute a Minimum Spanning Tree
+/// for a given CFG.
+template <class Edge, class BBInfo> class CFGMST {
+public:
+  Function &F;
+
+  // Store all the edges in the CFG. It may contain some stale edges
+  // when Removed is set.
+  std::vector<std::unique_ptr<Edge>> AllEdges;
+
+  // This map records the auxiliary information for each BB.
+  DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos;
+
+  // Whether the function has an exit block with no successors.
+  // (For a function with an infinite loop, this block may be absent.)
+  bool ExitBlockFound = false;
+
+  // Find the root group of G and compress the path from G to the root.
+  BBInfo *findAndCompressGroup(BBInfo *G) {
+    if (G->Group != G)
+      G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group));
+    return static_cast<BBInfo *>(G->Group);
+  }
+
+  // Union BB1 and BB2 into the same group and return true.
+  // Returns false if BB1 and BB2 are already in the same group.
+  bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) {
+    BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1));
+    BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2));
+
+    if (BB1G == BB2G)
+      return false;
+
+    // Make the root of the smaller-rank tree a direct child of the root of
+    // the higher-rank tree.
+    if (BB1G->Rank < BB2G->Rank)
+      BB1G->Group = BB2G;
+    else {
+      BB2G->Group = BB1G;
+      // If the ranks are the same, increment the rank of the surviving root.
+      if (BB1G->Rank == BB2G->Rank)
+        BB1G->Rank++;
+    }
+    return true;
+  }
+
+  // Given BB, return the auxiliary information.
+  BBInfo &getBBInfo(const BasicBlock *BB) const {
+    auto It = BBInfos.find(BB);
+    assert(It->second.get() != nullptr);
+    return *It->second.get();
+  }
+
+  // Given BB, return the auxiliary information if it's available.
+  BBInfo *findBBInfo(const BasicBlock *BB) const {
+    auto It = BBInfos.find(BB);
+    if (It == BBInfos.end())
+      return nullptr;
+    return It->second.get();
+  }
+
+  // Traverse the CFG, find all the edges, and assign each a weight. Edges
+  // with large weights will be put into the MST first, so they are less
+  // likely to be instrumented.
+  void buildEdges() {
+    LLVM_DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
+
+    const BasicBlock *Entry = &(F.getEntryBlock());
+    uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
+    Edge *EntryIncoming = nullptr, *EntryOutgoing = nullptr,
+         *ExitOutgoing = nullptr, *ExitIncoming = nullptr;
+    uint64_t MaxEntryOutWeight = 0, MaxExitOutWeight = 0, MaxExitInWeight = 0;
+
+    // Add a fake edge to the entry.
+    EntryIncoming = &addEdge(nullptr, Entry, EntryWeight);
+    LLVM_DEBUG(dbgs() << "  Edge: from fake node to " << Entry->getName()
+                      << " w = " << EntryWeight << "\n");
+
+    // Special handling for single BB functions.
+    if (succ_empty(Entry)) {
+      addEdge(Entry, nullptr, EntryWeight);
+      return;
+    }
+
+    static const uint32_t CriticalEdgeMultiplier = 1000;
+
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+      Instruction *TI = BB->getTerminator();
+      uint64_t BBWeight =
+          (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
+      uint64_t Weight = 2;
+      if (int successors = TI->getNumSuccessors()) {
+        for (int i = 0; i != successors; ++i) {
+          BasicBlock *TargetBB = TI->getSuccessor(i);
+          bool Critical = isCriticalEdge(TI, i);
+          uint64_t scaleFactor = BBWeight;
+          if (Critical) {
+            if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
+              scaleFactor *= CriticalEdgeMultiplier;
+            else
+              scaleFactor = UINT64_MAX;
+          }
+          if (BPI != nullptr)
+            Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
+          auto *E = &addEdge(&*BB, TargetBB, Weight);
+          E->IsCritical = Critical;
+          LLVM_DEBUG(dbgs() << "  Edge: from " << BB->getName() << " to "
+                            << TargetBB->getName() << " w=" << Weight << "\n");
+
+          // Keep track of entry/exit edges:
+          if (&*BB == Entry) {
+            if (Weight > MaxEntryOutWeight) {
+              MaxEntryOutWeight = Weight;
+              EntryOutgoing = E;
+            }
+          }
+
+          auto *TargetTI = TargetBB->getTerminator();
+          if (TargetTI && !TargetTI->getNumSuccessors()) {
+            if (Weight > MaxExitInWeight) {
+              MaxExitInWeight = Weight;
+              ExitIncoming = E;
+            }
+          }
+        }
+      } else {
+        ExitBlockFound = true;
+        Edge *ExitO = &addEdge(&*BB, nullptr, BBWeight);
+        if (BBWeight > MaxExitOutWeight) {
+          MaxExitOutWeight = BBWeight;
+          ExitOutgoing = ExitO;
+        }
+        LLVM_DEBUG(dbgs() << "  Edge: from " << BB->getName() << " to fake exit"
+                          << " w = " << BBWeight << "\n");
+      }
+    }
+
+    // Entry/exit edge adjustment heuristic: prefer instrumenting the entry
+    // edge over the exit edge if possible. Those exit edges may never have a
+    // chance to be executed (for instance, if the program is an event
+    // handling loop) before the profile is asynchronously dumped.
+    //
+    // If EntryIncoming and ExitOutgoing have similar weights, make sure
+    // ExitOutgoing is selected as the min-edge. Similarly, if EntryOutgoing
+    // and ExitIncoming have similar weights, make sure ExitIncoming becomes
+    // the min-edge.
+    uint64_t EntryInWeight = EntryWeight;
+
+    if (EntryInWeight >= MaxExitOutWeight &&
+        EntryInWeight * 2 < MaxExitOutWeight * 3) {
+      EntryIncoming->Weight = MaxExitOutWeight;
+      ExitOutgoing->Weight = EntryInWeight + 1;
+    }
+
+    if (MaxEntryOutWeight >= MaxExitInWeight &&
+        MaxEntryOutWeight * 2 < MaxExitInWeight * 3) {
+      EntryOutgoing->Weight = MaxExitInWeight;
+      ExitIncoming->Weight = MaxEntryOutWeight + 1;
+    }
+  }
+
+  // Sort the CFG edges based on their weights.
+  void sortEdgesByWeight() {
+    llvm::stable_sort(AllEdges, [](const std::unique_ptr<Edge> &Edge1,
+                                   const std::unique_ptr<Edge> &Edge2) {
+      return Edge1->Weight > Edge2->Weight;
+    });
+  }
+
+  // Traverse all the edges and compute the Minimum Weight Spanning Tree
+  // using the union-find algorithm.
+  void computeMinimumSpanningTree() {
+    // First, put all the critical edges whose destination is a landing pad
+    // into the MST. This works around the insufficient support for splitting
+    // critical edges when the destination BB is a landing pad.
+    for (auto &Ei : AllEdges) {
+      if (Ei->Removed)
+        continue;
+      if (Ei->IsCritical) {
+        if (Ei->DestBB && Ei->DestBB->isLandingPad()) {
+          if (unionGroups(Ei->SrcBB, Ei->DestBB))
+            Ei->InMST = true;
+        }
+      }
+    }
+
+    for (auto &Ei : AllEdges) {
+      if (Ei->Removed)
+        continue;
+      // If we detect infinite loops, force instrumenting the entry edge:
+      if (!ExitBlockFound && Ei->SrcBB == nullptr)
+        continue;
+      if (unionGroups(Ei->SrcBB, Ei->DestBB))
+        Ei->InMST = true;
+    }
+  }
+
+  // Dump the debug information about the instrumentation.
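// --- Aside: the two algorithms above (findAndCompressGroup/unionGroups and
// computeMinimumSpanningTree) as a standalone sketch over integer node ids,
// ignoring the landing-pad special case. Sorting heaviest-first puts the
// hottest edges into the MST, and since only non-MST edges receive counters,
// instrumentation is pushed onto the coldest edges.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

struct DSUSketch {                 // union-find with the same two tricks:
  std::vector<int> Parent, Rank;   // path compression and union by rank
  explicit DSUSketch(int N) : Parent(N), Rank(N, 0) {
    std::iota(Parent.begin(), Parent.end(), 0);
  }
  int find(int X) { return Parent[X] == X ? X : Parent[X] = find(Parent[X]); }
  bool unite(int A, int B) {       // false if already in the same group
    A = find(A); B = find(B);
    if (A == B) return false;
    if (Rank[A] < Rank[B]) std::swap(A, B);
    Parent[B] = A;
    if (Rank[A] == Rank[B]) ++Rank[A];
    return true;
  }
};

struct ToyEdgeSketch { int Src, Dst; uint64_t Weight; bool InMST = false; };

void buildMSTSketch(std::vector<ToyEdgeSketch> &Edges, int NumNodes) {
  std::stable_sort(Edges.begin(), Edges.end(),
                   [](const ToyEdgeSketch &A, const ToyEdgeSketch &B) {
                     return A.Weight > B.Weight; // heaviest first
                   });
  DSUSketch Groups(NumNodes);
  for (ToyEdgeSketch &E : Edges)
    E.InMST = Groups.unite(E.Src, E.Dst); // joins two groups -> in MST
}
// ---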
+ void dumpEdges(raw_ostream &OS, const Twine &Message) const { + if (!Message.str().empty()) + OS << Message << "\n"; + OS << " Number of Basic Blocks: " << BBInfos.size() << "\n"; + for (auto &BI : BBInfos) { + const BasicBlock *BB = BI.first; + OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " " + << BI.second->infoString() << "\n"; + } + + OS << " Number of Edges: " << AllEdges.size() + << " (*: Instrument, C: CriticalEdge, -: Removed)\n"; + uint32_t Count = 0; + for (auto &EI : AllEdges) + OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->" + << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n"; + } + + // Add an edge to AllEdges with weight W. + Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) { + uint32_t Index = BBInfos.size(); + auto Iter = BBInfos.end(); + bool Inserted; + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); + if (Inserted) { + // Newly inserted, update the real info. + Iter->second = std::move(std::make_unique<BBInfo>(Index)); + Index++; + } + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); + if (Inserted) + // Newly inserted, update the real info. + Iter->second = std::move(std::make_unique<BBInfo>(Index)); + AllEdges.emplace_back(new Edge(Src, Dest, W)); + return *AllEdges.back(); + } + + BranchProbabilityInfo *BPI; + BlockFrequencyInfo *BFI; + +public: + CFGMST(Function &Func, BranchProbabilityInfo *BPI_ = nullptr, + BlockFrequencyInfo *BFI_ = nullptr) + : F(Func), BPI(BPI_), BFI(BFI_) { + buildEdges(); + sortEdgesByWeight(); + computeMinimumSpanningTree(); + } +}; + +} // end namespace llvm + +#undef DEBUG_TYPE // "cfgmst" + +#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp new file mode 100644 index 000000000000..358abab3cceb --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp @@ -0,0 +1,98 @@ +//===-- CGProfile.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/CGProfile.h" + +#include "llvm/ADT/MapVector.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PassManager.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/Instrumentation.h" + +#include <array> + +using namespace llvm; + +PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) { + MapVector<std::pair<Function *, Function *>, uint64_t> Counts; + FunctionAnalysisManager &FAM = + MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + InstrProfSymtab Symtab; + auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F, + Function *CalledF, uint64_t NewCount) { + if (!CalledF || !TTI.isLoweredToCall(CalledF)) + return; + uint64_t &Count = Counts[std::make_pair(F, CalledF)]; + Count = SaturatingAdd(Count, NewCount); + }; + // Ignore error here. Indirect calls are ignored if this fails. 
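// --- Aside: llvm::SaturatingAdd (from llvm/Support/MathExtras.h), as used by
// UpdateCounts above, restated for 64-bit counters as a standalone sketch:
// on overflow the sum clamps to the maximum instead of wrapping, so a hot
// call edge can never wrap around to a small count.
#include <cstdint>
#include <limits>

uint64_t saturatingAddSketch(uint64_t A, uint64_t B) {
  uint64_t R = A + B;                                  // may wrap
  return R < A ? std::numeric_limits<uint64_t>::max() : R;
}
// ---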
+ (void)(bool)Symtab.create(M); + for (auto &F : M) { + if (F.isDeclaration()) + continue; + auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); + if (BFI.getEntryFreq() == 0) + continue; + TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); + for (auto &BB : F) { + Optional<uint64_t> BBCount = BFI.getBlockProfileCount(&BB); + if (!BBCount) + continue; + for (auto &I : BB) { + CallSite CS(&I); + if (!CS) + continue; + if (CS.isIndirectCall()) { + InstrProfValueData ValueData[8]; + uint32_t ActualNumValueData; + uint64_t TotalC; + if (!getValueProfDataFromInst(*CS.getInstruction(), + IPVK_IndirectCallTarget, 8, ValueData, + ActualNumValueData, TotalC)) + continue; + for (const auto &VD : + ArrayRef<InstrProfValueData>(ValueData, ActualNumValueData)) { + UpdateCounts(TTI, &F, Symtab.getFunction(VD.Value), VD.Count); + } + continue; + } + UpdateCounts(TTI, &F, CS.getCalledFunction(), *BBCount); + } + } + } + + addModuleFlags(M, Counts); + + return PreservedAnalyses::all(); +} + +void CGProfilePass::addModuleFlags( + Module &M, + MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) const { + if (Counts.empty()) + return; + + LLVMContext &Context = M.getContext(); + MDBuilder MDB(Context); + std::vector<Metadata *> Nodes; + + for (auto E : Counts) { + Metadata *Vals[] = {ValueAsMetadata::get(E.first.first), + ValueAsMetadata::get(E.first.second), + MDB.createConstant(ConstantInt::get( + Type::getInt64Ty(Context), E.second))}; + Nodes.push_back(MDNode::get(Context, Vals)); + } + + M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes)); +} diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp new file mode 100644 index 000000000000..55c64fa4b727 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -0,0 +1,2110 @@ +//===-- ControlHeightReduction.cpp - Control Height Reduction -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass merges conditional blocks of code and reduces the number of +// conditional branches in the hot paths based on profiles. 
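// --- Aside: the transformation this pass performs, sketched at the C++
// source level (illustration only; the pass works on IR regions and profile
// metadata, not on source code). Two independent branches that are each
// almost always taken become one merged check guarding a branch-free hot
// path, with the original branches kept on the cold path.
int chrShapeSketch(bool C1, bool C2, int A, int B) {
  int R = 0;
  if (C1 & C2) {       // merged CHR condition: a single hot-path test
    R += A;            // first conditional block, now unconditional
    R += B;            // second conditional block, now unconditional
  } else {             // cold path: the original control flow, cloned
    if (C1) R += A;
    if (C2) R += B;
  }
  return R;
}
// ---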
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionIterator.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +#include <set> +#include <sstream> + +using namespace llvm; + +#define DEBUG_TYPE "chr" + +#define CHR_DEBUG(X) LLVM_DEBUG(X) + +static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden, + cl::desc("Apply CHR for all functions")); + +static cl::opt<double> CHRBiasThreshold( + "chr-bias-threshold", cl::init(0.99), cl::Hidden, + cl::desc("CHR considers a branch bias greater than this ratio as biased")); + +static cl::opt<unsigned> CHRMergeThreshold( + "chr-merge-threshold", cl::init(2), cl::Hidden, + cl::desc("CHR merges a group of N branches/selects where N >= this value")); + +static cl::opt<std::string> CHRModuleList( + "chr-module-list", cl::init(""), cl::Hidden, + cl::desc("Specify file to retrieve the list of modules to apply CHR to")); + +static cl::opt<std::string> CHRFunctionList( + "chr-function-list", cl::init(""), cl::Hidden, + cl::desc("Specify file to retrieve the list of functions to apply CHR to")); + +static StringSet<> CHRModules; +static StringSet<> CHRFunctions; + +static void parseCHRFilterFiles() { + if (!CHRModuleList.empty()) { + auto FileOrErr = MemoryBuffer::getFile(CHRModuleList); + if (!FileOrErr) { + errs() << "Error: Couldn't read the chr-module-list file " << CHRModuleList << "\n"; + std::exit(1); + } + StringRef Buf = FileOrErr->get()->getBuffer(); + SmallVector<StringRef, 0> Lines; + Buf.split(Lines, '\n'); + for (StringRef Line : Lines) { + Line = Line.trim(); + if (!Line.empty()) + CHRModules.insert(Line); + } + } + if (!CHRFunctionList.empty()) { + auto FileOrErr = MemoryBuffer::getFile(CHRFunctionList); + if (!FileOrErr) { + errs() << "Error: Couldn't read the chr-function-list file " << CHRFunctionList << "\n"; + std::exit(1); + } + StringRef Buf = FileOrErr->get()->getBuffer(); + SmallVector<StringRef, 0> Lines; + Buf.split(Lines, '\n'); + for (StringRef Line : Lines) { + Line = Line.trim(); + if (!Line.empty()) + CHRFunctions.insert(Line); + } + } +} + +namespace { +class ControlHeightReductionLegacyPass : public FunctionPass { +public: + static char ID; + + ControlHeightReductionLegacyPass() : FunctionPass(ID) { + initializeControlHeightReductionLegacyPassPass( + *PassRegistry::getPassRegistry()); + parseCHRFilterFiles(); + } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<BlockFrequencyInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<RegionInfoPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} 
// end anonymous namespace + +char ControlHeightReductionLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ControlHeightReductionLegacyPass, + "chr", + "Reduce control height in the hot paths", + false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) +INITIALIZE_PASS_END(ControlHeightReductionLegacyPass, + "chr", + "Reduce control height in the hot paths", + false, false) + +FunctionPass *llvm::createControlHeightReductionLegacyPass() { + return new ControlHeightReductionLegacyPass(); +} + +namespace { + +struct CHRStats { + CHRStats() : NumBranches(0), NumBranchesDelta(0), + WeightedNumBranchesDelta(0) {} + void print(raw_ostream &OS) const { + OS << "CHRStats: NumBranches " << NumBranches + << " NumBranchesDelta " << NumBranchesDelta + << " WeightedNumBranchesDelta " << WeightedNumBranchesDelta; + } + uint64_t NumBranches; // The original number of conditional branches / + // selects + uint64_t NumBranchesDelta; // The decrease of the number of conditional + // branches / selects in the hot paths due to CHR. + uint64_t WeightedNumBranchesDelta; // NumBranchesDelta weighted by the profile + // count at the scope entry. +}; + +// RegInfo - some properties of a Region. +struct RegInfo { + RegInfo() : R(nullptr), HasBranch(false) {} + RegInfo(Region *RegionIn) : R(RegionIn), HasBranch(false) {} + Region *R; + bool HasBranch; + SmallVector<SelectInst *, 8> Selects; +}; + +typedef DenseMap<Region *, DenseSet<Instruction *>> HoistStopMapTy; + +// CHRScope - a sequence of regions to CHR together. It corresponds to a +// sequence of conditional blocks. It can have subscopes which correspond to +// nested conditional blocks. Nested CHRScopes form a tree. +class CHRScope { + public: + CHRScope(RegInfo RI) : BranchInsertPoint(nullptr) { + assert(RI.R && "Null RegionIn"); + RegInfos.push_back(RI); + } + + Region *getParentRegion() { + assert(RegInfos.size() > 0 && "Empty CHRScope"); + Region *Parent = RegInfos[0].R->getParent(); + assert(Parent && "Unexpected to call this on the top-level region"); + return Parent; + } + + BasicBlock *getEntryBlock() { + assert(RegInfos.size() > 0 && "Empty CHRScope"); + return RegInfos.front().R->getEntry(); + } + + BasicBlock *getExitBlock() { + assert(RegInfos.size() > 0 && "Empty CHRScope"); + return RegInfos.back().R->getExit(); + } + + bool appendable(CHRScope *Next) { + // The next scope is appendable only if this scope is directly connected to + // it (which implies it post-dominates this scope) and this scope dominates + // it (no edge to the next scope outside this scope). + BasicBlock *NextEntry = Next->getEntryBlock(); + if (getExitBlock() != NextEntry) + // Not directly connected. + return false; + Region *LastRegion = RegInfos.back().R; + for (BasicBlock *Pred : predecessors(NextEntry)) + if (!LastRegion->contains(Pred)) + // There's an edge going into the entry of the next scope from outside + // of this scope. 
+ return false; + return true; + } + + void append(CHRScope *Next) { + assert(RegInfos.size() > 0 && "Empty CHRScope"); + assert(Next->RegInfos.size() > 0 && "Empty CHRScope"); + assert(getParentRegion() == Next->getParentRegion() && + "Must be siblings"); + assert(getExitBlock() == Next->getEntryBlock() && + "Must be adjacent"); + for (RegInfo &RI : Next->RegInfos) + RegInfos.push_back(RI); + for (CHRScope *Sub : Next->Subs) + Subs.push_back(Sub); + } + + void addSub(CHRScope *SubIn) { +#ifndef NDEBUG + bool IsChild = false; + for (RegInfo &RI : RegInfos) + if (RI.R == SubIn->getParentRegion()) { + IsChild = true; + break; + } + assert(IsChild && "Must be a child"); +#endif + Subs.push_back(SubIn); + } + + // Split this scope at the boundary region into two, which will belong to the + // tail and returns the tail. + CHRScope *split(Region *Boundary) { + assert(Boundary && "Boundary null"); + assert(RegInfos.begin()->R != Boundary && + "Can't be split at beginning"); + auto BoundaryIt = std::find_if(RegInfos.begin(), RegInfos.end(), + [&Boundary](const RegInfo& RI) { + return Boundary == RI.R; + }); + if (BoundaryIt == RegInfos.end()) + return nullptr; + SmallVector<RegInfo, 8> TailRegInfos; + SmallVector<CHRScope *, 8> TailSubs; + TailRegInfos.insert(TailRegInfos.begin(), BoundaryIt, RegInfos.end()); + RegInfos.resize(BoundaryIt - RegInfos.begin()); + DenseSet<Region *> TailRegionSet; + for (RegInfo &RI : TailRegInfos) + TailRegionSet.insert(RI.R); + for (auto It = Subs.begin(); It != Subs.end(); ) { + CHRScope *Sub = *It; + assert(Sub && "null Sub"); + Region *Parent = Sub->getParentRegion(); + if (TailRegionSet.count(Parent)) { + TailSubs.push_back(Sub); + It = Subs.erase(It); + } else { + assert(std::find_if(RegInfos.begin(), RegInfos.end(), + [&Parent](const RegInfo& RI) { + return Parent == RI.R; + }) != RegInfos.end() && + "Must be in head"); + ++It; + } + } + assert(HoistStopMap.empty() && "MapHoistStops must be empty"); + return new CHRScope(TailRegInfos, TailSubs); + } + + bool contains(Instruction *I) const { + BasicBlock *Parent = I->getParent(); + for (const RegInfo &RI : RegInfos) + if (RI.R->contains(Parent)) + return true; + return false; + } + + void print(raw_ostream &OS) const; + + SmallVector<RegInfo, 8> RegInfos; // Regions that belong to this scope + SmallVector<CHRScope *, 8> Subs; // Subscopes. + + // The instruction at which to insert the CHR conditional branch (and hoist + // the dependent condition values). + Instruction *BranchInsertPoint; + + // True-biased and false-biased regions (conditional blocks), + // respectively. Used only for the outermost scope and includes regions in + // subscopes. The rest are unbiased. + DenseSet<Region *> TrueBiasedRegions; + DenseSet<Region *> FalseBiasedRegions; + // Among the biased regions, the regions that get CHRed. + SmallVector<RegInfo, 8> CHRRegions; + + // True-biased and false-biased selects, respectively. Used only for the + // outermost scope and includes ones in subscopes. + DenseSet<SelectInst *> TrueBiasedSelects; + DenseSet<SelectInst *> FalseBiasedSelects; + + // Map from one of the above regions to the instructions to stop + // hoisting instructions at through use-def chains. 
+ HoistStopMapTy HoistStopMap; + + private: + CHRScope(SmallVector<RegInfo, 8> &RegInfosIn, + SmallVector<CHRScope *, 8> &SubsIn) + : RegInfos(RegInfosIn), Subs(SubsIn), BranchInsertPoint(nullptr) {} +}; + +class CHR { + public: + CHR(Function &Fin, BlockFrequencyInfo &BFIin, DominatorTree &DTin, + ProfileSummaryInfo &PSIin, RegionInfo &RIin, + OptimizationRemarkEmitter &OREin) + : F(Fin), BFI(BFIin), DT(DTin), PSI(PSIin), RI(RIin), ORE(OREin) {} + + ~CHR() { + for (CHRScope *Scope : Scopes) { + delete Scope; + } + } + + bool run(); + + private: + // See the comments in CHR::run() for the high level flow of the algorithm and + // what the following functions do. + + void findScopes(SmallVectorImpl<CHRScope *> &Output) { + Region *R = RI.getTopLevelRegion(); + CHRScope *Scope = findScopes(R, nullptr, nullptr, Output); + if (Scope) { + Output.push_back(Scope); + } + } + CHRScope *findScopes(Region *R, Region *NextRegion, Region *ParentRegion, + SmallVectorImpl<CHRScope *> &Scopes); + CHRScope *findScope(Region *R); + void checkScopeHoistable(CHRScope *Scope); + + void splitScopes(SmallVectorImpl<CHRScope *> &Input, + SmallVectorImpl<CHRScope *> &Output); + SmallVector<CHRScope *, 8> splitScope(CHRScope *Scope, + CHRScope *Outer, + DenseSet<Value *> *OuterConditionValues, + Instruction *OuterInsertPoint, + SmallVectorImpl<CHRScope *> &Output, + DenseSet<Instruction *> &Unhoistables); + + void classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes); + void classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope); + + void filterScopes(SmallVectorImpl<CHRScope *> &Input, + SmallVectorImpl<CHRScope *> &Output); + + void setCHRRegions(SmallVectorImpl<CHRScope *> &Input, + SmallVectorImpl<CHRScope *> &Output); + void setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope); + + void sortScopes(SmallVectorImpl<CHRScope *> &Input, + SmallVectorImpl<CHRScope *> &Output); + + void transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes); + void transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs); + void cloneScopeBlocks(CHRScope *Scope, + BasicBlock *PreEntryBlock, + BasicBlock *ExitBlock, + Region *LastRegion, + ValueToValueMapTy &VMap); + BranchInst *createMergedBranch(BasicBlock *PreEntryBlock, + BasicBlock *EntryBlock, + BasicBlock *NewEntryBlock, + ValueToValueMapTy &VMap); + void fixupBranchesAndSelects(CHRScope *Scope, + BasicBlock *PreEntryBlock, + BranchInst *MergedBR, + uint64_t ProfileCount); + void fixupBranch(Region *R, + CHRScope *Scope, + IRBuilder<> &IRB, + Value *&MergedCondition, BranchProbability &CHRBranchBias); + void fixupSelect(SelectInst* SI, + CHRScope *Scope, + IRBuilder<> &IRB, + Value *&MergedCondition, BranchProbability &CHRBranchBias); + void addToMergedCondition(bool IsTrueBiased, Value *Cond, + Instruction *BranchOrSelect, + CHRScope *Scope, + IRBuilder<> &IRB, + Value *&MergedCondition); + + Function &F; + BlockFrequencyInfo &BFI; + DominatorTree &DT; + ProfileSummaryInfo &PSI; + RegionInfo &RI; + OptimizationRemarkEmitter &ORE; + CHRStats Stats; + + // All the true-biased regions in the function + DenseSet<Region *> TrueBiasedRegionsGlobal; + // All the false-biased regions in the function + DenseSet<Region *> FalseBiasedRegionsGlobal; + // All the true-biased selects in the function + DenseSet<SelectInst *> TrueBiasedSelectsGlobal; + // All the false-biased selects in the function + DenseSet<SelectInst *> FalseBiasedSelectsGlobal; + // A map from biased regions to their branch bias + DenseMap<Region *, BranchProbability> BranchBiasMap; + // A 
map from biased selects to their branch bias + DenseMap<SelectInst *, BranchProbability> SelectBiasMap; + // All the scopes. + DenseSet<CHRScope *> Scopes; +}; + +} // end anonymous namespace + +static inline +raw_ostream LLVM_ATTRIBUTE_UNUSED &operator<<(raw_ostream &OS, + const CHRStats &Stats) { + Stats.print(OS); + return OS; +} + +static inline +raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) { + Scope.print(OS); + return OS; +} + +static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) { + if (ForceCHR) + return true; + + if (!CHRModuleList.empty() || !CHRFunctionList.empty()) { + if (CHRModules.count(F.getParent()->getName())) + return true; + return CHRFunctions.count(F.getName()); + } + + assert(PSI.hasProfileSummary() && "Empty PSI?"); + return PSI.isFunctionEntryHot(&F); +} + +static void LLVM_ATTRIBUTE_UNUSED dumpIR(Function &F, const char *Label, + CHRStats *Stats) { + StringRef FuncName = F.getName(); + StringRef ModuleName = F.getParent()->getName(); + (void)(FuncName); // Unused in release build. + (void)(ModuleName); // Unused in release build. + CHR_DEBUG(dbgs() << "CHR IR dump " << Label << " " << ModuleName << " " + << FuncName); + if (Stats) + CHR_DEBUG(dbgs() << " " << *Stats); + CHR_DEBUG(dbgs() << "\n"); + CHR_DEBUG(F.dump()); +} + +void CHRScope::print(raw_ostream &OS) const { + assert(RegInfos.size() > 0 && "Empty CHRScope"); + OS << "CHRScope["; + OS << RegInfos.size() << ", Regions["; + for (const RegInfo &RI : RegInfos) { + OS << RI.R->getNameStr(); + if (RI.HasBranch) + OS << " B"; + if (RI.Selects.size() > 0) + OS << " S" << RI.Selects.size(); + OS << ", "; + } + if (RegInfos[0].R->getParent()) { + OS << "], Parent " << RegInfos[0].R->getParent()->getNameStr(); + } else { + // top level region + OS << "]"; + } + OS << ", Subs["; + for (CHRScope *Sub : Subs) { + OS << *Sub << ", "; + } + OS << "]]"; +} + +// Return true if the given instruction type can be hoisted by CHR. +static bool isHoistableInstructionType(Instruction *I) { + return isa<BinaryOperator>(I) || isa<CastInst>(I) || isa<SelectInst>(I) || + isa<GetElementPtrInst>(I) || isa<CmpInst>(I) || + isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || + isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) || + isa<InsertValueInst>(I); +} + +// Return true if the given instruction can be hoisted by CHR. +static bool isHoistable(Instruction *I, DominatorTree &DT) { + if (!isHoistableInstructionType(I)) + return false; + return isSafeToSpeculativelyExecute(I, nullptr, &DT); +} + +// Recursively traverse the use-def chains of the given value and return a set +// of the unhoistable base values defined within the scope (excluding the +// first-region entry block) or the (hoistable or unhoistable) base values that +// are defined outside (including the first-region entry block) of the +// scope. The returned set doesn't include constants. +static std::set<Value *> getBaseValues( + Value *V, DominatorTree &DT, + DenseMap<Value *, std::set<Value *>> &Visited) { + if (Visited.count(V)) { + return Visited[V]; + } + std::set<Value *> Result; + if (auto *I = dyn_cast<Instruction>(V)) { + // We don't stop at a block that's not in the Scope because we would miss some + // instructions that are based on the same base values if we stop there. + if (!isHoistable(I, DT)) { + Result.insert(I); + Visited.insert(std::make_pair(V, Result)); + return Result; + } + // I is hoistable above the Scope. 
+    for (Value *Op : I->operands()) {
+      std::set<Value *> OpResult = getBaseValues(Op, DT, Visited);
+      Result.insert(OpResult.begin(), OpResult.end());
+    }
+    Visited.insert(std::make_pair(V, Result));
+    return Result;
+  }
+  if (isa<Argument>(V)) {
+    Result.insert(V);
+    Visited.insert(std::make_pair(V, Result));
+    return Result;
+  }
+  // We don't include others like constants because those won't lead to any
+  // chance of folding of conditions (e.g. two bit checks merged into one
+  // check) after CHR.
+  Visited.insert(std::make_pair(V, Result));
+  return Result; // empty
+}
+
+// Return true if V is already hoisted or can be hoisted (along with its
+// operands) above the insert point. When it returns true and HoistStops is
+// non-null, the instructions at which to stop hoisting along the use-def
+// chains are inserted into HoistStops.
+static bool
+checkHoistValue(Value *V, Instruction *InsertPoint, DominatorTree &DT,
+                DenseSet<Instruction *> &Unhoistables,
+                DenseSet<Instruction *> *HoistStops,
+                DenseMap<Instruction *, bool> &Visited) {
+  assert(InsertPoint && "Null InsertPoint");
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    if (Visited.count(I)) {
+      return Visited[I];
+    }
+    assert(DT.getNode(I->getParent()) && "DT must contain I's parent block");
+    assert(DT.getNode(InsertPoint->getParent()) && "DT must contain Destination");
+    if (Unhoistables.count(I)) {
+      // Don't hoist instructions that are marked as unhoistable.
+      Visited[I] = false;
+      return false;
+    }
+    if (DT.dominates(I, InsertPoint)) {
+      // We are already above the insert point. Stop here.
+      if (HoistStops)
+        HoistStops->insert(I);
+      Visited[I] = true;
+      return true;
+    }
+    // We aren't above the insert point; check if we can hoist I above it.
+    if (isHoistable(I, DT)) {
+      // Check operands first.
+      DenseSet<Instruction *> OpsHoistStops;
+      bool AllOpsHoisted = true;
+      for (Value *Op : I->operands()) {
+        if (!checkHoistValue(Op, InsertPoint, DT, Unhoistables, &OpsHoistStops,
+                             Visited)) {
+          AllOpsHoisted = false;
+          break;
+        }
+      }
+      if (AllOpsHoisted) {
+        CHR_DEBUG(dbgs() << "checkHoistValue " << *I << "\n");
+        if (HoistStops)
+          HoistStops->insert(OpsHoistStops.begin(), OpsHoistStops.end());
+        Visited[I] = true;
+        return true;
+      }
+    }
+    Visited[I] = false;
+    return false;
+  }
+  // Non-instructions are considered hoistable.
+  return true;
+}
+
+// Returns true and sets the true probability and false probability of an
+// MD_prof metadata if it's well-formed.
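// --- Aside: the shape of checkHoistValue above, reduced to a memoized DFS
// over a toy node graph as a standalone sketch: a value is hoistable if it
// already dominates the insert point, or if it is speculatable and all of
// its operands are hoistable.
#include <unordered_map>
#include <vector>

struct ToyValueSketch {
  std::vector<ToyValueSketch *> Ops;
  bool AboveInsertPoint = false; // already dominates the insert point
  bool Speculatable = false;     // isHoistable() in the real code
};

bool canHoistSketch(ToyValueSketch *V,
                    std::unordered_map<ToyValueSketch *, bool> &Visited) {
  auto It = Visited.find(V);
  if (It != Visited.end())
    return It->second;           // memoized result
  bool Result = false;
  if (V->AboveInsertPoint) {
    Result = true;               // nothing to move; this is a hoist stop
  } else if (V->Speculatable) {
    Result = true;               // hoistable only if every operand is
    for (ToyValueSketch *Op : V->Ops)
      Result = Result && canHoistSketch(Op, Visited);
  }
  return Visited[V] = Result;
}
// ---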
+static bool checkMDProf(MDNode *MD, BranchProbability &TrueProb, + BranchProbability &FalseProb) { + if (!MD) return false; + MDString *MDName = cast<MDString>(MD->getOperand(0)); + if (MDName->getString() != "branch_weights" || + MD->getNumOperands() != 3) + return false; + ConstantInt *TrueWeight = mdconst::extract<ConstantInt>(MD->getOperand(1)); + ConstantInt *FalseWeight = mdconst::extract<ConstantInt>(MD->getOperand(2)); + if (!TrueWeight || !FalseWeight) + return false; + uint64_t TrueWt = TrueWeight->getValue().getZExtValue(); + uint64_t FalseWt = FalseWeight->getValue().getZExtValue(); + uint64_t SumWt = TrueWt + FalseWt; + + assert(SumWt >= TrueWt && SumWt >= FalseWt && + "Overflow calculating branch probabilities."); + + TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt); + FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt); + return true; +} + +static BranchProbability getCHRBiasThreshold() { + return BranchProbability::getBranchProbability( + static_cast<uint64_t>(CHRBiasThreshold * 1000000), 1000000); +} + +// A helper for CheckBiasedBranch and CheckBiasedSelect. If TrueProb >= +// CHRBiasThreshold, put Key into TrueSet and return true. If FalseProb >= +// CHRBiasThreshold, put Key into FalseSet and return true. Otherwise, return +// false. +template <typename K, typename S, typename M> +static bool checkBias(K *Key, BranchProbability TrueProb, + BranchProbability FalseProb, S &TrueSet, S &FalseSet, + M &BiasMap) { + BranchProbability Threshold = getCHRBiasThreshold(); + if (TrueProb >= Threshold) { + TrueSet.insert(Key); + BiasMap[Key] = TrueProb; + return true; + } else if (FalseProb >= Threshold) { + FalseSet.insert(Key); + BiasMap[Key] = FalseProb; + return true; + } + return false; +} + +// Returns true and insert a region into the right biased set and the map if the +// branch of the region is biased. +static bool checkBiasedBranch(BranchInst *BI, Region *R, + DenseSet<Region *> &TrueBiasedRegionsGlobal, + DenseSet<Region *> &FalseBiasedRegionsGlobal, + DenseMap<Region *, BranchProbability> &BranchBiasMap) { + if (!BI->isConditional()) + return false; + BranchProbability ThenProb, ElseProb; + if (!checkMDProf(BI->getMetadata(LLVMContext::MD_prof), + ThenProb, ElseProb)) + return false; + BasicBlock *IfThen = BI->getSuccessor(0); + BasicBlock *IfElse = BI->getSuccessor(1); + assert((IfThen == R->getExit() || IfElse == R->getExit()) && + IfThen != IfElse && + "Invariant from findScopes"); + if (IfThen == R->getExit()) { + // Swap them so that IfThen/ThenProb means going into the conditional code + // and IfElse/ElseProb means skipping it. + std::swap(IfThen, IfElse); + std::swap(ThenProb, ElseProb); + } + CHR_DEBUG(dbgs() << "BI " << *BI << " "); + CHR_DEBUG(dbgs() << "ThenProb " << ThenProb << " "); + CHR_DEBUG(dbgs() << "ElseProb " << ElseProb << "\n"); + return checkBias(R, ThenProb, ElseProb, + TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal, + BranchBiasMap); +} + +// Returns true and insert a select into the right biased set and the map if the +// select is biased. 
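// --- Aside: the bias test from checkMDProf/checkBias above, restated over
// plain numbers as a standalone sketch. With !prof branch_weights
// {TrueWt, FalseWt}, a branch or select counts as biased when either
// direction's probability reaches the threshold (0.99 by default, per
// -chr-bias-threshold).
#include <cstdint>

bool isBiasedSketch(uint64_t TrueWt, uint64_t FalseWt,
                    double Threshold = 0.99) {
  uint64_t Sum = TrueWt + FalseWt; // checkMDProf asserts this cannot overflow
  if (Sum == 0)
    return false;
  double TrueProb = double(TrueWt) / double(Sum);
  return TrueProb >= Threshold || 1.0 - TrueProb >= Threshold;
}
// ---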
+static bool checkBiasedSelect( + SelectInst *SI, Region *R, + DenseSet<SelectInst *> &TrueBiasedSelectsGlobal, + DenseSet<SelectInst *> &FalseBiasedSelectsGlobal, + DenseMap<SelectInst *, BranchProbability> &SelectBiasMap) { + BranchProbability TrueProb, FalseProb; + if (!checkMDProf(SI->getMetadata(LLVMContext::MD_prof), + TrueProb, FalseProb)) + return false; + CHR_DEBUG(dbgs() << "SI " << *SI << " "); + CHR_DEBUG(dbgs() << "TrueProb " << TrueProb << " "); + CHR_DEBUG(dbgs() << "FalseProb " << FalseProb << "\n"); + return checkBias(SI, TrueProb, FalseProb, + TrueBiasedSelectsGlobal, FalseBiasedSelectsGlobal, + SelectBiasMap); +} + +// Returns the instruction at which to hoist the dependent condition values and +// insert the CHR branch for a region. This is the terminator branch in the +// entry block or the first select in the entry block, if any. +static Instruction* getBranchInsertPoint(RegInfo &RI) { + Region *R = RI.R; + BasicBlock *EntryBB = R->getEntry(); + // The hoist point is by default the terminator of the entry block, which is + // the same as the branch instruction if RI.HasBranch is true. + Instruction *HoistPoint = EntryBB->getTerminator(); + for (SelectInst *SI : RI.Selects) { + if (SI->getParent() == EntryBB) { + // Pick the first select in Selects in the entry block. Note Selects is + // sorted in the instruction order within a block (asserted below). + HoistPoint = SI; + break; + } + } + assert(HoistPoint && "Null HoistPoint"); +#ifndef NDEBUG + // Check that HoistPoint is the first one in Selects in the entry block, + // if any. + DenseSet<Instruction *> EntryBlockSelectSet; + for (SelectInst *SI : RI.Selects) { + if (SI->getParent() == EntryBB) { + EntryBlockSelectSet.insert(SI); + } + } + for (Instruction &I : *EntryBB) { + if (EntryBlockSelectSet.count(&I) > 0) { + assert(&I == HoistPoint && + "HoistPoint must be the first one in Selects"); + break; + } + } +#endif + return HoistPoint; +} + +// Find a CHR scope in the given region. +CHRScope * CHR::findScope(Region *R) { + CHRScope *Result = nullptr; + BasicBlock *Entry = R->getEntry(); + BasicBlock *Exit = R->getExit(); // null if top level. + assert(Entry && "Entry must not be null"); + assert((Exit == nullptr) == (R->isTopLevelRegion()) && + "Only top level region has a null exit"); + if (Entry) + CHR_DEBUG(dbgs() << "Entry " << Entry->getName() << "\n"); + else + CHR_DEBUG(dbgs() << "Entry null\n"); + if (Exit) + CHR_DEBUG(dbgs() << "Exit " << Exit->getName() << "\n"); + else + CHR_DEBUG(dbgs() << "Exit null\n"); + // Exclude cases where Entry is part of a subregion (hence it doesn't belong + // to this region). + bool EntryInSubregion = RI.getRegionFor(Entry) != R; + if (EntryInSubregion) + return nullptr; + // Exclude loops + for (BasicBlock *Pred : predecessors(Entry)) + if (R->contains(Pred)) + return nullptr; + if (Exit) { + // Try to find an if-then block (check if R is an if-then). + // if (cond) { + // ... 
+ // } + auto *BI = dyn_cast<BranchInst>(Entry->getTerminator()); + if (BI) + CHR_DEBUG(dbgs() << "BI.isConditional " << BI->isConditional() << "\n"); + else + CHR_DEBUG(dbgs() << "BI null\n"); + if (BI && BI->isConditional()) { + BasicBlock *S0 = BI->getSuccessor(0); + BasicBlock *S1 = BI->getSuccessor(1); + CHR_DEBUG(dbgs() << "S0 " << S0->getName() << "\n"); + CHR_DEBUG(dbgs() << "S1 " << S1->getName() << "\n"); + if (S0 != S1 && (S0 == Exit || S1 == Exit)) { + RegInfo RI(R); + RI.HasBranch = checkBiasedBranch( + BI, R, TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal, + BranchBiasMap); + Result = new CHRScope(RI); + Scopes.insert(Result); + CHR_DEBUG(dbgs() << "Found a region with a branch\n"); + ++Stats.NumBranches; + if (!RI.HasBranch) { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "BranchNotBiased", BI) + << "Branch not biased"; + }); + } + } + } + } + { + // Try to look for selects in the direct child blocks (as opposed to in + // subregions) of R. + // ... + // if (..) { // Some subregion + // ... + // } + // if (..) { // Some subregion + // ... + // } + // ... + // a = cond ? b : c; + // ... + SmallVector<SelectInst *, 8> Selects; + for (RegionNode *E : R->elements()) { + if (E->isSubRegion()) + continue; + // This returns the basic block of E if E is a direct child of R (not a + // subregion.) + BasicBlock *BB = E->getEntry(); + // Need to push in the order to make it easier to find the first Select + // later. + for (Instruction &I : *BB) { + if (auto *SI = dyn_cast<SelectInst>(&I)) { + Selects.push_back(SI); + ++Stats.NumBranches; + } + } + } + if (Selects.size() > 0) { + auto AddSelects = [&](RegInfo &RI) { + for (auto *SI : Selects) + if (checkBiasedSelect(SI, RI.R, + TrueBiasedSelectsGlobal, + FalseBiasedSelectsGlobal, + SelectBiasMap)) + RI.Selects.push_back(SI); + else + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SelectNotBiased", SI) + << "Select not biased"; + }); + }; + if (!Result) { + CHR_DEBUG(dbgs() << "Found a select-only region\n"); + RegInfo RI(R); + AddSelects(RI); + Result = new CHRScope(RI); + Scopes.insert(Result); + } else { + CHR_DEBUG(dbgs() << "Found select(s) in a region with a branch\n"); + AddSelects(Result->RegInfos[0]); + } + } + } + + if (Result) { + checkScopeHoistable(Result); + } + return Result; +} + +// Check that any of the branch and the selects in the region could be +// hoisted above the the CHR branch insert point (the most dominating of +// them, either the branch (at the end of the first block) or the first +// select in the first block). If the branch can't be hoisted, drop the +// selects in the first blocks. +// +// For example, for the following scope/region with selects, we want to insert +// the merged branch right before the first select in the first/entry block by +// hoisting c1, c2, c3, and c4. +// +// // Branch insert point here. +// a = c1 ? b : c; // Select 1 +// d = c2 ? e : f; // Select 2 +// if (c3) { // Branch +// ... +// c4 = foo() // A call. +// g = c4 ? h : i; // Select 3 +// } +// +// But suppose we can't hoist c4 because it's dependent on the preceding +// call. Then, we drop Select 3. Furthermore, if we can't hoist c2, we also drop +// Select 2. If we can't hoist c3, we drop Selects 1 & 2. +void CHR::checkScopeHoistable(CHRScope *Scope) { + RegInfo &RI = Scope->RegInfos[0]; + Region *R = RI.R; + BasicBlock *EntryBB = R->getEntry(); + auto *Branch = RI.HasBranch ? 
+      cast<BranchInst>(EntryBB->getTerminator()) : nullptr;
+  SmallVector<SelectInst *, 8> &Selects = RI.Selects;
+  if (RI.HasBranch || !Selects.empty()) {
+    Instruction *InsertPoint = getBranchInsertPoint(RI);
+    CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+    // Avoid a data dependence from a select or a branch to a(nother)
+    // select. Note no instruction can data-depend on a branch (a branch
+    // instruction doesn't produce a value).
+    DenseSet<Instruction *> Unhoistables;
+    // Initialize Unhoistables with the selects.
+    for (SelectInst *SI : Selects) {
+      Unhoistables.insert(SI);
+    }
+    // Remove Selects that can't be hoisted.
+    for (auto it = Selects.begin(); it != Selects.end(); ) {
+      SelectInst *SI = *it;
+      if (SI == InsertPoint) {
+        ++it;
+        continue;
+      }
+      DenseMap<Instruction *, bool> Visited;
+      bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint,
+                                         DT, Unhoistables, nullptr, Visited);
+      if (!IsHoistable) {
+        CHR_DEBUG(dbgs() << "Dropping select " << *SI << "\n");
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE,
+                                          "DropUnhoistableSelect", SI)
+              << "Dropped unhoistable select";
+        });
+        it = Selects.erase(it);
+        // Since we are dropping the select here, we also drop it from
+        // Unhoistables.
+        Unhoistables.erase(SI);
+      } else
+        ++it;
+    }
+    // Update InsertPoint after potentially removing selects.
+    InsertPoint = getBranchInsertPoint(RI);
+    CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+    if (RI.HasBranch && InsertPoint != Branch) {
+      DenseMap<Instruction *, bool> Visited;
+      bool IsHoistable = checkHoistValue(Branch->getCondition(), InsertPoint,
+                                         DT, Unhoistables, nullptr, Visited);
+      if (!IsHoistable) {
+        // If the branch isn't hoistable, drop the selects in the entry
+        // block, preferring the branch, which makes the branch the hoist
+        // point.
+        assert(InsertPoint != Branch && "Branch must not be the hoist point");
+        CHR_DEBUG(dbgs() << "Dropping selects in entry block\n");
+        CHR_DEBUG(
+            for (SelectInst *SI : Selects) {
+              dbgs() << "SI " << *SI << "\n";
+            });
+        for (SelectInst *SI : Selects) {
+          ORE.emit([&]() {
+            return OptimizationRemarkMissed(DEBUG_TYPE,
+                                            "DropSelectUnhoistableBranch", SI)
+                << "Dropped select due to unhoistable branch";
+          });
+        }
+        Selects.erase(std::remove_if(Selects.begin(), Selects.end(),
+                                     [EntryBB](SelectInst *SI) {
+                                       return SI->getParent() == EntryBB;
+                                     }), Selects.end());
+        Unhoistables.clear();
+        InsertPoint = Branch;
+      }
+    }
+    CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+#ifndef NDEBUG
+    if (RI.HasBranch) {
+      assert(!DT.dominates(Branch, InsertPoint) &&
+             "Branch can't be already above the hoist point");
+      DenseMap<Instruction *, bool> Visited;
+      assert(checkHoistValue(Branch->getCondition(), InsertPoint,
+                             DT, Unhoistables, nullptr, Visited) &&
+             "checkHoistValue for branch");
+    }
+    for (auto *SI : Selects) {
+      assert(!DT.dominates(SI, InsertPoint) &&
+             "SI can't be already above the hoist point");
+      DenseMap<Instruction *, bool> Visited;
+      assert(checkHoistValue(SI->getCondition(), InsertPoint, DT,
+                             Unhoistables, nullptr, Visited) &&
+             "checkHoistValue for selects");
+    }
+    CHR_DEBUG(dbgs() << "Result\n");
+    if (RI.HasBranch) {
+      CHR_DEBUG(dbgs() << "BI " << *Branch << "\n");
+    }
+    for (auto *SI : Selects) {
+      CHR_DEBUG(dbgs() << "SI " << *SI << "\n");
+    }
+#endif
+  }
+}
+
+// Traverse the region tree, find all nested scopes and merge them if possible.
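+// For example (illustrative only; not from the original code):
+//   if (c1) { ... } // Region A
+//   if (c2) { ... } // Region B, whose entry is A's exit
+// produces two consecutive sibling scopes that get appended into one scope,
+// so that their biased branches can later share a single merged CHR branch.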
+CHRScope * CHR::findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
+                           SmallVectorImpl<CHRScope *> &Scopes) {
+  CHR_DEBUG(dbgs() << "findScopes " << R->getNameStr() << "\n");
+  CHRScope *Result = findScope(R);
+  // Visit subscopes.
+  CHRScope *ConsecutiveSubscope = nullptr;
+  SmallVector<CHRScope *, 8> Subscopes;
+  for (auto It = R->begin(); It != R->end(); ++It) {
+    const std::unique_ptr<Region> &SubR = *It;
+    auto NextIt = std::next(It);
+    Region *NextSubR = NextIt != R->end() ? NextIt->get() : nullptr;
+    CHR_DEBUG(dbgs() << "Looking at subregion " << SubR.get()->getNameStr()
+              << "\n");
+    CHRScope *SubCHRScope = findScopes(SubR.get(), NextSubR, R, Scopes);
+    if (SubCHRScope) {
+      CHR_DEBUG(dbgs() << "Subregion Scope " << *SubCHRScope << "\n");
+    } else {
+      CHR_DEBUG(dbgs() << "Subregion Scope null\n");
+    }
+    if (SubCHRScope) {
+      if (!ConsecutiveSubscope)
+        ConsecutiveSubscope = SubCHRScope;
+      else if (!ConsecutiveSubscope->appendable(SubCHRScope)) {
+        Subscopes.push_back(ConsecutiveSubscope);
+        ConsecutiveSubscope = SubCHRScope;
+      } else
+        ConsecutiveSubscope->append(SubCHRScope);
+    } else {
+      if (ConsecutiveSubscope) {
+        Subscopes.push_back(ConsecutiveSubscope);
+      }
+      ConsecutiveSubscope = nullptr;
+    }
+  }
+  if (ConsecutiveSubscope) {
+    Subscopes.push_back(ConsecutiveSubscope);
+  }
+  for (CHRScope *Sub : Subscopes) {
+    if (Result) {
+      // Combine it with the parent.
+      Result->addSub(Sub);
+    } else {
+      // Push Subscopes as they won't be combined with the parent.
+      Scopes.push_back(Sub);
+    }
+  }
+  return Result;
+}
+
+static DenseSet<Value *> getCHRConditionValuesForRegion(RegInfo &RI) {
+  DenseSet<Value *> ConditionValues;
+  if (RI.HasBranch) {
+    auto *BI = cast<BranchInst>(RI.R->getEntry()->getTerminator());
+    ConditionValues.insert(BI->getCondition());
+  }
+  for (SelectInst *SI : RI.Selects) {
+    ConditionValues.insert(SI->getCondition());
+  }
+  return ConditionValues;
+}
+
+// Determine whether to split a scope depending on the sets of the branch
+// condition values of the previous region and the current region. We split
+// (return true) if 1) the condition values of the inner/lower scope can't be
+// hoisted up to the outer/upper scope, or 2) the two sets of the condition
+// values have an empty intersection (because the combined branch conditions
+// probably won't lead to a simpler combined condition).
+static bool shouldSplit(Instruction *InsertPoint,
+                        DenseSet<Value *> &PrevConditionValues,
+                        DenseSet<Value *> &ConditionValues,
+                        DominatorTree &DT,
+                        DenseSet<Instruction *> &Unhoistables) {
+  CHR_DEBUG(
+      dbgs() << "shouldSplit " << *InsertPoint << " PrevConditionValues ";
+      for (Value *V : PrevConditionValues) {
+        dbgs() << *V << ", ";
+      }
+      dbgs() << " ConditionValues ";
+      for (Value *V : ConditionValues) {
+        dbgs() << *V << ", ";
+      }
+      dbgs() << "\n");
+  assert(InsertPoint && "Null InsertPoint");
+  // If any of the condition values isn't hoistable to the hoist point, split.
+  for (Value *V : ConditionValues) {
+    DenseMap<Instruction *, bool> Visited;
+    if (!checkHoistValue(V, InsertPoint, DT, Unhoistables, nullptr, Visited)) {
+      CHR_DEBUG(dbgs() << "Split. checkHoistValue false " << *V << "\n");
+      return true; // Not hoistable, split.
+    }
+  }
+  // If PrevConditionValues or ConditionValues is empty, don't split to avoid
+  // unnecessary splits at scopes with no branch/selects. If
+  // PrevConditionValues and ConditionValues don't intersect at all, split.
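+  // For example (illustrative), conditions computed as (%a + 1 > 0) and
+  // (%a < 10) share the base value %a, so their scopes stay together, while
+  // conditions derived from unrelated values %a and %b share no base value
+  // and cause a split.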
+ if (!PrevConditionValues.empty() && !ConditionValues.empty()) { + // Use std::set as DenseSet doesn't work with set_intersection. + std::set<Value *> PrevBases, Bases; + DenseMap<Value *, std::set<Value *>> Visited; + for (Value *V : PrevConditionValues) { + std::set<Value *> BaseValues = getBaseValues(V, DT, Visited); + PrevBases.insert(BaseValues.begin(), BaseValues.end()); + } + for (Value *V : ConditionValues) { + std::set<Value *> BaseValues = getBaseValues(V, DT, Visited); + Bases.insert(BaseValues.begin(), BaseValues.end()); + } + CHR_DEBUG( + dbgs() << "PrevBases "; + for (Value *V : PrevBases) { + dbgs() << *V << ", "; + } + dbgs() << " Bases "; + for (Value *V : Bases) { + dbgs() << *V << ", "; + } + dbgs() << "\n"); + std::set<Value *> Intersection; + std::set_intersection(PrevBases.begin(), PrevBases.end(), + Bases.begin(), Bases.end(), + std::inserter(Intersection, Intersection.begin())); + if (Intersection.empty()) { + // Empty intersection, split. + CHR_DEBUG(dbgs() << "Split. Intersection empty\n"); + return true; + } + } + CHR_DEBUG(dbgs() << "No split\n"); + return false; // Don't split. +} + +static void getSelectsInScope(CHRScope *Scope, + DenseSet<Instruction *> &Output) { + for (RegInfo &RI : Scope->RegInfos) + for (SelectInst *SI : RI.Selects) + Output.insert(SI); + for (CHRScope *Sub : Scope->Subs) + getSelectsInScope(Sub, Output); +} + +void CHR::splitScopes(SmallVectorImpl<CHRScope *> &Input, + SmallVectorImpl<CHRScope *> &Output) { + for (CHRScope *Scope : Input) { + assert(!Scope->BranchInsertPoint && + "BranchInsertPoint must not be set"); + DenseSet<Instruction *> Unhoistables; + getSelectsInScope(Scope, Unhoistables); + splitScope(Scope, nullptr, nullptr, nullptr, Output, Unhoistables); + } +#ifndef NDEBUG + for (CHRScope *Scope : Output) { + assert(Scope->BranchInsertPoint && "BranchInsertPoint must be set"); + } +#endif +} + +SmallVector<CHRScope *, 8> CHR::splitScope( + CHRScope *Scope, + CHRScope *Outer, + DenseSet<Value *> *OuterConditionValues, + Instruction *OuterInsertPoint, + SmallVectorImpl<CHRScope *> &Output, + DenseSet<Instruction *> &Unhoistables) { + if (Outer) { + assert(OuterConditionValues && "Null OuterConditionValues"); + assert(OuterInsertPoint && "Null OuterInsertPoint"); + } + bool PrevSplitFromOuter = true; + DenseSet<Value *> PrevConditionValues; + Instruction *PrevInsertPoint = nullptr; + SmallVector<CHRScope *, 8> Splits; + SmallVector<bool, 8> SplitsSplitFromOuter; + SmallVector<DenseSet<Value *>, 8> SplitsConditionValues; + SmallVector<Instruction *, 8> SplitsInsertPoints; + SmallVector<RegInfo, 8> RegInfos(Scope->RegInfos); // Copy + for (RegInfo &RI : RegInfos) { + Instruction *InsertPoint = getBranchInsertPoint(RI); + DenseSet<Value *> ConditionValues = getCHRConditionValuesForRegion(RI); + CHR_DEBUG( + dbgs() << "ConditionValues "; + for (Value *V : ConditionValues) { + dbgs() << *V << ", "; + } + dbgs() << "\n"); + if (RI.R == RegInfos[0].R) { + // First iteration. Check to see if we should split from the outer. 
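+      // "Splitting from the outer" means this scope gets its own branch
+      // insert point and condition value set rather than being merged into
+      // the outer scope's CHR branch.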
+ if (Outer) { + CHR_DEBUG(dbgs() << "Outer " << *Outer << "\n"); + CHR_DEBUG(dbgs() << "Should split from outer at " + << RI.R->getNameStr() << "\n"); + if (shouldSplit(OuterInsertPoint, *OuterConditionValues, + ConditionValues, DT, Unhoistables)) { + PrevConditionValues = ConditionValues; + PrevInsertPoint = InsertPoint; + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "SplitScopeFromOuter", + RI.R->getEntry()->getTerminator()) + << "Split scope from outer due to unhoistable branch/select " + << "and/or lack of common condition values"; + }); + } else { + // Not splitting from the outer. Use the outer bases and insert + // point. Union the bases. + PrevSplitFromOuter = false; + PrevConditionValues = *OuterConditionValues; + PrevConditionValues.insert(ConditionValues.begin(), + ConditionValues.end()); + PrevInsertPoint = OuterInsertPoint; + } + } else { + CHR_DEBUG(dbgs() << "Outer null\n"); + PrevConditionValues = ConditionValues; + PrevInsertPoint = InsertPoint; + } + } else { + CHR_DEBUG(dbgs() << "Should split from prev at " + << RI.R->getNameStr() << "\n"); + if (shouldSplit(PrevInsertPoint, PrevConditionValues, ConditionValues, + DT, Unhoistables)) { + CHRScope *Tail = Scope->split(RI.R); + Scopes.insert(Tail); + Splits.push_back(Scope); + SplitsSplitFromOuter.push_back(PrevSplitFromOuter); + SplitsConditionValues.push_back(PrevConditionValues); + SplitsInsertPoints.push_back(PrevInsertPoint); + Scope = Tail; + PrevConditionValues = ConditionValues; + PrevInsertPoint = InsertPoint; + PrevSplitFromOuter = true; + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "SplitScopeFromPrev", + RI.R->getEntry()->getTerminator()) + << "Split scope from previous due to unhoistable branch/select " + << "and/or lack of common condition values"; + }); + } else { + // Not splitting. Union the bases. Keep the hoist point. + PrevConditionValues.insert(ConditionValues.begin(), ConditionValues.end()); + } + } + } + Splits.push_back(Scope); + SplitsSplitFromOuter.push_back(PrevSplitFromOuter); + SplitsConditionValues.push_back(PrevConditionValues); + assert(PrevInsertPoint && "Null PrevInsertPoint"); + SplitsInsertPoints.push_back(PrevInsertPoint); + assert(Splits.size() == SplitsConditionValues.size() && + Splits.size() == SplitsSplitFromOuter.size() && + Splits.size() == SplitsInsertPoints.size() && "Mismatching sizes"); + for (size_t I = 0; I < Splits.size(); ++I) { + CHRScope *Split = Splits[I]; + DenseSet<Value *> &SplitConditionValues = SplitsConditionValues[I]; + Instruction *SplitInsertPoint = SplitsInsertPoints[I]; + SmallVector<CHRScope *, 8> NewSubs; + DenseSet<Instruction *> SplitUnhoistables; + getSelectsInScope(Split, SplitUnhoistables); + for (CHRScope *Sub : Split->Subs) { + SmallVector<CHRScope *, 8> SubSplits = splitScope( + Sub, Split, &SplitConditionValues, SplitInsertPoint, Output, + SplitUnhoistables); + NewSubs.insert(NewSubs.end(), SubSplits.begin(), SubSplits.end()); + } + Split->Subs = NewSubs; + } + SmallVector<CHRScope *, 8> Result; + for (size_t I = 0; I < Splits.size(); ++I) { + CHRScope *Split = Splits[I]; + if (SplitsSplitFromOuter[I]) { + // Split from the outer. + Output.push_back(Split); + Split->BranchInsertPoint = SplitsInsertPoints[I]; + CHR_DEBUG(dbgs() << "BranchInsertPoint " << *SplitsInsertPoints[I] + << "\n"); + } else { + // Connected to the outer. 
+      Result.push_back(Split);
+    }
+  }
+  if (!Outer)
+    assert(Result.empty() &&
+           "If no outer (top-level), must return no nested ones");
+  return Result;
+}
+
+void CHR::classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes) {
+  for (CHRScope *Scope : Scopes) {
+    assert(Scope->TrueBiasedRegions.empty() &&
+           Scope->FalseBiasedRegions.empty() && "Empty");
+    classifyBiasedScopes(Scope, Scope);
+    CHR_DEBUG(
+        dbgs() << "classifyBiasedScopes " << *Scope << "\n";
+        dbgs() << "TrueBiasedRegions ";
+        for (Region *R : Scope->TrueBiasedRegions) {
+          dbgs() << R->getNameStr() << ", ";
+        }
+        dbgs() << "\n";
+        dbgs() << "FalseBiasedRegions ";
+        for (Region *R : Scope->FalseBiasedRegions) {
+          dbgs() << R->getNameStr() << ", ";
+        }
+        dbgs() << "\n";
+        dbgs() << "TrueBiasedSelects ";
+        for (SelectInst *SI : Scope->TrueBiasedSelects) {
+          dbgs() << *SI << ", ";
+        }
+        dbgs() << "\n";
+        dbgs() << "FalseBiasedSelects ";
+        for (SelectInst *SI : Scope->FalseBiasedSelects) {
+          dbgs() << *SI << ", ";
+        }
+        dbgs() << "\n";);
+  }
+}
+
+void CHR::classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope) {
+  for (RegInfo &RI : Scope->RegInfos) {
+    if (RI.HasBranch) {
+      Region *R = RI.R;
+      if (TrueBiasedRegionsGlobal.count(R) > 0)
+        OutermostScope->TrueBiasedRegions.insert(R);
+      else if (FalseBiasedRegionsGlobal.count(R) > 0)
+        OutermostScope->FalseBiasedRegions.insert(R);
+      else
+        llvm_unreachable("Must be biased");
+    }
+    for (SelectInst *SI : RI.Selects) {
+      if (TrueBiasedSelectsGlobal.count(SI) > 0)
+        OutermostScope->TrueBiasedSelects.insert(SI);
+      else if (FalseBiasedSelectsGlobal.count(SI) > 0)
+        OutermostScope->FalseBiasedSelects.insert(SI);
+      else
+        llvm_unreachable("Must be biased");
+    }
+  }
+  for (CHRScope *Sub : Scope->Subs) {
+    classifyBiasedScopes(Sub, OutermostScope);
+  }
+}
+
+static bool hasAtLeastTwoBiasedBranches(CHRScope *Scope) {
+  unsigned NumBiased = Scope->TrueBiasedRegions.size() +
+                       Scope->FalseBiasedRegions.size() +
+                       Scope->TrueBiasedSelects.size() +
+                       Scope->FalseBiasedSelects.size();
+  return NumBiased >= CHRMergeThreshold;
+}
+
+void CHR::filterScopes(SmallVectorImpl<CHRScope *> &Input,
+                       SmallVectorImpl<CHRScope *> &Output) {
+  for (CHRScope *Scope : Input) {
+    // Filter out scopes with fewer than CHRMergeThreshold biased branches or
+    // selects; CHR has nothing to merge there.
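+    // E.g. (illustrative) a scope whose only bias source is a single biased
+    // "if (c1)" branch has nothing to merge with and is dropped here.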
+    if (!hasAtLeastTwoBiasedBranches(Scope)) {
+      CHR_DEBUG(dbgs() << "Filtered out by biased branches truthy-regions "
+                << Scope->TrueBiasedRegions.size()
+                << " falsy-regions " << Scope->FalseBiasedRegions.size()
+                << " true-selects " << Scope->TrueBiasedSelects.size()
+                << " false-selects " << Scope->FalseBiasedSelects.size()
+                << "\n");
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(
+            DEBUG_TYPE,
+            "DropScopeWithOneBranchOrSelect",
+            Scope->RegInfos[0].R->getEntry()->getTerminator())
+            << "Drop scope with < "
+            << ore::NV("CHRMergeThreshold", CHRMergeThreshold)
+            << " biased branch(es) or select(s)";
+      });
+      continue;
+    }
+    Output.push_back(Scope);
+  }
+}
+
+void CHR::setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
+                        SmallVectorImpl<CHRScope *> &Output) {
+  for (CHRScope *Scope : Input) {
+    assert(Scope->HoistStopMap.empty() && Scope->CHRRegions.empty() &&
+           "Empty");
+    setCHRRegions(Scope, Scope);
+    Output.push_back(Scope);
+    CHR_DEBUG(
+        dbgs() << "setCHRRegions HoistStopMap " << *Scope << "\n";
+        for (auto pair : Scope->HoistStopMap) {
+          Region *R = pair.first;
+          dbgs() << "Region " << R->getNameStr() << "\n";
+          for (Instruction *I : pair.second) {
+            dbgs() << "HoistStop " << *I << "\n";
+          }
+        }
+        dbgs() << "CHRRegions" << "\n";
+        for (RegInfo &RI : Scope->CHRRegions) {
+          dbgs() << RI.R->getNameStr() << "\n";
+        });
+  }
+}
+
+void CHR::setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope) {
+  DenseSet<Instruction *> Unhoistables;
+  // Put the biased selects in Unhoistables because they should stay where they
+  // are and be constant-folded after CHR (in case one biased select or a
+  // branch can depend on another biased select).
+  for (RegInfo &RI : Scope->RegInfos) {
+    for (SelectInst *SI : RI.Selects) {
+      Unhoistables.insert(SI);
+    }
+  }
+  Instruction *InsertPoint = OutermostScope->BranchInsertPoint;
+  for (RegInfo &RI : Scope->RegInfos) {
+    Region *R = RI.R;
+    DenseSet<Instruction *> HoistStops;
+    bool IsHoisted = false;
+    if (RI.HasBranch) {
+      assert((OutermostScope->TrueBiasedRegions.count(R) > 0 ||
+              OutermostScope->FalseBiasedRegions.count(R) > 0) &&
+             "Must be truthy or falsy");
+      auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+      // Note checkHoistValue fills in HoistStops.
+      DenseMap<Instruction *, bool> Visited;
+      bool IsHoistable = checkHoistValue(BI->getCondition(), InsertPoint, DT,
+                                         Unhoistables, &HoistStops, Visited);
+      assert(IsHoistable && "Must be hoistable");
+      (void)(IsHoistable); // Unused in release build
+      IsHoisted = true;
+    }
+    for (SelectInst *SI : RI.Selects) {
+      assert((OutermostScope->TrueBiasedSelects.count(SI) > 0 ||
+              OutermostScope->FalseBiasedSelects.count(SI) > 0) &&
+             "Must be true or false biased");
+      // Note checkHoistValue fills in HoistStops.
+      DenseMap<Instruction *, bool> Visited;
+      bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint, DT,
+                                         Unhoistables, &HoistStops, Visited);
+      assert(IsHoistable && "Must be hoistable");
+      (void)(IsHoistable); // Unused in release build
+      IsHoisted = true;
+    }
+    if (IsHoisted) {
+      OutermostScope->CHRRegions.push_back(RI);
+      OutermostScope->HoistStopMap[R] = HoistStops;
+    }
+  }
+  for (CHRScope *Sub : Scope->Subs)
+    setCHRRegions(Sub, OutermostScope);
+}
+
+bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) {
+  return Scope1->RegInfos[0].R->getDepth() < Scope2->RegInfos[0].R->getDepth();
+}
+
+void CHR::sortScopes(SmallVectorImpl<CHRScope *> &Input,
+                     SmallVectorImpl<CHRScope *> &Output) {
+  Output.resize(Input.size());
+  llvm::copy(Input, Output.begin());
+  llvm::stable_sort(Output, CHRScopeSorter);
+}
+
+// Hoist V, along with the instructions it transitively depends on, to right
+// before HoistPoint, unless V is already hoisted or above the hoist point or
+// a hoist stop is hit.
+static void hoistValue(Value *V, Instruction *HoistPoint, Region *R,
+                       HoistStopMapTy &HoistStopMap,
+                       DenseSet<Instruction *> &HoistedSet,
+                       DenseSet<PHINode *> &TrivialPHIs,
+                       DominatorTree &DT) {
+  auto IT = HoistStopMap.find(R);
+  assert(IT != HoistStopMap.end() && "Region must be in hoist stop map");
+  DenseSet<Instruction *> &HoistStops = IT->second;
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    if (I == HoistPoint)
+      return;
+    if (HoistStops.count(I))
+      return;
+    if (auto *PN = dyn_cast<PHINode>(I))
+      if (TrivialPHIs.count(PN))
+        // The trivial phi inserted by the previous CHR scope could replace a
+        // non-phi in HoistStops. Note that since this phi is at the exit of a
+        // previous CHR scope, which dominates this scope, it's safe to stop
+        // hoisting there.
+        return;
+    if (HoistedSet.count(I))
+      // Already hoisted, return.
+      return;
+    assert(isHoistableInstructionType(I) && "Unhoistable instruction type");
+    assert(DT.getNode(I->getParent()) && "DT must contain I's block");
+    assert(DT.getNode(HoistPoint->getParent()) &&
+           "DT must contain HoistPoint block");
+    if (DT.dominates(I, HoistPoint))
+      // We are already above the hoist point. Stop here. This may be necessary
+      // when multiple scopes would independently hoist the same
+      // instruction. Since an outer (dominating) scope would hoist it to its
+      // entry before an inner (dominated) scope would to its entry, the inner
+      // scope may see the instruction already hoisted. In that case it would
+      // be wrong for the inner scope to hoist it again (it could create a
+      // non-dominating def and hence bad IR), but it is safe to simply skip
+      // the hoisting because the instruction is already in a block that
+      // dominates the inner scope.
+      return;
+    for (Value *Op : I->operands()) {
+      hoistValue(Op, HoistPoint, R, HoistStopMap, HoistedSet, TrivialPHIs, DT);
+    }
+    I->moveBefore(HoistPoint);
+    HoistedSet.insert(I);
+    CHR_DEBUG(dbgs() << "hoistValue " << *I << "\n");
+  }
+}
+
+// Hoist the dependent condition values of the branches and the selects in the
+// scope to the insert point.
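+// For example (illustrative), if a biased branch tests %cmp where
+//   %add = add i32 %x, %y
+//   %cmp = icmp sgt i32 %add, 0
+// hoisting %cmp also hoists %add, recursing through the operands until a
+// hoist stop, an already-hoisted value, or a dominating definition is hit.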
+static void hoistScopeConditions(CHRScope *Scope, Instruction *HoistPoint,
+                                 DenseSet<PHINode *> &TrivialPHIs,
+                                 DominatorTree &DT) {
+  DenseSet<Instruction *> HoistedSet;
+  for (const RegInfo &RI : Scope->CHRRegions) {
+    Region *R = RI.R;
+    bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
+    bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
+    if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
+      auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+      hoistValue(BI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
+                 HoistedSet, TrivialPHIs, DT);
+    }
+    for (SelectInst *SI : RI.Selects) {
+      bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+      bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
+      if (!(IsTrueBiased || IsFalseBiased))
+        continue;
+      hoistValue(SI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
+                 HoistedSet, TrivialPHIs, DT);
+    }
+  }
+}
+
+// Negate the predicate of an ICmp if it's used only by branches or selects,
+// by swapping the operands of those branches or selects. Returns true on
+// success.
+static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
+                                                 Instruction *ExcludedUser,
+                                                 CHRScope *Scope) {
+  for (User *U : ICmp->users()) {
+    if (U == ExcludedUser)
+      continue;
+    if (isa<BranchInst>(U) && cast<BranchInst>(U)->isConditional())
+      continue;
+    if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == ICmp)
+      continue;
+    return false;
+  }
+  for (User *U : ICmp->users()) {
+    if (U == ExcludedUser)
+      continue;
+    if (auto *BI = dyn_cast<BranchInst>(U)) {
+      assert(BI->isConditional() && "Must be conditional");
+      BI->swapSuccessors();
+      // Don't need to swap this in terms of
+      // TrueBiasedRegions/FalseBiasedRegions because true-biased/false-biased
+      // mean whether the branch is likely to go into the if-then rather than
+      // into successor0/successor1, and because we can tell which edge is the
+      // then or the else one by comparing the destination to the region exit
+      // block.
+      continue;
+    }
+    if (auto *SI = dyn_cast<SelectInst>(U)) {
+      // Swap operands
+      SI->swapValues();
+      SI->swapProfMetadata();
+      if (Scope->TrueBiasedSelects.count(SI)) {
+        assert(Scope->FalseBiasedSelects.count(SI) == 0 &&
+               "Must not be already in");
+        Scope->FalseBiasedSelects.insert(SI);
+      } else if (Scope->FalseBiasedSelects.count(SI)) {
+        assert(Scope->TrueBiasedSelects.count(SI) == 0 &&
+               "Must not be already in");
+        Scope->TrueBiasedSelects.insert(SI);
+      }
+      continue;
+    }
+    llvm_unreachable("Must be a branch or a select");
+  }
+  ICmp->setPredicate(CmpInst::getInversePredicate(ICmp->getPredicate()));
+  return true;
+}
+
+// A helper for transformScopes. Insert a trivial phi at the scope exit block
+// for a value that's defined in the scope but used outside it (meaning it's
+// alive at the exit block).
+static void insertTrivialPHIs(CHRScope *Scope,
+                              BasicBlock *EntryBlock, BasicBlock *ExitBlock,
+                              DenseSet<PHINode *> &TrivialPHIs) {
+  DenseSet<BasicBlock *> BlocksInScopeSet;
+  SmallVector<BasicBlock *, 8> BlocksInScopeVec;
+  for (RegInfo &RI : Scope->RegInfos) {
+    for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
+                                            // sub-Scopes.
+      BlocksInScopeSet.insert(BB);
+      BlocksInScopeVec.push_back(BB);
+    }
+  }
+  CHR_DEBUG(
+      dbgs() << "Inserting redundant phis\n";
+      for (BasicBlock *BB : BlocksInScopeVec) {
+        dbgs() << "BlockInScope " << BB->getName() << "\n";
+      });
+  for (BasicBlock *BB : BlocksInScopeVec) {
+    for (Instruction &I : *BB) {
+      SmallVector<Instruction *, 8> Users;
+      for (User *U : I.users()) {
+        if (auto *UI = dyn_cast<Instruction>(U)) {
+          if (BlocksInScopeSet.count(UI->getParent()) == 0 &&
+              // Unless there's already a phi for I at the exit block.
+              !(isa<PHINode>(UI) && UI->getParent() == ExitBlock)) {
+            CHR_DEBUG(dbgs() << "V " << I << "\n");
+            CHR_DEBUG(dbgs() << "Used outside scope by user " << *UI << "\n");
+            Users.push_back(UI);
+          } else if (UI->getParent() == EntryBlock && isa<PHINode>(UI)) {
+            // There's a loop backedge from a block that's dominated by this
+            // scope to the entry block.
+            CHR_DEBUG(dbgs() << "V " << I << "\n");
+            CHR_DEBUG(dbgs()
+                      << "Used at entry block (for a back edge) by a phi user "
+                      << *UI << "\n");
+            Users.push_back(UI);
+          }
+        }
+      }
+      if (Users.size() > 0) {
+        // Insert a trivial phi for I (phi [&I, P0], [&I, P1], ...) at
+        // ExitBlock. Replace I with the new phi in UI unless UI is another
+        // phi at ExitBlock.
+        unsigned PredCount = std::distance(pred_begin(ExitBlock),
+                                           pred_end(ExitBlock));
+        PHINode *PN = PHINode::Create(I.getType(), PredCount, "",
+                                      &ExitBlock->front());
+        for (BasicBlock *Pred : predecessors(ExitBlock)) {
+          PN->addIncoming(&I, Pred);
+        }
+        TrivialPHIs.insert(PN);
+        CHR_DEBUG(dbgs() << "Insert phi " << *PN << "\n");
+        for (Instruction *UI : Users) {
+          for (unsigned J = 0, NumOps = UI->getNumOperands(); J < NumOps;
+               ++J) {
+            if (UI->getOperand(J) == &I) {
+              UI->setOperand(J, PN);
+            }
+          }
+          CHR_DEBUG(dbgs() << "Updated user " << *UI << "\n");
+        }
+      }
+    }
+  }
+}
+
+// Assert that all the CHR regions of the scope have a biased branch or select.
+static void LLVM_ATTRIBUTE_UNUSED
+assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) {
+#ifndef NDEBUG
+  auto HasBiasedBranchOrSelect = [](RegInfo &RI, CHRScope *Scope) {
+    if (Scope->TrueBiasedRegions.count(RI.R) ||
+        Scope->FalseBiasedRegions.count(RI.R))
+      return true;
+    for (SelectInst *SI : RI.Selects)
+      if (Scope->TrueBiasedSelects.count(SI) ||
+          Scope->FalseBiasedSelects.count(SI))
+        return true;
+    return false;
+  };
+  for (RegInfo &RI : Scope->CHRRegions) {
+    assert(HasBiasedBranchOrSelect(RI, Scope) &&
+           "Must have biased branch or select");
+  }
+#endif
+}
+
+// Assert that all the condition values of the biased branches and selects have
+// been hoisted to the pre-entry block or outside of the scope.
+static void LLVM_ATTRIBUTE_UNUSED assertBranchOrSelectConditionHoisted(
+    CHRScope *Scope, BasicBlock *PreEntryBlock) {
+  CHR_DEBUG(dbgs() << "Biased regions condition values\n");
+  for (RegInfo &RI : Scope->CHRRegions) {
+    Region *R = RI.R;
+    bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
+    bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
+    if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
+      auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+      Value *V = BI->getCondition();
+      CHR_DEBUG(dbgs() << *V << "\n");
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        (void)(I); // Unused in release build.
+        assert((I->getParent() == PreEntryBlock ||
+                !Scope->contains(I)) &&
+               "Must have been hoisted to PreEntryBlock or outside the scope");
+      }
+    }
+    for (SelectInst *SI : RI.Selects) {
+      bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+      bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
+      if (!(IsTrueBiased || IsFalseBiased))
+        continue;
+      Value *V = SI->getCondition();
+      CHR_DEBUG(dbgs() << *V << "\n");
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        (void)(I); // Unused in release build.
+        assert((I->getParent() == PreEntryBlock ||
+                !Scope->contains(I)) &&
+               "Must have been hoisted to PreEntryBlock or outside the scope");
+      }
+    }
+  }
+}
+
+void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
+  CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n");
+
+  assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region");
+  Region *FirstRegion = Scope->RegInfos[0].R;
+  BasicBlock *EntryBlock = FirstRegion->getEntry();
+  Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R;
+  BasicBlock *ExitBlock = LastRegion->getExit();
+  Optional<uint64_t> ProfileCount = BFI.getBlockProfileCount(EntryBlock);
+
+  if (ExitBlock) {
+    // Insert a trivial phi at the exit block (where the CHR hot path and the
+    // cold path merge) for a value that's defined in the scope but used
+    // outside it (meaning it's alive at the exit block). We will add the
+    // incoming values for the CHR cold paths to it below. Without this, we'd
+    // miss updating phis for such values unless there happens to already be a
+    // phi for that value there.
+    insertTrivialPHIs(Scope, EntryBlock, ExitBlock, TrivialPHIs);
+  }
+
+  // Split the entry block of the first region. The new block becomes the new
+  // entry block of the first region. The old entry block becomes the block to
+  // insert the CHR branch into. Note DT gets updated through the split. Since
+  // a Region points only to its entry and exit blocks, rather than keeping all
+  // member blocks in a list or set, the region's block membership and its
+  // entry/exit blocks remain valid after the split; we only need to update the
+  // entry of the first region, which we do right after the split.
+  CHR_DEBUG(dbgs() << "Splitting entry block " << EntryBlock->getName()
+            << " at " << *Scope->BranchInsertPoint << "\n");
+  BasicBlock *NewEntryBlock =
+      SplitBlock(EntryBlock, Scope->BranchInsertPoint, &DT);
+  assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+         "NewEntryBlock's only pred must be EntryBlock");
+  FirstRegion->replaceEntryRecursive(NewEntryBlock);
+  BasicBlock *PreEntryBlock = EntryBlock;
+
+  ValueToValueMapTy VMap;
+  // Clone the blocks in the scope (excluding the PreEntryBlock) to split into
+  // a hot path (originals) and a cold path (clones) and update the PHIs at the
+  // exit block.
+  cloneScopeBlocks(Scope, PreEntryBlock, ExitBlock, LastRegion, VMap);
+
+  // Replace the old (placeholder) branch with the new (merged) conditional
+  // branch.
+  BranchInst *MergedBr = createMergedBranch(PreEntryBlock, EntryBlock,
+                                            NewEntryBlock, VMap);
+
+#ifndef NDEBUG
+  assertCHRRegionsHaveBiasedBranchOrSelect(Scope);
+#endif
+
+  // Hoist the condition values of the branches/selects.
+  hoistScopeConditions(Scope, PreEntryBlock->getTerminator(), TrivialPHIs, DT);
+
+#ifndef NDEBUG
+  assertBranchOrSelectConditionHoisted(Scope, PreEntryBlock);
+#endif
+
+  // Create the combined branch condition and constant-fold the
+  // branches/selects in the hot path.
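+  // E.g. (illustrative) with a true-biased branch on %c1 and a false-biased
+  // select on %c2, the merged condition becomes (%c1 & ~%c2); the merged
+  // branch takes the hot (original) path when it holds, and %c1/%c2 are
+  // folded to constants along that path.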
+  fixupBranchesAndSelects(Scope, PreEntryBlock, MergedBr,
+                          ProfileCount ? ProfileCount.getValue() : 0);
+}
+
+// A helper for transformScopes. Clone the blocks in the scope (excluding the
+// PreEntryBlock) to split into a hot path and a cold path and update the PHIs
+// at the exit block.
+void CHR::cloneScopeBlocks(CHRScope *Scope,
+                           BasicBlock *PreEntryBlock,
+                           BasicBlock *ExitBlock,
+                           Region *LastRegion,
+                           ValueToValueMapTy &VMap) {
+  // Clone all the blocks. The original blocks will be the hot-path
+  // CHR-optimized code and the cloned blocks will be the original unoptimized
+  // code. This is so that the block pointers from the
+  // CHRScope/Region/RegionInfo stay valid, pointing to the hot-path code to
+  // which CHR should apply.
+  SmallVector<BasicBlock*, 8> NewBlocks;
+  for (RegInfo &RI : Scope->RegInfos)
+    for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
+                                            // sub-Scopes.
+      assert(BB != PreEntryBlock && "Don't copy the pre-entry block");
+      BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".nonchr", &F);
+      NewBlocks.push_back(NewBB);
+      VMap[BB] = NewBB;
+    }
+
+  // Place the cloned blocks right after the original blocks (right before the
+  // exit block, if any).
+  if (ExitBlock)
+    F.getBasicBlockList().splice(ExitBlock->getIterator(),
+                                 F.getBasicBlockList(),
+                                 NewBlocks[0]->getIterator(), F.end());
+
+  // Update the cloned blocks/instructions to refer to the cloned versions.
+  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+    for (Instruction &I : *NewBlocks[i])
+      RemapInstruction(&I, VMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+  // Add the cloned blocks to the PHIs of the exit block. ExitBlock is null for
+  // the top-level region, in which case we don't need to add PHIs. The trivial
+  // PHIs inserted above will be updated here.
+  if (ExitBlock)
+    for (PHINode &PN : ExitBlock->phis())
+      for (unsigned I = 0, NumOps = PN.getNumIncomingValues(); I < NumOps;
+           ++I) {
+        BasicBlock *Pred = PN.getIncomingBlock(I);
+        if (LastRegion->contains(Pred)) {
+          Value *V = PN.getIncomingValue(I);
+          auto It = VMap.find(V);
+          if (It != VMap.end()) V = It->second;
+          assert(VMap.find(Pred) != VMap.end() && "Pred must have been cloned");
+          PN.addIncoming(V, cast<BasicBlock>(VMap[Pred]));
+        }
+      }
+}
+
+// A helper for transformScopes. Replace the old (placeholder) branch with the
+// new (merged) conditional branch.
+BranchInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
+                                    BasicBlock *EntryBlock,
+                                    BasicBlock *NewEntryBlock,
+                                    ValueToValueMapTy &VMap) {
+  BranchInst *OldBR = cast<BranchInst>(PreEntryBlock->getTerminator());
+  assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == NewEntryBlock &&
+         "SplitBlock did not work correctly!");
+  assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+         "NewEntryBlock's only pred must be EntryBlock");
+  assert(VMap.find(NewEntryBlock) != VMap.end() &&
+         "NewEntryBlock must have been copied");
+  OldBR->dropAllReferences();
+  OldBR->eraseFromParent();
+  // The true predicate is a placeholder. It will be replaced later in
+  // fixupBranchesAndSelects().
+  BranchInst *NewBR = BranchInst::Create(NewEntryBlock,
+                                         cast<BasicBlock>(VMap[NewEntryBlock]),
+                                         ConstantInt::getTrue(F.getContext()));
+  PreEntryBlock->getInstList().push_back(NewBR);
+  assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+         "NewEntryBlock's only pred must be EntryBlock");
+  return NewBR;
+}
+
+// A helper for transformScopes. Create the combined branch condition and
+// constant-fold the branches/selects in the hot path.
+void CHR::fixupBranchesAndSelects(CHRScope *Scope, + BasicBlock *PreEntryBlock, + BranchInst *MergedBR, + uint64_t ProfileCount) { + Value *MergedCondition = ConstantInt::getTrue(F.getContext()); + BranchProbability CHRBranchBias(1, 1); + uint64_t NumCHRedBranches = 0; + IRBuilder<> IRB(PreEntryBlock->getTerminator()); + for (RegInfo &RI : Scope->CHRRegions) { + Region *R = RI.R; + if (RI.HasBranch) { + fixupBranch(R, Scope, IRB, MergedCondition, CHRBranchBias); + ++NumCHRedBranches; + } + for (SelectInst *SI : RI.Selects) { + fixupSelect(SI, Scope, IRB, MergedCondition, CHRBranchBias); + ++NumCHRedBranches; + } + } + Stats.NumBranchesDelta += NumCHRedBranches - 1; + Stats.WeightedNumBranchesDelta += (NumCHRedBranches - 1) * ProfileCount; + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, + "CHR", + // Refer to the hot (original) path + MergedBR->getSuccessor(0)->getTerminator()) + << "Merged " << ore::NV("NumCHRedBranches", NumCHRedBranches) + << " branches or selects"; + }); + MergedBR->setCondition(MergedCondition); + SmallVector<uint32_t, 2> Weights; + Weights.push_back(static_cast<uint32_t>(CHRBranchBias.scale(1000))); + Weights.push_back(static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000))); + MDBuilder MDB(F.getContext()); + MergedBR->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1] + << "\n"); +} + +// A helper for fixupBranchesAndSelects. Add to the combined branch condition +// and constant-fold a branch in the hot path. +void CHR::fixupBranch(Region *R, CHRScope *Scope, + IRBuilder<> &IRB, + Value *&MergedCondition, + BranchProbability &CHRBranchBias) { + bool IsTrueBiased = Scope->TrueBiasedRegions.count(R); + assert((IsTrueBiased || Scope->FalseBiasedRegions.count(R)) && + "Must be truthy or falsy"); + auto *BI = cast<BranchInst>(R->getEntry()->getTerminator()); + assert(BranchBiasMap.find(R) != BranchBiasMap.end() && + "Must be in the bias map"); + BranchProbability Bias = BranchBiasMap[R]; + assert(Bias >= getCHRBiasThreshold() && "Must be highly biased"); + // Take the min. + if (CHRBranchBias > Bias) + CHRBranchBias = Bias; + BasicBlock *IfThen = BI->getSuccessor(1); + BasicBlock *IfElse = BI->getSuccessor(0); + BasicBlock *RegionExitBlock = R->getExit(); + assert(RegionExitBlock && "Null ExitBlock"); + assert((IfThen == RegionExitBlock || IfElse == RegionExitBlock) && + IfThen != IfElse && "Invariant from findScopes"); + if (IfThen == RegionExitBlock) { + // Swap them so that IfThen means going into it and IfElse means skipping + // it. + std::swap(IfThen, IfElse); + } + CHR_DEBUG(dbgs() << "IfThen " << IfThen->getName() + << " IfElse " << IfElse->getName() << "\n"); + Value *Cond = BI->getCondition(); + BasicBlock *HotTarget = IsTrueBiased ? IfThen : IfElse; + bool ConditionTrue = HotTarget == BI->getSuccessor(0); + addToMergedCondition(ConditionTrue, Cond, BI, Scope, IRB, + MergedCondition); + // Constant-fold the branch at ClonedEntryBlock. + assert(ConditionTrue == (HotTarget == BI->getSuccessor(0)) && + "The successor shouldn't change"); + Value *NewCondition = ConditionTrue ? + ConstantInt::getTrue(F.getContext()) : + ConstantInt::getFalse(F.getContext()); + BI->setCondition(NewCondition); +} + +// A helper for fixupBranchesAndSelects. Add to the combined branch condition +// and constant-fold a select in the hot path. 
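+// For example (illustrative), a true-biased "%a = select i1 %c, i32 %b, i32
+// %d" contributes %c to the merged condition and is rewritten in the hot path
+// as "%a = select i1 true, i32 %b, i32 %d", which trivially folds to %b later.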
+void CHR::fixupSelect(SelectInst *SI, CHRScope *Scope,
+                      IRBuilder<> &IRB,
+                      Value *&MergedCondition,
+                      BranchProbability &CHRBranchBias) {
+  bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+  assert((IsTrueBiased ||
+          Scope->FalseBiasedSelects.count(SI)) && "Must be biased");
+  assert(SelectBiasMap.find(SI) != SelectBiasMap.end() &&
+         "Must be in the bias map");
+  BranchProbability Bias = SelectBiasMap[SI];
+  assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
+  // Take the min.
+  if (CHRBranchBias > Bias)
+    CHRBranchBias = Bias;
+  Value *Cond = SI->getCondition();
+  addToMergedCondition(IsTrueBiased, Cond, SI, Scope, IRB,
+                       MergedCondition);
+  Value *NewCondition = IsTrueBiased ?
+                        ConstantInt::getTrue(F.getContext()) :
+                        ConstantInt::getFalse(F.getContext());
+  SI->setCondition(NewCondition);
+}
+
+// A helper for fixupBranch/fixupSelect. Add a branch condition to the merged
+// condition.
+void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond,
+                               Instruction *BranchOrSelect,
+                               CHRScope *Scope,
+                               IRBuilder<> &IRB,
+                               Value *&MergedCondition) {
+  if (IsTrueBiased) {
+    MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
+  } else {
+    // If Cond is an icmp and all its users except for BranchOrSelect are
+    // branches or selects, negate the icmp predicate and swap the branch
+    // targets/select operands to avoid inserting an Xor to negate Cond.
+    bool Done = false;
+    if (auto *ICmp = dyn_cast<ICmpInst>(Cond))
+      if (negateICmpIfUsedByBranchOrSelectOnly(ICmp, BranchOrSelect, Scope)) {
+        MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
+        Done = true;
+      }
+    if (!Done) {
+      Value *Negate = IRB.CreateXor(
+          ConstantInt::getTrue(F.getContext()), Cond);
+      MergedCondition = IRB.CreateAnd(MergedCondition, Negate);
+    }
+  }
+}
+
+void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) {
+  unsigned I = 0;
+  DenseSet<PHINode *> TrivialPHIs;
+  for (CHRScope *Scope : CHRScopes) {
+    transformScopes(Scope, TrivialPHIs);
+    CHR_DEBUG(
+        std::ostringstream oss;
+        oss << " after transformScopes " << I++;
+        dumpIR(F, oss.str().c_str(), nullptr));
+    (void)I;
+  }
+}
+
+static void LLVM_ATTRIBUTE_UNUSED
+dumpScopes(SmallVectorImpl<CHRScope *> &Scopes, const char *Label) {
+  dbgs() << Label << " " << Scopes.size() << "\n";
+  for (CHRScope *Scope : Scopes) {
+    dbgs() << *Scope << "\n";
+  }
+}
+
+bool CHR::run() {
+  if (!shouldApply(F, PSI))
+    return false;
+
+  CHR_DEBUG(dumpIR(F, "before", nullptr));
+
+  bool Changed = false;
+  {
+    CHR_DEBUG(
+        dbgs() << "RegionInfo:\n";
+        RI.print(dbgs()));
+
+    // Recursively traverse the region tree and find regions that have biased
+    // branches and/or selects and create scopes.
+    SmallVector<CHRScope *, 8> AllScopes;
+    findScopes(AllScopes);
+    CHR_DEBUG(dumpScopes(AllScopes, "All scopes"));
+
+    // Split the scopes if 1) the condition values of the biased
+    // branches/selects of the inner/lower scope can't be hoisted up to the
+    // outermost/uppermost scope entry, or 2) the condition values of the
+    // biased branches/selects in a scope (including subscopes) don't share at
+    // least one common value.
+    SmallVector<CHRScope *, 8> SplitScopes;
+    splitScopes(AllScopes, SplitScopes);
+    CHR_DEBUG(dumpScopes(SplitScopes, "Split scopes"));
+
+    // After splitting, set the biased regions and selects of a scope (a tree
+    // root) that include those of the subscopes.
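+    // E.g. (illustrative) a true-biased branch found in a subscope is
+    // recorded in the root scope's TrueBiasedRegions, so later phases can
+    // treat the whole scope tree as one unit.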
+    classifyBiasedScopes(SplitScopes);
+    CHR_DEBUG(dbgs() << "Set per-scope bias " << SplitScopes.size() << "\n");
+
+    // Filter out the scopes that have fewer than CHRMergeThreshold biased
+    // branches or selects (CHR isn't useful in such a case).
+    SmallVector<CHRScope *, 8> FilteredScopes;
+    filterScopes(SplitScopes, FilteredScopes);
+    CHR_DEBUG(dumpScopes(FilteredScopes, "Filtered scopes"));
+
+    // Set the regions to be CHR'ed and their hoist stops for each scope.
+    SmallVector<CHRScope *, 8> SetScopes;
+    setCHRRegions(FilteredScopes, SetScopes);
+    CHR_DEBUG(dumpScopes(SetScopes, "Set CHR regions"));
+
+    // Sort CHRScopes by the depth so that outer CHRScopes come before inner
+    // ones. We need to apply CHR from outer to inner so that we apply CHR only
+    // to the hot path, rather than both hot and cold paths.
+    SmallVector<CHRScope *, 8> SortedScopes;
+    sortScopes(SetScopes, SortedScopes);
+    CHR_DEBUG(dumpScopes(SortedScopes, "Sorted scopes"));
+
+    CHR_DEBUG(
+        dbgs() << "RegionInfo:\n";
+        RI.print(dbgs()));
+
+    // Apply the CHR transformation.
+    if (!SortedScopes.empty()) {
+      transformScopes(SortedScopes);
+      Changed = true;
+    }
+  }
+
+  if (Changed) {
+    CHR_DEBUG(dumpIR(F, "after", &Stats));
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "Stats", &F)
+          << ore::NV("Function", &F) << " "
+          << "Reduced the number of branches in hot paths by "
+          << ore::NV("NumBranchesDelta", Stats.NumBranchesDelta)
+          << " (static) and "
+          << ore::NV("WeightedNumBranchesDelta", Stats.WeightedNumBranchesDelta)
+          << " (weighted by PGO count)";
+    });
+  }
+
+  return Changed;
+}
+
+bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) {
+  BlockFrequencyInfo &BFI =
+      getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  ProfileSummaryInfo &PSI =
+      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo();
+  std::unique_ptr<OptimizationRemarkEmitter> OwnedORE =
+      std::make_unique<OptimizationRemarkEmitter>(&F);
+  return CHR(F, BFI, DT, PSI, RI, *OwnedORE).run();
+}
+
+namespace llvm {
+
+ControlHeightReductionPass::ControlHeightReductionPass() {
+  parseCHRFilterFiles();
+}
+
+PreservedAnalyses ControlHeightReductionPass::run(
+    Function &F,
+    FunctionAnalysisManager &FAM) {
+  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+  auto &MAM = MAMProxy.getManager();
+  auto &PSI = *MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+  auto &RI = FAM.getResult<RegionInfoAnalysis>(F);
+  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  bool Changed = CHR(F, BFI, DT, PSI, RI, ORE).run();
+  if (!Changed)
+    return PreservedAnalyses::all();
+  auto PA = PreservedAnalyses();
+  PA.preserve<GlobalsAA>();
+  return PA;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
new file mode 100644
index 000000000000..c0353cba0b2f
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -0,0 +1,1778 @@
+//===- DataFlowSanitizer.cpp - dynamic data flow analysis -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow +/// analysis. +/// +/// Unlike other Sanitizer tools, this tool is not designed to detect a specific +/// class of bugs on its own. Instead, it provides a generic dynamic data flow +/// analysis framework to be used by clients to help detect application-specific +/// issues within their own code. +/// +/// The analysis is based on automatic propagation of data flow labels (also +/// known as taint labels) through a program as it performs computation. Each +/// byte of application memory is backed by two bytes of shadow memory which +/// hold the label. On Linux/x86_64, memory is laid out as follows: +/// +/// +--------------------+ 0x800000000000 (top of memory) +/// | application memory | +/// +--------------------+ 0x700000008000 (kAppAddr) +/// | | +/// | unused | +/// | | +/// +--------------------+ 0x200200000000 (kUnusedAddr) +/// | union table | +/// +--------------------+ 0x200000000000 (kUnionTableAddr) +/// | shadow memory | +/// +--------------------+ 0x000000010000 (kShadowAddr) +/// | reserved by kernel | +/// +--------------------+ 0x000000000000 +/// +/// To derive a shadow memory address from an application memory address, +/// bits 44-46 are cleared to bring the address into the range +/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to +/// account for the double byte representation of shadow labels and move the +/// address into the shadow memory range. See the function +/// DataFlowSanitizer::getShadowAddress below. +/// +/// For more information, please refer to the design document: +/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SpecialCaseList.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include 
<iterator>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+// External symbol to be used when generating the shadow address for
+// architectures with multiple VMAs. Instead of using a constant integer,
+// the runtime will set the external mask based on the VMA range.
+static const char *const kDFSanExternShadowPtrMask = "__dfsan_shadow_ptr_mask";
+
+// The -dfsan-preserve-alignment flag controls whether this pass assumes that
+// alignment requirements provided by the input IR are correct. For example,
+// if the input IR contains a load with alignment 8, this flag will cause
+// the shadow load to have alignment 16. This flag is disabled by default as
+// we have unfortunately encountered too much code (including Clang itself;
+// see PR14291) which performs misaligned access.
+static cl::opt<bool> ClPreserveAlignment(
+    "dfsan-preserve-alignment",
+    cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
+    cl::init(false));
+
+// The ABI list files control how shadow parameters are passed. The pass treats
+// every function labelled "uninstrumented" in the ABI list file as conforming
+// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
+// additional annotations for those functions, a call to one of those functions
+// will produce a warning message, as the labelling behaviour of the function
+// is unknown. The other supported annotations are "functional" and "discard",
+// which are described below under DataFlowSanitizer::WrapperKind.
+static cl::list<std::string> ClABIListFiles(
+    "dfsan-abilist",
+    cl::desc("File listing native ABI functions and how the pass treats them"),
+    cl::Hidden);
+
+// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
+// functions (see DataFlowSanitizer::InstrumentedABI below).
+static cl::opt<bool> ClArgsABI(
+    "dfsan-args-abi",
+    cl::desc("Use the argument ABI rather than the TLS ABI"),
+    cl::Hidden);
+
+// Controls whether the pass includes or ignores the labels of pointers in load
+// instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnLoad(
+    "dfsan-combine-pointer-labels-on-load",
+    cl::desc("Combine the label of the pointer with the label of the data when "
+             "loading from memory."),
+    cl::Hidden, cl::init(true));
+
+// Controls whether the pass includes or ignores the labels of pointers in
+// store instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnStore(
+    "dfsan-combine-pointer-labels-on-store",
+    cl::desc("Combine the label of the pointer with the label of the data when "
+             "storing in memory."),
+    cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDebugNonzeroLabels(
+    "dfsan-debug-nonzero-labels",
+    cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
+             "load or return with a nonzero label"),
+    cl::Hidden);
+
+static StringRef GetGlobalTypeString(const GlobalValue &G) {
+  // Types of GlobalVariables are always pointer types.
+  Type *GType = G.getValueType();
+  // For now we support blacklisting struct types only.
+  if (StructType *SGType = dyn_cast<StructType>(GType)) {
+    if (!SGType->isLiteral())
+      return SGType->getName();
+  }
+  return "<unknown type>";
+}
+
+namespace {
+
+class DFSanABIList {
+  std::unique_ptr<SpecialCaseList> SCL;
+
+ public:
+  DFSanABIList() = default;
+
+  void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
+
+  /// Returns whether either this function or its source file is listed in the
+  /// given category.
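+  ///
+  /// For example (illustrative), an ABI list containing the line
+  ///   fun:memcpy=uninstrumented
+  /// puts memcpy in the "uninstrumented" category for this query.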
+ bool isIn(const Function &F, StringRef Category) const { + return isIn(*F.getParent(), Category) || + SCL->inSection("dataflow", "fun", F.getName(), Category); + } + + /// Returns whether this global alias is listed in the given category. + /// + /// If GA aliases a function, the alias's name is matched as a function name + /// would be. Similarly, aliases of globals are matched like globals. + bool isIn(const GlobalAlias &GA, StringRef Category) const { + if (isIn(*GA.getParent(), Category)) + return true; + + if (isa<FunctionType>(GA.getValueType())) + return SCL->inSection("dataflow", "fun", GA.getName(), Category); + + return SCL->inSection("dataflow", "global", GA.getName(), Category) || + SCL->inSection("dataflow", "type", GetGlobalTypeString(GA), + Category); + } + + /// Returns whether this module is listed in the given category. + bool isIn(const Module &M, StringRef Category) const { + return SCL->inSection("dataflow", "src", M.getModuleIdentifier(), Category); + } +}; + +/// TransformedFunction is used to express the result of transforming one +/// function type into another. This struct is immutable. It holds metadata +/// useful for updating calls of the old function to the new type. +struct TransformedFunction { + TransformedFunction(FunctionType* OriginalType, + FunctionType* TransformedType, + std::vector<unsigned> ArgumentIndexMapping) + : OriginalType(OriginalType), + TransformedType(TransformedType), + ArgumentIndexMapping(ArgumentIndexMapping) {} + + // Disallow copies. + TransformedFunction(const TransformedFunction&) = delete; + TransformedFunction& operator=(const TransformedFunction&) = delete; + + // Allow moves. + TransformedFunction(TransformedFunction&&) = default; + TransformedFunction& operator=(TransformedFunction&&) = default; + + /// Type of the function before the transformation. + FunctionType *OriginalType; + + /// Type of the function after the transformation. + FunctionType *TransformedType; + + /// Transforming a function may change the position of arguments. This + /// member records the mapping from each argument's old position to its new + /// position. Argument positions are zero-indexed. If the transformation + /// from F to F' made the first argument of F into the third argument of F', + /// then ArgumentIndexMapping[0] will equal 2. + std::vector<unsigned> ArgumentIndexMapping; +}; + +/// Given function attributes from a call site for the original function, +/// return function attributes appropriate for a call to the transformed +/// function. +AttributeList TransformFunctionAttributes( + const TransformedFunction& TransformedFunction, + LLVMContext& Ctx, AttributeList CallSiteAttrs) { + + // Construct a vector of AttributeSet for each function argument. + std::vector<llvm::AttributeSet> ArgumentAttributes( + TransformedFunction.TransformedType->getNumParams()); + + // Copy attributes from the parameter of the original function to the + // transformed version. 'ArgumentIndexMapping' holds the mapping from + // old argument position to new. + for (unsigned i=0, ie = TransformedFunction.ArgumentIndexMapping.size(); + i < ie; ++i) { + unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[i]; + ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(i); + } + + // Copy annotations on varargs arguments. 
+ for (unsigned i = TransformedFunction.OriginalType->getNumParams(), + ie = CallSiteAttrs.getNumAttrSets(); i<ie; ++i) { + ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(i)); + } + + return AttributeList::get( + Ctx, + CallSiteAttrs.getFnAttributes(), + CallSiteAttrs.getRetAttributes(), + llvm::makeArrayRef(ArgumentAttributes)); +} + +class DataFlowSanitizer : public ModulePass { + friend struct DFSanFunction; + friend class DFSanVisitor; + + enum { + ShadowWidth = 16 + }; + + /// Which ABI should be used for instrumented functions? + enum InstrumentedABI { + /// Argument and return value labels are passed through additional + /// arguments and by modifying the return type. + IA_Args, + + /// Argument and return value labels are passed through TLS variables + /// __dfsan_arg_tls and __dfsan_retval_tls. + IA_TLS + }; + + /// How should calls to uninstrumented functions be handled? + enum WrapperKind { + /// This function is present in an uninstrumented form but we don't know + /// how it should be handled. Print a warning and call the function anyway. + /// Don't label the return value. + WK_Warning, + + /// This function does not write to (user-accessible) memory, and its return + /// value is unlabelled. + WK_Discard, + + /// This function does not write to (user-accessible) memory, and the label + /// of its return value is the union of the label of its arguments. + WK_Functional, + + /// Instead of calling the function, a custom wrapper __dfsw_F is called, + /// where F is the name of the function. This function may wrap the + /// original function or provide its own implementation. This is similar to + /// the IA_Args ABI, except that IA_Args uses a struct return type to + /// pass the return value shadow in a register, while WK_Custom uses an + /// extra pointer argument to return the shadow. This allows the wrapped + /// form of the function type to be expressed in C. 
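+    ///
+    /// For example (illustrative), for "int f(int)" the custom wrapper can be
+    /// declared in C as
+    ///   int __dfsw_f(int x, dfsan_label x_label, dfsan_label *ret_label);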
+ WK_Custom + }; + + Module *Mod; + LLVMContext *Ctx; + IntegerType *ShadowTy; + PointerType *ShadowPtrTy; + IntegerType *IntptrTy; + ConstantInt *ZeroShadow; + ConstantInt *ShadowPtrMask; + ConstantInt *ShadowPtrMul; + Constant *ArgTLS; + Constant *RetvalTLS; + void *(*GetArgTLSPtr)(); + void *(*GetRetvalTLSPtr)(); + FunctionType *GetArgTLSTy; + FunctionType *GetRetvalTLSTy; + Constant *GetArgTLS; + Constant *GetRetvalTLS; + Constant *ExternalShadowMask; + FunctionType *DFSanUnionFnTy; + FunctionType *DFSanUnionLoadFnTy; + FunctionType *DFSanUnimplementedFnTy; + FunctionType *DFSanSetLabelFnTy; + FunctionType *DFSanNonzeroLabelFnTy; + FunctionType *DFSanVarargWrapperFnTy; + FunctionCallee DFSanUnionFn; + FunctionCallee DFSanCheckedUnionFn; + FunctionCallee DFSanUnionLoadFn; + FunctionCallee DFSanUnimplementedFn; + FunctionCallee DFSanSetLabelFn; + FunctionCallee DFSanNonzeroLabelFn; + FunctionCallee DFSanVarargWrapperFn; + MDNode *ColdCallWeights; + DFSanABIList ABIList; + DenseMap<Value *, Function *> UnwrappedFnMap; + AttrBuilder ReadOnlyNoneAttrs; + bool DFSanRuntimeShadowMask = false; + + Value *getShadowAddress(Value *Addr, Instruction *Pos); + bool isInstrumented(const Function *F); + bool isInstrumented(const GlobalAlias *GA); + FunctionType *getArgsFunctionType(FunctionType *T); + FunctionType *getTrampolineFunctionType(FunctionType *T); + TransformedFunction getCustomFunctionType(FunctionType *T); + InstrumentedABI getInstrumentedABI(); + WrapperKind getWrapperKind(Function *F); + void addGlobalNamePrefix(GlobalValue *GV); + Function *buildWrapperFunction(Function *F, StringRef NewFName, + GlobalValue::LinkageTypes NewFLink, + FunctionType *NewFT); + Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName); + +public: + static char ID; + + DataFlowSanitizer( + const std::vector<std::string> &ABIListFiles = std::vector<std::string>(), + void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr); + + bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; +}; + +struct DFSanFunction { + DataFlowSanitizer &DFS; + Function *F; + DominatorTree DT; + DataFlowSanitizer::InstrumentedABI IA; + bool IsNativeABI; + Value *ArgTLSPtr = nullptr; + Value *RetvalTLSPtr = nullptr; + AllocaInst *LabelReturnAlloca = nullptr; + DenseMap<Value *, Value *> ValShadowMap; + DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap; + std::vector<std::pair<PHINode *, PHINode *>> PHIFixups; + DenseSet<Instruction *> SkipInsts; + std::vector<Value *> NonZeroChecks; + bool AvoidNewBlocks; + + struct CachedCombinedShadow { + BasicBlock *Block; + Value *Shadow; + }; + DenseMap<std::pair<Value *, Value *>, CachedCombinedShadow> + CachedCombinedShadows; + DenseMap<Value *, std::set<Value *>> ShadowElements; + + DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI) + : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), IsNativeABI(IsNativeABI) { + DT.recalculate(*F); + // FIXME: Need to track down the register allocator issue which causes poor + // performance in pathological cases with large numbers of basic blocks. 
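+    // Illustrative note (an inference from combineShadows/loadShadow below):
+    // when AvoidNewBlocks is set, shadow unions are emitted as plain calls
+    // to the runtime's checked dfsan_union helper instead of block-splitting
+    // inline checks, and loadShadow skips its block-splitting fast path, so
+    // the CFG stays close to its input size.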
+ AvoidNewBlocks = F->size() > 1000; + } + + Value *getArgTLSPtr(); + Value *getArgTLS(unsigned Index, Instruction *Pos); + Value *getRetvalTLS(); + Value *getShadow(Value *V); + void setShadow(Instruction *I, Value *Shadow); + Value *combineShadows(Value *V1, Value *V2, Instruction *Pos); + Value *combineOperandShadows(Instruction *Inst); + Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align, + Instruction *Pos); + void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow, + Instruction *Pos); +}; + +class DFSanVisitor : public InstVisitor<DFSanVisitor> { +public: + DFSanFunction &DFSF; + + DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {} + + const DataLayout &getDataLayout() const { + return DFSF.F->getParent()->getDataLayout(); + } + + void visitOperandShadowInst(Instruction &I); + void visitUnaryOperator(UnaryOperator &UO); + void visitBinaryOperator(BinaryOperator &BO); + void visitCastInst(CastInst &CI); + void visitCmpInst(CmpInst &CI); + void visitGetElementPtrInst(GetElementPtrInst &GEPI); + void visitLoadInst(LoadInst &LI); + void visitStoreInst(StoreInst &SI); + void visitReturnInst(ReturnInst &RI); + void visitCallSite(CallSite CS); + void visitPHINode(PHINode &PN); + void visitExtractElementInst(ExtractElementInst &I); + void visitInsertElementInst(InsertElementInst &I); + void visitShuffleVectorInst(ShuffleVectorInst &I); + void visitExtractValueInst(ExtractValueInst &I); + void visitInsertValueInst(InsertValueInst &I); + void visitAllocaInst(AllocaInst &I); + void visitSelectInst(SelectInst &I); + void visitMemSetInst(MemSetInst &I); + void visitMemTransferInst(MemTransferInst &I); +}; + +} // end anonymous namespace + +char DataFlowSanitizer::ID; + +INITIALIZE_PASS(DataFlowSanitizer, "dfsan", + "DataFlowSanitizer: dynamic data flow analysis.", false, false) + +ModulePass * +llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles, + void *(*getArgTLS)(), + void *(*getRetValTLS)()) { + return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS); +} + +DataFlowSanitizer::DataFlowSanitizer( + const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(), + void *(*getRetValTLS)()) + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { + std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); + AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), + ClABIListFiles.end()); + ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles)); +} + +FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { + SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end()); + ArgTypes.append(T->getNumParams(), ShadowTy); + if (T->isVarArg()) + ArgTypes.push_back(ShadowPtrTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + RetType = StructType::get(RetType, ShadowTy); + return FunctionType::get(RetType, ArgTypes, T->isVarArg()); +} + +FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) { + assert(!T->isVarArg()); + SmallVector<Type *, 4> ArgTypes; + ArgTypes.push_back(T->getPointerTo()); + ArgTypes.append(T->param_begin(), T->param_end()); + ArgTypes.append(T->getNumParams(), ShadowTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + ArgTypes.push_back(ShadowPtrTy); + return FunctionType::get(T->getReturnType(), ArgTypes, false); +} + +TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { + SmallVector<Type *, 4> ArgTypes; + + // Some parameters of the custom function 
being constructed are + // parameters of T. Record the mapping from parameters of T to + // parameters of the custom function, so that parameter attributes + // at call sites can be updated. + std::vector<unsigned> ArgumentIndexMapping; + for (unsigned i = 0, ie = T->getNumParams(); i != ie; ++i) { + Type* param_type = T->getParamType(i); + FunctionType *FT; + if (isa<PointerType>(param_type) && (FT = dyn_cast<FunctionType>( + cast<PointerType>(param_type)->getElementType()))) { + ArgumentIndexMapping.push_back(ArgTypes.size()); + ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo()); + ArgTypes.push_back(Type::getInt8PtrTy(*Ctx)); + } else { + ArgumentIndexMapping.push_back(ArgTypes.size()); + ArgTypes.push_back(param_type); + } + } + for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) + ArgTypes.push_back(ShadowTy); + if (T->isVarArg()) + ArgTypes.push_back(ShadowPtrTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + ArgTypes.push_back(ShadowPtrTy); + return TransformedFunction( + T, FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg()), + ArgumentIndexMapping); +} + +bool DataFlowSanitizer::doInitialization(Module &M) { + Triple TargetTriple(M.getTargetTriple()); + bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64; + bool IsMIPS64 = TargetTriple.isMIPS64(); + bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64 || + TargetTriple.getArch() == Triple::aarch64_be; + + const DataLayout &DL = M.getDataLayout(); + + Mod = &M; + Ctx = &M.getContext(); + ShadowTy = IntegerType::get(*Ctx, ShadowWidth); + ShadowPtrTy = PointerType::getUnqual(ShadowTy); + IntptrTy = DL.getIntPtrType(*Ctx); + ZeroShadow = ConstantInt::getSigned(ShadowTy, 0); + ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8); + if (IsX86_64) + ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); + else if (IsMIPS64) + ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); + // AArch64 supports multiple VMAs and the shadow mask is set at runtime. 
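+  // In every case the computed shadow address is (Addr & Mask) *
+  // (ShadowWidth / 8); worked example (illustrative) for x86_64:
+  //   shadow(0x700000001000) == (0x700000001000 & ~0x700000000000) * 2
+  //                          == 0x2000.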
+ else if (IsAArch64) + DFSanRuntimeShadowMask = true; + else + report_fatal_error("unsupported triple"); + + Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy }; + DFSanUnionFnTy = + FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false); + Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy }; + DFSanUnionLoadFnTy = + FunctionType::get(ShadowTy, DFSanUnionLoadArgs, /*isVarArg=*/ false); + DFSanUnimplementedFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy }; + DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx), + DFSanSetLabelArgs, /*isVarArg=*/false); + DFSanNonzeroLabelFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), None, /*isVarArg=*/false); + DFSanVarargWrapperFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + + if (GetArgTLSPtr) { + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = nullptr; + GetArgTLSTy = FunctionType::get(PointerType::getUnqual(ArgTLSTy), false); + GetArgTLS = ConstantExpr::getIntToPtr( + ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), + PointerType::getUnqual(GetArgTLSTy)); + } + if (GetRetvalTLSPtr) { + RetvalTLS = nullptr; + GetRetvalTLSTy = FunctionType::get(PointerType::getUnqual(ShadowTy), false); + GetRetvalTLS = ConstantExpr::getIntToPtr( + ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), + PointerType::getUnqual(GetRetvalTLSTy)); + } + + ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); + return true; +} + +bool DataFlowSanitizer::isInstrumented(const Function *F) { + return !ABIList.isIn(*F, "uninstrumented"); +} + +bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) { + return !ABIList.isIn(*GA, "uninstrumented"); +} + +DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() { + return ClArgsABI ? IA_Args : IA_TLS; +} + +DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) { + if (ABIList.isIn(*F, "functional")) + return WK_Functional; + if (ABIList.isIn(*F, "discard")) + return WK_Discard; + if (ABIList.isIn(*F, "custom")) + return WK_Custom; + + return WK_Warning; +} + +void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) { + std::string GVName = GV->getName(), Prefix = "dfs$"; + GV->setName(Prefix + GVName); + + // Try to change the name of the function in module inline asm. We only do + // this for specific asm directives, currently only ".symver", to try to avoid + // corrupting asm which happens to contain the symbol name as a substring. + // Note that the substitution for .symver assumes that the versioned symbol + // also has an instrumented name. 
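+  // Illustrative example (assuming a versioned symbol "f"): the directive
+  //   .symver f,f@@VERSION_1
+  // is rewritten to
+  //   .symver dfs$f,dfs$f@@VERSION_1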
+ std::string Asm = GV->getParent()->getModuleInlineAsm(); + std::string SearchStr = ".symver " + GVName + ","; + size_t Pos = Asm.find(SearchStr); + if (Pos != std::string::npos) { + Asm.replace(Pos, SearchStr.size(), + ".symver " + Prefix + GVName + "," + Prefix); + GV->getParent()->setModuleInlineAsm(Asm); + } +} + +Function * +DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, + GlobalValue::LinkageTypes NewFLink, + FunctionType *NewFT) { + FunctionType *FT = F->getFunctionType(); + Function *NewF = Function::Create(NewFT, NewFLink, F->getAddressSpace(), + NewFName, F->getParent()); + NewF->copyAttributesFrom(F); + NewF->removeAttributes( + AttributeList::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType())); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); + if (F->isVarArg()) { + NewF->removeAttributes(AttributeList::FunctionIndex, + AttrBuilder().addAttribute("split-stack")); + CallInst::Create(DFSanVarargWrapperFn, + IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "", + BB); + new UnreachableInst(*Ctx, BB); + } else { + std::vector<Value *> Args; + unsigned n = FT->getNumParams(); + for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n) + Args.push_back(&*ai); + CallInst *CI = CallInst::Create(F, Args, "", BB); + if (FT->getReturnType()->isVoidTy()) + ReturnInst::Create(*Ctx, BB); + else + ReturnInst::Create(*Ctx, CI, BB); + } + + return NewF; +} + +Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, + StringRef FName) { + FunctionType *FTT = getTrampolineFunctionType(FT); + FunctionCallee C = Mod->getOrInsertFunction(FName, FTT); + Function *F = dyn_cast<Function>(C.getCallee()); + if (F && F->isDeclaration()) { + F->setLinkage(GlobalValue::LinkOnceODRLinkage); + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F); + std::vector<Value *> Args; + Function::arg_iterator AI = F->arg_begin(); ++AI; + for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N) + Args.push_back(&*AI); + CallInst *CI = CallInst::Create(FT, &*F->arg_begin(), Args, "", BB); + ReturnInst *RI; + if (FT->getReturnType()->isVoidTy()) + RI = ReturnInst::Create(*Ctx, BB); + else + RI = ReturnInst::Create(*Ctx, CI, BB); + + DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true); + Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; + for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) + DFSF.ValShadowMap[&*ValAI] = &*ShadowAI; + DFSanVisitor(DFSF).visitCallInst(*CI); + if (!FT->getReturnType()->isVoidTy()) + new StoreInst(DFSF.getShadow(RI->getReturnValue()), + &*std::prev(F->arg_end()), RI); + } + + return cast<Constant>(C.getCallee()); +} + +bool DataFlowSanitizer::runOnModule(Module &M) { + if (ABIList.isIn(M, "skip")) + return false; + + if (!GetArgTLSPtr) { + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); + if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); + } + if (!GetRetvalTLSPtr) { + RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy); + if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); + } + + ExternalShadowMask = + Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy); + + { + AttributeList AL; + AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex, + Attribute::NoUnwind); + AL = AL.addAttribute(M.getContext(), 
AttributeList::FunctionIndex, + Attribute::ReadNone); + AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex, + Attribute::ZExt); + AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt); + AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt); + DFSanUnionFn = + Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy, AL); + } + + { + AttributeList AL; + AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex, + Attribute::NoUnwind); + AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex, + Attribute::ReadNone); + AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex, + Attribute::ZExt); + AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt); + AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt); + DFSanCheckedUnionFn = + Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy, AL); + } + { + AttributeList AL; + AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex, + Attribute::NoUnwind); + AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex, + Attribute::ReadOnly); + AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex, + Attribute::ZExt); + DFSanUnionLoadFn = + Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy, AL); + } + DFSanUnimplementedFn = + Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy); + { + AttributeList AL; + AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt); + DFSanSetLabelFn = + Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy, AL); + } + DFSanNonzeroLabelFn = + Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy); + DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper", + DFSanVarargWrapperFnTy); + + std::vector<Function *> FnsToInstrument; + SmallPtrSet<Function *, 2> FnsWithNativeABI; + for (Function &i : M) { + if (!i.isIntrinsic() && + &i != DFSanUnionFn.getCallee()->stripPointerCasts() && + &i != DFSanCheckedUnionFn.getCallee()->stripPointerCasts() && + &i != DFSanUnionLoadFn.getCallee()->stripPointerCasts() && + &i != DFSanUnimplementedFn.getCallee()->stripPointerCasts() && + &i != DFSanSetLabelFn.getCallee()->stripPointerCasts() && + &i != DFSanNonzeroLabelFn.getCallee()->stripPointerCasts() && + &i != DFSanVarargWrapperFn.getCallee()->stripPointerCasts()) + FnsToInstrument.push_back(&i); + } + + // Give function aliases prefixes when necessary, and build wrappers where the + // instrumentedness is inconsistent. + for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) { + GlobalAlias *GA = &*i; + ++i; + // Don't stop on weak. We assume people aren't playing games with the + // instrumentedness of overridden weak aliases. + if (auto F = dyn_cast<Function>(GA->getBaseObject())) { + bool GAInst = isInstrumented(GA), FInst = isInstrumented(F); + if (GAInst && FInst) { + addGlobalNamePrefix(GA); + } else if (GAInst != FInst) { + // Non-instrumented alias of an instrumented function, or vice versa. + // Replace the alias with a native-ABI wrapper of the aliasee. The pass + // below will take care of instrumenting it. + Function *NewF = + buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType()); + GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType())); + NewF->takeName(GA); + GA->eraseFromParent(); + FnsToInstrument.push_back(NewF); + } + } + } + + ReadOnlyNoneAttrs.addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::ReadNone); + + // First, change the ABI of every function in the module. 
ABI-listed + // functions keep their original ABI and get a wrapper function. + for (std::vector<Function *>::iterator i = FnsToInstrument.begin(), + e = FnsToInstrument.end(); + i != e; ++i) { + Function &F = **i; + FunctionType *FT = F.getFunctionType(); + + bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() && + FT->getReturnType()->isVoidTy()); + + if (isInstrumented(&F)) { + // Instrumented functions get a 'dfs$' prefix. This allows us to more + // easily identify cases of mismatching ABIs. + if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) { + FunctionType *NewFT = getArgsFunctionType(FT); + Function *NewF = Function::Create(NewFT, F.getLinkage(), + F.getAddressSpace(), "", &M); + NewF->copyAttributesFrom(&F); + NewF->removeAttributes( + AttributeList::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType())); + for (Function::arg_iterator FArg = F.arg_begin(), + NewFArg = NewF->arg_begin(), + FArgEnd = F.arg_end(); + FArg != FArgEnd; ++FArg, ++NewFArg) { + FArg->replaceAllUsesWith(&*NewFArg); + } + NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); + + for (Function::user_iterator UI = F.user_begin(), UE = F.user_end(); + UI != UE;) { + BlockAddress *BA = dyn_cast<BlockAddress>(*UI); + ++UI; + if (BA) { + BA->replaceAllUsesWith( + BlockAddress::get(NewF, BA->getBasicBlock())); + delete BA; + } + } + F.replaceAllUsesWith( + ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT))); + NewF->takeName(&F); + F.eraseFromParent(); + *i = NewF; + addGlobalNamePrefix(NewF); + } else { + addGlobalNamePrefix(&F); + } + } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) { + // Build a wrapper function for F. The wrapper simply calls F, and is + // added to FnsToInstrument so that any instrumentation according to its + // WrapperKind is done in the second pass below. + FunctionType *NewFT = getInstrumentedABI() == IA_Args + ? getArgsFunctionType(FT) + : FT; + + // If the function being wrapped has local linkage, then preserve the + // function's linkage in the wrapper function. + GlobalValue::LinkageTypes wrapperLinkage = + F.hasLocalLinkage() + ? F.getLinkage() + : GlobalValue::LinkOnceODRLinkage; + + Function *NewF = buildWrapperFunction( + &F, std::string("dfsw$") + std::string(F.getName()), + wrapperLinkage, NewFT); + if (getInstrumentedABI() == IA_TLS) + NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs); + + Value *WrappedFnCst = + ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)); + F.replaceAllUsesWith(WrappedFnCst); + + UnwrappedFnMap[WrappedFnCst] = &F; + *i = NewF; + + if (!F.isDeclaration()) { + // This function is probably defining an interposition of an + // uninstrumented function and hence needs to keep the original ABI. + // But any functions it may call need to use the instrumented ABI, so + // we instrument it in a mode which preserves the original ABI. + FnsWithNativeABI.insert(&F); + + // This code needs to rebuild the iterators, as they may be invalidated + // by the push_back, taking care that the new range does not include + // any functions added by this code. + size_t N = i - FnsToInstrument.begin(), + Count = e - FnsToInstrument.begin(); + FnsToInstrument.push_back(&F); + i = FnsToInstrument.begin() + N; + e = FnsToInstrument.begin() + Count; + } + // Hopefully, nobody will try to indirectly call a vararg + // function... yet. 
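+      // Illustrative note (an inference, not an original comment): an
+      // indirect call through a vararg function pointer cannot be redirected
+      // by this pass, since the call-site rewriting below is keyed on the
+      // callee Function itself; such a call would reach the uninstrumented
+      // callee with no shadow arguments.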
+ } else if (FT->isVarArg()) { + UnwrappedFnMap[&F] = &F; + *i = nullptr; + } + } + + for (Function *i : FnsToInstrument) { + if (!i || i->isDeclaration()) + continue; + + removeUnreachableBlocks(*i); + + DFSanFunction DFSF(*this, i, FnsWithNativeABI.count(i)); + + // DFSanVisitor may create new basic blocks, which confuses df_iterator. + // Build a copy of the list before iterating over it. + SmallVector<BasicBlock *, 4> BBList(depth_first(&i->getEntryBlock())); + + for (BasicBlock *i : BBList) { + Instruction *Inst = &i->front(); + while (true) { + // DFSanVisitor may split the current basic block, changing the current + // instruction's next pointer and moving the next instruction to the + // tail block from which we should continue. + Instruction *Next = Inst->getNextNode(); + // DFSanVisitor may delete Inst, so keep track of whether it was a + // terminator. + bool IsTerminator = Inst->isTerminator(); + if (!DFSF.SkipInsts.count(Inst)) + DFSanVisitor(DFSF).visit(Inst); + if (IsTerminator) + break; + Inst = Next; + } + } + + // We will not necessarily be able to compute the shadow for every phi node + // until we have visited every block. Therefore, the code that handles phi + // nodes adds them to the PHIFixups list so that they can be properly + // handled here. + for (std::vector<std::pair<PHINode *, PHINode *>>::iterator + i = DFSF.PHIFixups.begin(), + e = DFSF.PHIFixups.end(); + i != e; ++i) { + for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n; + ++val) { + i->second->setIncomingValue( + val, DFSF.getShadow(i->first->getIncomingValue(val))); + } + } + + // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy + // places (i.e. instructions in basic blocks we haven't even begun visiting + // yet). To make our life easier, do this work in a pass after the main + // instrumentation. 
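+    // Shape of the emitted check, sketched (illustrative): for each tracked
+    // shadow value S this inserts the equivalent of
+    //   if (S != 0) __dfsan_nonzero_label();
+    // on a branch weighted as cold, so the common zero-label path stays
+    // cheap.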
+ if (ClDebugNonzeroLabels) { + for (Value *V : DFSF.NonZeroChecks) { + Instruction *Pos; + if (Instruction *I = dyn_cast<Instruction>(V)) + Pos = I->getNextNode(); + else + Pos = &DFSF.F->getEntryBlock().front(); + while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos)) + Pos = Pos->getNextNode(); + IRBuilder<> IRB(Pos); + Value *Ne = IRB.CreateICmpNE(V, DFSF.DFS.ZeroShadow); + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + Ne, Pos, /*Unreachable=*/false, ColdCallWeights)); + IRBuilder<> ThenIRB(BI); + ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {}); + } + } + } + + return false; +} + +Value *DFSanFunction::getArgTLSPtr() { + if (ArgTLSPtr) + return ArgTLSPtr; + if (DFS.ArgTLS) + return ArgTLSPtr = DFS.ArgTLS; + + IRBuilder<> IRB(&F->getEntryBlock().front()); + return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLSTy, DFS.GetArgTLS, {}); +} + +Value *DFSanFunction::getRetvalTLS() { + if (RetvalTLSPtr) + return RetvalTLSPtr; + if (DFS.RetvalTLS) + return RetvalTLSPtr = DFS.RetvalTLS; + + IRBuilder<> IRB(&F->getEntryBlock().front()); + return RetvalTLSPtr = + IRB.CreateCall(DFS.GetRetvalTLSTy, DFS.GetRetvalTLS, {}); +} + +Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) { + IRBuilder<> IRB(Pos); + return IRB.CreateConstGEP2_64(ArrayType::get(DFS.ShadowTy, 64), + getArgTLSPtr(), 0, Idx); +} + +Value *DFSanFunction::getShadow(Value *V) { + if (!isa<Argument>(V) && !isa<Instruction>(V)) + return DFS.ZeroShadow; + Value *&Shadow = ValShadowMap[V]; + if (!Shadow) { + if (Argument *A = dyn_cast<Argument>(V)) { + if (IsNativeABI) + return DFS.ZeroShadow; + switch (IA) { + case DataFlowSanitizer::IA_TLS: { + Value *ArgTLSPtr = getArgTLSPtr(); + Instruction *ArgTLSPos = + DFS.ArgTLS ? &*F->getEntryBlock().begin() + : cast<Instruction>(ArgTLSPtr)->getNextNode(); + IRBuilder<> IRB(ArgTLSPos); + Shadow = + IRB.CreateLoad(DFS.ShadowTy, getArgTLS(A->getArgNo(), ArgTLSPos)); + break; + } + case DataFlowSanitizer::IA_Args: { + unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2; + Function::arg_iterator i = F->arg_begin(); + while (ArgIdx--) + ++i; + Shadow = &*i; + assert(Shadow->getType() == DFS.ShadowTy); + break; + } + } + NonZeroChecks.push_back(Shadow); + } else { + Shadow = DFS.ZeroShadow; + } + } + return Shadow; +} + +void DFSanFunction::setShadow(Instruction *I, Value *Shadow) { + assert(!ValShadowMap.count(I)); + assert(Shadow->getType() == DFS.ShadowTy); + ValShadowMap[I] = Shadow; +} + +Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { + assert(Addr != RetvalTLS && "Reinstrumenting?"); + IRBuilder<> IRB(Pos); + Value *ShadowPtrMaskValue; + if (DFSanRuntimeShadowMask) + ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask); + else + ShadowPtrMaskValue = ShadowPtrMask; + return IRB.CreateIntToPtr( + IRB.CreateMul( + IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), + IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)), + ShadowPtrMul), + ShadowPtrTy); +} + +// Generates IR to compute the union of the two given shadows, inserting it +// before Pos. Returns the computed union Value. 
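+// For example (illustrative): a zero shadow combined with any shadow V yields
+// V with no IR emitted, and a repeated (V1, V2) query reuses the previously
+// computed union when the block that computed it dominates Pos.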
+Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { + if (V1 == DFS.ZeroShadow) + return V2; + if (V2 == DFS.ZeroShadow) + return V1; + if (V1 == V2) + return V1; + + auto V1Elems = ShadowElements.find(V1); + auto V2Elems = ShadowElements.find(V2); + if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) { + if (std::includes(V1Elems->second.begin(), V1Elems->second.end(), + V2Elems->second.begin(), V2Elems->second.end())) { + return V1; + } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(), + V1Elems->second.begin(), V1Elems->second.end())) { + return V2; + } + } else if (V1Elems != ShadowElements.end()) { + if (V1Elems->second.count(V2)) + return V1; + } else if (V2Elems != ShadowElements.end()) { + if (V2Elems->second.count(V1)) + return V2; + } + + auto Key = std::make_pair(V1, V2); + if (V1 > V2) + std::swap(Key.first, Key.second); + CachedCombinedShadow &CCS = CachedCombinedShadows[Key]; + if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent())) + return CCS.Shadow; + + IRBuilder<> IRB(Pos); + if (AvoidNewBlocks) { + CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2}); + Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt); + Call->addParamAttr(0, Attribute::ZExt); + Call->addParamAttr(1, Attribute::ZExt); + + CCS.Block = Pos->getParent(); + CCS.Shadow = Call; + } else { + BasicBlock *Head = Pos->getParent(); + Value *Ne = IRB.CreateICmpNE(V1, V2); + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT)); + IRBuilder<> ThenIRB(BI); + CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {V1, V2}); + Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt); + Call->addParamAttr(0, Attribute::ZExt); + Call->addParamAttr(1, Attribute::ZExt); + + BasicBlock *Tail = BI->getSuccessor(0); + PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front()); + Phi->addIncoming(Call, Call->getParent()); + Phi->addIncoming(V1, Head); + + CCS.Block = Tail; + CCS.Shadow = Phi; + } + + std::set<Value *> UnionElems; + if (V1Elems != ShadowElements.end()) { + UnionElems = V1Elems->second; + } else { + UnionElems.insert(V1); + } + if (V2Elems != ShadowElements.end()) { + UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end()); + } else { + UnionElems.insert(V2); + } + ShadowElements[CCS.Shadow] = std::move(UnionElems); + + return CCS.Shadow; +} + +// A convenience function which folds the shadows of each of the operands +// of the provided instruction Inst, inserting the IR before Inst. Returns +// the computed union Value. +Value *DFSanFunction::combineOperandShadows(Instruction *Inst) { + if (Inst->getNumOperands() == 0) + return DFS.ZeroShadow; + + Value *Shadow = getShadow(Inst->getOperand(0)); + for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) { + Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst); + } + return Shadow; +} + +void DFSanVisitor::visitOperandShadowInst(Instruction &I) { + Value *CombinedShadow = DFSF.combineOperandShadows(&I); + DFSF.setShadow(&I, CombinedShadow); +} + +// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where +// Addr has alignment Align, and take the union of each of those shadows. 
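+// (Illustrative note.) Each application byte has ShadowWidth == 16 bits of
+// shadow, so e.g. an aligned 4-byte load covers exactly one 64-bit shadow
+// word (four 16-bit labels), which the fast path below checks for equality
+// with a single rotate-and-compare.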
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, + Instruction *Pos) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) { + const auto i = AllocaShadowMap.find(AI); + if (i != AllocaShadowMap.end()) { + IRBuilder<> IRB(Pos); + return IRB.CreateLoad(DFS.ShadowTy, i->second); + } + } + + uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; + SmallVector<const Value *, 2> Objs; + GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout()); + bool AllConstants = true; + for (const Value *Obj : Objs) { + if (isa<Function>(Obj) || isa<BlockAddress>(Obj)) + continue; + if (isa<GlobalVariable>(Obj) && cast<GlobalVariable>(Obj)->isConstant()) + continue; + + AllConstants = false; + break; + } + if (AllConstants) + return DFS.ZeroShadow; + + Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos); + switch (Size) { + case 0: + return DFS.ZeroShadow; + case 1: { + LoadInst *LI = new LoadInst(DFS.ShadowTy, ShadowAddr, "", Pos); + LI->setAlignment(MaybeAlign(ShadowAlign)); + return LI; + } + case 2: { + IRBuilder<> IRB(Pos); + Value *ShadowAddr1 = IRB.CreateGEP(DFS.ShadowTy, ShadowAddr, + ConstantInt::get(DFS.IntptrTy, 1)); + return combineShadows( + IRB.CreateAlignedLoad(DFS.ShadowTy, ShadowAddr, ShadowAlign), + IRB.CreateAlignedLoad(DFS.ShadowTy, ShadowAddr1, ShadowAlign), Pos); + } + } + if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidth) == 0) { + // Fast path for the common case where each byte has identical shadow: load + // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any + // shadow is non-equal. + BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F); + IRBuilder<> FallbackIRB(FallbackBB); + CallInst *FallbackCall = FallbackIRB.CreateCall( + DFS.DFSanUnionLoadFn, + {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); + FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt); + + // Compare each of the shadows stored in the loaded 64 bits to each other, + // by computing (WideShadow rotl ShadowWidth) == WideShadow. + IRBuilder<> IRB(Pos); + Value *WideAddr = + IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx)); + Value *WideShadow = + IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign); + Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy); + Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth); + Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth); + Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow); + Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow); + + BasicBlock *Head = Pos->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator()); + + if (DomTreeNode *OldNode = DT.getNode(Head)) { + std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end()); + + DomTreeNode *NewNode = DT.addNewBlock(Tail, Head); + for (auto Child : Children) + DT.changeImmediateDominator(Child, NewNode); + } + + // In the following code LastBr will refer to the previous basic block's + // conditional branch instruction, whose true successor is fixed up to point + // to the next block during the loop below or to the tail after the final + // iteration. 
+ BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq); + ReplaceInstWithInst(Head->getTerminator(), LastBr); + DT.addNewBlock(FallbackBB, Head); + + for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size; + Ofs += 64 / DFS.ShadowWidth) { + BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F); + DT.addNewBlock(NextBB, LastBr->getParent()); + IRBuilder<> NextIRB(NextBB); + WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr, + ConstantInt::get(DFS.IntptrTy, 1)); + Value *NextWideShadow = NextIRB.CreateAlignedLoad(NextIRB.getInt64Ty(), + WideAddr, ShadowAlign); + ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow); + LastBr->setSuccessor(0, NextBB); + LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB); + } + + LastBr->setSuccessor(0, Tail); + FallbackIRB.CreateBr(Tail); + PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front()); + Shadow->addIncoming(FallbackCall, FallbackBB); + Shadow->addIncoming(TruncShadow, LastBr->getParent()); + return Shadow; + } + + IRBuilder<> IRB(Pos); + CallInst *FallbackCall = IRB.CreateCall( + DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); + FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt); + return FallbackCall; +} + +void DFSanVisitor::visitLoadInst(LoadInst &LI) { + auto &DL = LI.getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(LI.getType()); + if (Size == 0) { + DFSF.setShadow(&LI, DFSF.DFS.ZeroShadow); + return; + } + + uint64_t Align; + if (ClPreserveAlignment) { + Align = LI.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(LI.getType()); + } else { + Align = 1; + } + IRBuilder<> IRB(&LI); + Value *Shadow = DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI); + if (ClCombinePointerLabelsOnLoad) { + Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand()); + Shadow = DFSF.combineShadows(Shadow, PtrShadow, &LI); + } + if (Shadow != DFSF.DFS.ZeroShadow) + DFSF.NonZeroChecks.push_back(Shadow); + + DFSF.setShadow(&LI, Shadow); +} + +void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align, + Value *Shadow, Instruction *Pos) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) { + const auto i = AllocaShadowMap.find(AI); + if (i != AllocaShadowMap.end()) { + IRBuilder<> IRB(Pos); + IRB.CreateStore(Shadow, i->second); + return; + } + } + + uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; + IRBuilder<> IRB(Pos); + Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos); + if (Shadow == DFS.ZeroShadow) { + IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth); + Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0); + Value *ExtShadowAddr = + IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy)); + IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign); + return; + } + + const unsigned ShadowVecSize = 128 / DFS.ShadowWidth; + uint64_t Offset = 0; + if (Size >= ShadowVecSize) { + VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize); + Value *ShadowVec = UndefValue::get(ShadowVecTy); + for (unsigned i = 0; i != ShadowVecSize; ++i) { + ShadowVec = IRB.CreateInsertElement( + ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i)); + } + Value *ShadowVecAddr = + IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy)); + do { + Value *CurShadowVecAddr = + IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset); + IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign); + Size -= ShadowVecSize; 
+ ++Offset; + } while (Size >= ShadowVecSize); + Offset *= ShadowVecSize; + } + while (Size > 0) { + Value *CurShadowAddr = + IRB.CreateConstGEP1_32(DFS.ShadowTy, ShadowAddr, Offset); + IRB.CreateAlignedStore(Shadow, CurShadowAddr, ShadowAlign); + --Size; + ++Offset; + } +} + +void DFSanVisitor::visitStoreInst(StoreInst &SI) { + auto &DL = SI.getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType()); + if (Size == 0) + return; + + uint64_t Align; + if (ClPreserveAlignment) { + Align = SI.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(SI.getValueOperand()->getType()); + } else { + Align = 1; + } + + Value* Shadow = DFSF.getShadow(SI.getValueOperand()); + if (ClCombinePointerLabelsOnStore) { + Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand()); + Shadow = DFSF.combineShadows(Shadow, PtrShadow, &SI); + } + DFSF.storeShadow(SI.getPointerOperand(), Size, Align, Shadow, &SI); +} + +void DFSanVisitor::visitUnaryOperator(UnaryOperator &UO) { + visitOperandShadowInst(UO); +} + +void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) { + visitOperandShadowInst(BO); +} + +void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); } + +void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); } + +void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { + visitOperandShadowInst(GEPI); +} + +void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitAllocaInst(AllocaInst &I) { + bool AllLoadsStores = true; + for (User *U : I.users()) { + if (isa<LoadInst>(U)) + continue; + + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (SI->getPointerOperand() == &I) + continue; + } + + AllLoadsStores = false; + break; + } + if (AllLoadsStores) { + IRBuilder<> IRB(&I); + DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy); + } + DFSF.setShadow(&I, DFSF.DFS.ZeroShadow); +} + +void DFSanVisitor::visitSelectInst(SelectInst &I) { + Value *CondShadow = DFSF.getShadow(I.getCondition()); + Value *TrueShadow = DFSF.getShadow(I.getTrueValue()); + Value *FalseShadow = DFSF.getShadow(I.getFalseValue()); + + if (isa<VectorType>(I.getCondition()->getType())) { + DFSF.setShadow( + &I, + DFSF.combineShadows( + CondShadow, DFSF.combineShadows(TrueShadow, FalseShadow, &I), &I)); + } else { + Value *ShadowSel; + if (TrueShadow == FalseShadow) { + ShadowSel = TrueShadow; + } else { + ShadowSel = + SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I); + } + DFSF.setShadow(&I, DFSF.combineShadows(CondShadow, ShadowSel, &I)); + } +} + +void DFSanVisitor::visitMemSetInst(MemSetInst &I) { + IRBuilder<> IRB(&I); + Value *ValShadow = DFSF.getShadow(I.getValue()); + IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn, + {ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy( + *DFSF.DFS.Ctx)), + IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)}); +} + +void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { + IRBuilder<> IRB(&I); + Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I); + Value *SrcShadow = 
DFSF.DFS.getShadowAddress(I.getSource(), &I); + Value *LenShadow = IRB.CreateMul( + I.getLength(), + ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8)); + Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx); + DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr); + SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr); + auto *MTI = cast<MemTransferInst>( + IRB.CreateCall(I.getFunctionType(), I.getCalledValue(), + {DestShadow, SrcShadow, LenShadow, I.getVolatileCst()})); + if (ClPreserveAlignment) { + MTI->setDestAlignment(I.getDestAlignment() * (DFSF.DFS.ShadowWidth / 8)); + MTI->setSourceAlignment(I.getSourceAlignment() * (DFSF.DFS.ShadowWidth / 8)); + } else { + MTI->setDestAlignment(DFSF.DFS.ShadowWidth / 8); + MTI->setSourceAlignment(DFSF.DFS.ShadowWidth / 8); + } +} + +void DFSanVisitor::visitReturnInst(ReturnInst &RI) { + if (!DFSF.IsNativeABI && RI.getReturnValue()) { + switch (DFSF.IA) { + case DataFlowSanitizer::IA_TLS: { + Value *S = DFSF.getShadow(RI.getReturnValue()); + IRBuilder<> IRB(&RI); + IRB.CreateStore(S, DFSF.getRetvalTLS()); + break; + } + case DataFlowSanitizer::IA_Args: { + IRBuilder<> IRB(&RI); + Type *RT = DFSF.F->getFunctionType()->getReturnType(); + Value *InsVal = + IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0); + Value *InsShadow = + IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1); + RI.setOperand(0, InsShadow); + break; + } + } + } +} + +void DFSanVisitor::visitCallSite(CallSite CS) { + Function *F = CS.getCalledFunction(); + if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) { + visitOperandShadowInst(*CS.getInstruction()); + return; + } + + // Calls to this function are synthesized in wrappers, and we shouldn't + // instrument them. + if (F == DFSF.DFS.DFSanVarargWrapperFn.getCallee()->stripPointerCasts()) + return; + + IRBuilder<> IRB(CS.getInstruction()); + + DenseMap<Value *, Function *>::iterator i = + DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue()); + if (i != DFSF.DFS.UnwrappedFnMap.end()) { + Function *F = i->second; + switch (DFSF.DFS.getWrapperKind(F)) { + case DataFlowSanitizer::WK_Warning: + CS.setCalledFunction(F); + IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn, + IRB.CreateGlobalStringPtr(F->getName())); + DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow); + return; + case DataFlowSanitizer::WK_Discard: + CS.setCalledFunction(F); + DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow); + return; + case DataFlowSanitizer::WK_Functional: + CS.setCalledFunction(F); + visitOperandShadowInst(*CS.getInstruction()); + return; + case DataFlowSanitizer::WK_Custom: + // Don't try to handle invokes of custom functions, it's too complicated. + // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_ + // wrapper. + if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { + FunctionType *FT = F->getFunctionType(); + TransformedFunction CustomFn = DFSF.DFS.getCustomFunctionType(FT); + std::string CustomFName = "__dfsw_"; + CustomFName += F->getName(); + FunctionCallee CustomF = DFSF.DFS.Mod->getOrInsertFunction( + CustomFName, CustomFn.TransformedType); + if (Function *CustomFn = dyn_cast<Function>(CustomF.getCallee())) { + CustomFn->copyAttributesFrom(F); + + // Custom functions returning non-void will write to the return label. 
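+          // For example (illustrative): a custom function declared in C as
+          //   int f(int x);
+          // is reached here through a wrapper with the C-expressible type
+          //   int __dfsw_f(int x, dfsan_label x_label,
+          //                dfsan_label *ret_label);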
+ if (!FT->getReturnType()->isVoidTy()) { + CustomFn->removeAttributes(AttributeList::FunctionIndex, + DFSF.DFS.ReadOnlyNoneAttrs); + } + } + + std::vector<Value *> Args; + + CallSite::arg_iterator i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) { + Type *T = (*i)->getType(); + FunctionType *ParamFT; + if (isa<PointerType>(T) && + (ParamFT = dyn_cast<FunctionType>( + cast<PointerType>(T)->getElementType()))) { + std::string TName = "dfst"; + TName += utostr(FT->getNumParams() - n); + TName += "$"; + TName += F->getName(); + Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName); + Args.push_back(T); + Args.push_back( + IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx))); + } else { + Args.push_back(*i); + } + } + + i = CS.arg_begin(); + const unsigned ShadowArgStart = Args.size(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(DFSF.getShadow(*i)); + + if (FT->isVarArg()) { + auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy, + CS.arg_size() - FT->getNumParams()); + auto *LabelVAAlloca = new AllocaInst( + LabelVATy, getDataLayout().getAllocaAddrSpace(), + "labelva", &DFSF.F->getEntryBlock().front()); + + for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) { + auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n); + IRB.CreateStore(DFSF.getShadow(*i), LabelVAPtr); + } + + Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0)); + } + + if (!FT->getReturnType()->isVoidTy()) { + if (!DFSF.LabelReturnAlloca) { + DFSF.LabelReturnAlloca = + new AllocaInst(DFSF.DFS.ShadowTy, + getDataLayout().getAllocaAddrSpace(), + "labelreturn", &DFSF.F->getEntryBlock().front()); + } + Args.push_back(DFSF.LabelReturnAlloca); + } + + for (i = CS.arg_begin() + FT->getNumParams(); i != CS.arg_end(); ++i) + Args.push_back(*i); + + CallInst *CustomCI = IRB.CreateCall(CustomF, Args); + CustomCI->setCallingConv(CI->getCallingConv()); + CustomCI->setAttributes(TransformFunctionAttributes(CustomFn, + CI->getContext(), CI->getAttributes())); + + // Update the parameter attributes of the custom call instruction to + // zero extend the shadow parameters. This is required for targets + // which consider ShadowTy an illegal type. 
+ for (unsigned n = 0; n < FT->getNumParams(); n++) { + const unsigned ArgNo = ShadowArgStart + n; + if (CustomCI->getArgOperand(ArgNo)->getType() == DFSF.DFS.ShadowTy) + CustomCI->addParamAttr(ArgNo, Attribute::ZExt); + } + + if (!FT->getReturnType()->isVoidTy()) { + LoadInst *LabelLoad = + IRB.CreateLoad(DFSF.DFS.ShadowTy, DFSF.LabelReturnAlloca); + DFSF.setShadow(CustomCI, LabelLoad); + } + + CI->replaceAllUsesWith(CustomCI); + CI->eraseFromParent(); + return; + } + break; + } + } + + FunctionType *FT = cast<FunctionType>( + CS.getCalledValue()->getType()->getPointerElementType()); + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) { + for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) { + IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)), + DFSF.getArgTLS(i, CS.getInstruction())); + } + } + + Instruction *Next = nullptr; + if (!CS.getType()->isVoidTy()) { + if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { + if (II->getNormalDest()->getSinglePredecessor()) { + Next = &II->getNormalDest()->front(); + } else { + BasicBlock *NewBB = + SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT); + Next = &NewBB->front(); + } + } else { + assert(CS->getIterator() != CS->getParent()->end()); + Next = CS->getNextNode(); + } + + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) { + IRBuilder<> NextIRB(Next); + LoadInst *LI = NextIRB.CreateLoad(DFSF.DFS.ShadowTy, DFSF.getRetvalTLS()); + DFSF.SkipInsts.insert(LI); + DFSF.setShadow(CS.getInstruction(), LI); + DFSF.NonZeroChecks.push_back(LI); + } + } + + // Do all instrumentation for IA_Args down here to defer tampering with the + // CFG in a way that SplitEdge may be able to detect. + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) { + FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT); + Value *Func = + IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT)); + std::vector<Value *> Args; + + CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(*i); + + i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(DFSF.getShadow(*i)); + + if (FT->isVarArg()) { + unsigned VarArgSize = CS.arg_size() - FT->getNumParams(); + ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize); + AllocaInst *VarArgShadow = + new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(), + "", &DFSF.F->getEntryBlock().front()); + Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0)); + for (unsigned n = 0; i != e; ++i, ++n) { + IRB.CreateStore( + DFSF.getShadow(*i), + IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n)); + Args.push_back(*i); + } + } + + CallSite NewCS; + if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { + NewCS = IRB.CreateInvoke(NewFT, Func, II->getNormalDest(), + II->getUnwindDest(), Args); + } else { + NewCS = IRB.CreateCall(NewFT, Func, Args); + } + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes(CS.getAttributes().removeAttributes( + *DFSF.DFS.Ctx, AttributeList::ReturnIndex, + AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType()))); + + if (Next) { + ExtractValueInst *ExVal = + ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next); + DFSF.SkipInsts.insert(ExVal); + ExtractValueInst *ExShadow = + ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next); + DFSF.SkipInsts.insert(ExShadow); + DFSF.setShadow(ExVal, ExShadow); + 
DFSF.NonZeroChecks.push_back(ExShadow);
+
+      CS.getInstruction()->replaceAllUsesWith(ExVal);
+    }
+
+    CS.getInstruction()->eraseFromParent();
+  }
+}
+
+void DFSanVisitor::visitPHINode(PHINode &PN) {
+  PHINode *ShadowPN =
+      PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN);
+
+  // Give the shadow phi node valid predecessors to fool SplitEdge into working.
+  Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy);
+  for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
+       ++i) {
+    ShadowPN->addIncoming(UndefShadow, *i);
+  }
+
+  DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
+  DFSF.setShadow(&PN, ShadowPN);
+}
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
new file mode 100644
index 000000000000..ac6082441eae
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -0,0 +1,1229 @@
+//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements GCOV-style profiling. When this pass is run it emits
+// "gcno" files next to the existing source, and instruments the code that runs
+// to record the edges between blocks that run and emit a complementary "gcda"
+// file on exit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "insert-gcov-profiling"
+
+static cl::opt<std::string>
+DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden,
+                   cl::ValueRequired);
+static cl::opt<bool> DefaultExitBlockBeforeBody("gcov-exit-block-before-body",
+                                                cl::init(false), cl::Hidden);
+
+GCOVOptions GCOVOptions::getDefault() {
+  GCOVOptions Options;
+  Options.EmitNotes = true;
+  Options.EmitData = true;
+  Options.UseCfgChecksum = false;
+  Options.NoRedZone = false;
+  Options.FunctionNamesInData = true;
+  Options.ExitBlockBeforeBody = DefaultExitBlockBeforeBody;
+
+  if (DefaultGCOVVersion.size() != 4) {
+    llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
+                             DefaultGCOVVersion);
+  }
+  memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
+  return Options;
+}
+
+namespace {
+class
GCOVFunction; + +class GCOVProfiler { +public: + GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {} + GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) { + assert((Options.EmitNotes || Options.EmitData) && + "GCOVProfiler asked to do nothing?"); + ReversedVersion[0] = Options.Version[3]; + ReversedVersion[1] = Options.Version[2]; + ReversedVersion[2] = Options.Version[1]; + ReversedVersion[3] = Options.Version[0]; + ReversedVersion[4] = '\0'; + } + bool + runOnModule(Module &M, + std::function<const TargetLibraryInfo &(Function &F)> GetTLI); + +private: + // Create the .gcno files for the Module based on DebugInfo. + void emitProfileNotes(); + + // Modify the program to track transitions along edges and call into the + // profiling runtime to emit .gcda files when run. + bool emitProfileArcs(); + + bool isFunctionInstrumented(const Function &F); + std::vector<Regex> createRegexesFromString(StringRef RegexesStr); + static bool doesFilenameMatchARegex(StringRef Filename, + std::vector<Regex> &Regexes); + + // Get pointers to the functions in the runtime library. + FunctionCallee getStartFileFunc(const TargetLibraryInfo *TLI); + FunctionCallee getEmitFunctionFunc(const TargetLibraryInfo *TLI); + FunctionCallee getEmitArcsFunc(const TargetLibraryInfo *TLI); + FunctionCallee getSummaryInfoFunc(); + FunctionCallee getEndFileFunc(); + + // Add the function to write out all our counters to the global destructor + // list. + Function * + insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>); + Function *insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>>); + + void AddFlushBeforeForkAndExec(); + + enum class GCovFileType { GCNO, GCDA }; + std::string mangleName(const DICompileUnit *CU, GCovFileType FileType); + + GCOVOptions Options; + + // Reversed, NUL-terminated copy of Options.Version. 
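+  // e.g. (illustrative) Options.Version "402*" is stored here as "*204",
+  // which appears to be the byte order gcov expects in the file headers.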
+ char ReversedVersion[5]; + // Checksum, produced by hash of EdgeDestinations + SmallVector<uint32_t, 4> FileChecksums; + + Module *M; + std::function<const TargetLibraryInfo &(Function &F)> GetTLI; + LLVMContext *Ctx; + SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; + std::vector<Regex> FilterRe; + std::vector<Regex> ExcludeRe; + StringMap<bool> InstrumentedFiles; +}; + +class GCOVProfilerLegacyPass : public ModulePass { +public: + static char ID; + GCOVProfilerLegacyPass() + : GCOVProfilerLegacyPass(GCOVOptions::getDefault()) {} + GCOVProfilerLegacyPass(const GCOVOptions &Opts) + : ModulePass(ID), Profiler(Opts) { + initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { return "GCOV Profiler"; } + + bool runOnModule(Module &M) override { + return Profiler.runOnModule(M, [this](Function &F) -> TargetLibraryInfo & { + return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } + +private: + GCOVProfiler Profiler; +}; +} + +char GCOVProfilerLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN( + GCOVProfilerLegacyPass, "insert-gcov-profiling", + "Insert instrumentation for GCOV profiling", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END( + GCOVProfilerLegacyPass, "insert-gcov-profiling", + "Insert instrumentation for GCOV profiling", false, false) + +ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) { + return new GCOVProfilerLegacyPass(Options); +} + +static StringRef getFunctionName(const DISubprogram *SP) { + if (!SP->getLinkageName().empty()) + return SP->getLinkageName(); + return SP->getName(); +} + +/// Extract a filename for a DISubprogram. +/// +/// Prefer relative paths in the coverage notes. Clang also may split +/// up absolute paths into a directory and filename component. When +/// the relative path doesn't exist, reconstruct the absolute path. +static SmallString<128> getFilename(const DISubprogram *SP) { + SmallString<128> Path; + StringRef RelPath = SP->getFilename(); + if (sys::fs::exists(RelPath)) + Path = RelPath; + else + sys::path::append(Path, SP->getDirectory(), SP->getFilename()); + return Path; +} + +namespace { + class GCOVRecord { + protected: + static const char *const LinesTag; + static const char *const FunctionTag; + static const char *const BlockTag; + static const char *const EdgeTag; + + GCOVRecord() = default; + + void writeBytes(const char *Bytes, int Size) { + os->write(Bytes, Size); + } + + void write(uint32_t i) { + writeBytes(reinterpret_cast<char*>(&i), 4); + } + + // Returns the length measured in 4-byte blocks that will be used to + // represent this string in a GCOV file + static unsigned lengthOfGCOVString(StringRef s) { + // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs + // padding out to the next 4-byte word. The length is measured in 4-byte + // words including padding, not bytes of actual string. + return (s.size() / 4) + 1; + } + + void writeGCOVString(StringRef s) { + uint32_t Len = lengthOfGCOVString(s); + write(Len); + writeBytes(s.data(), s.size()); + + // Write 1 to 4 bytes of NUL padding. 
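+      // e.g. (illustrative): "abc" is written as length word 1, "abc", and
+      // one NUL; "abcd" as length word 2, "abcd", and four NULs, since the
+      // length counts whole 4-byte words including the padding.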
+ assert((unsigned)(4 - (s.size() % 4)) > 0); + assert((unsigned)(4 - (s.size() % 4)) <= 4); + writeBytes("\0\0\0\0", 4 - (s.size() % 4)); + } + + raw_ostream *os; + }; + const char *const GCOVRecord::LinesTag = "\0\0\x45\x01"; + const char *const GCOVRecord::FunctionTag = "\0\0\0\1"; + const char *const GCOVRecord::BlockTag = "\0\0\x41\x01"; + const char *const GCOVRecord::EdgeTag = "\0\0\x43\x01"; + + class GCOVFunction; + class GCOVBlock; + + // Constructed only by requesting it from a GCOVBlock, this object stores a + // list of line numbers and a single filename, representing lines that belong + // to the block. + class GCOVLines : public GCOVRecord { + public: + void addLine(uint32_t Line) { + assert(Line != 0 && "Line zero is not a valid real line number."); + Lines.push_back(Line); + } + + uint32_t length() const { + // Here 2 = 1 for string length + 1 for '0' id#. + return lengthOfGCOVString(Filename) + 2 + Lines.size(); + } + + void writeOut() { + write(0); + writeGCOVString(Filename); + for (int i = 0, e = Lines.size(); i != e; ++i) + write(Lines[i]); + } + + GCOVLines(StringRef F, raw_ostream *os) + : Filename(F) { + this->os = os; + } + + private: + std::string Filename; + SmallVector<uint32_t, 32> Lines; + }; + + + // Represent a basic block in GCOV. Each block has a unique number in the + // function, number of lines belonging to each block, and a set of edges to + // other blocks. + class GCOVBlock : public GCOVRecord { + public: + GCOVLines &getFile(StringRef Filename) { + return LinesByFile.try_emplace(Filename, Filename, os).first->second; + } + + void addEdge(GCOVBlock &Successor) { + OutEdges.push_back(&Successor); + } + + void writeOut() { + uint32_t Len = 3; + SmallVector<StringMapEntry<GCOVLines> *, 32> SortedLinesByFile; + for (auto &I : LinesByFile) { + Len += I.second.length(); + SortedLinesByFile.push_back(&I); + } + + writeBytes(LinesTag, 4); + write(Len); + write(Number); + + llvm::sort(SortedLinesByFile, [](StringMapEntry<GCOVLines> *LHS, + StringMapEntry<GCOVLines> *RHS) { + return LHS->getKey() < RHS->getKey(); + }); + for (auto &I : SortedLinesByFile) + I->getValue().writeOut(); + write(0); + write(0); + } + + GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) { + // Only allow copy before edges and lines have been added. After that, + // there are inter-block pointers (eg: edges) that won't take kindly to + // blocks being copied or moved around. + assert(LinesByFile.empty()); + assert(OutEdges.empty()); + } + + private: + friend class GCOVFunction; + + GCOVBlock(uint32_t Number, raw_ostream *os) + : Number(Number) { + this->os = os; + } + + uint32_t Number; + StringMap<GCOVLines> LinesByFile; + SmallVector<GCOVBlock *, 4> OutEdges; + }; + + // A function has a unique identifier, a checksum (we leave as zero) and a + // set of blocks and a map of edges between blocks. This is the only GCOV + // object users can construct, the blocks and lines will be rooted here. + class GCOVFunction : public GCOVRecord { + public: + GCOVFunction(const DISubprogram *SP, Function *F, raw_ostream *os, + uint32_t Ident, bool UseCfgChecksum, bool ExitBlockBeforeBody) + : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0), + ReturnBlock(1, os) { + this->os = os; + + LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); + + uint32_t i = 0; + for (auto &BB : *F) { + // Skip index 1 if it's assigned to the ReturnBlock. 
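+        // (ReturnBlock is constructed with number 1 in the initializer list
+        // above, so with ExitBlockBeforeBody the body blocks are numbered
+        // 0, 2, 3, ...)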
+        if (i == 1 && ExitBlockBeforeBody)
+          ++i;
+        Blocks.insert(std::make_pair(&BB, GCOVBlock(i++, os)));
+      }
+      if (!ExitBlockBeforeBody)
+        ReturnBlock.Number = i;
+
+      std::string FunctionNameAndLine;
+      raw_string_ostream FNLOS(FunctionNameAndLine);
+      FNLOS << getFunctionName(SP) << SP->getLine();
+      FNLOS.flush();
+      FuncChecksum = hash_value(FunctionNameAndLine);
+    }
+
+    GCOVBlock &getBlock(BasicBlock *BB) {
+      return Blocks.find(BB)->second;
+    }
+
+    GCOVBlock &getReturnBlock() {
+      return ReturnBlock;
+    }
+
+    std::string getEdgeDestinations() {
+      std::string EdgeDestinations;
+      raw_string_ostream EDOS(EdgeDestinations);
+      Function *F = Blocks.begin()->first->getParent();
+      for (BasicBlock &I : *F) {
+        GCOVBlock &Block = getBlock(&I);
+        for (int i = 0, e = Block.OutEdges.size(); i != e; ++i)
+          EDOS << Block.OutEdges[i]->Number;
+      }
+      return EdgeDestinations;
+    }
+
+    uint32_t getFuncChecksum() {
+      return FuncChecksum;
+    }
+
+    void setCfgChecksum(uint32_t Checksum) {
+      CfgChecksum = Checksum;
+    }
+
+    void writeOut() {
+      writeBytes(FunctionTag, 4);
+      SmallString<128> Filename = getFilename(SP);
+      uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) +
+                          1 + lengthOfGCOVString(Filename) + 1;
+      if (UseCfgChecksum)
+        ++BlockLen;
+      write(BlockLen);
+      write(Ident);
+      write(FuncChecksum);
+      if (UseCfgChecksum)
+        write(CfgChecksum);
+      writeGCOVString(getFunctionName(SP));
+      writeGCOVString(Filename);
+      write(SP->getLine());
+
+      // Emit count of blocks.
+      writeBytes(BlockTag, 4);
+      write(Blocks.size() + 1);
+      for (int i = 0, e = Blocks.size() + 1; i != e; ++i) {
+        write(0);  // No flags on our blocks.
+      }
+      LLVM_DEBUG(dbgs() << Blocks.size() << " blocks.\n");
+
+      // Emit edges between blocks.
+      if (Blocks.empty()) return;
+      Function *F = Blocks.begin()->first->getParent();
+      for (BasicBlock &I : *F) {
+        GCOVBlock &Block = getBlock(&I);
+        if (Block.OutEdges.empty()) continue;
+
+        writeBytes(EdgeTag, 4);
+        write(Block.OutEdges.size() * 2 + 1);
+        write(Block.Number);
+        for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) {
+          LLVM_DEBUG(dbgs() << Block.Number << " -> "
+                            << Block.OutEdges[i]->Number << "\n");
+          write(Block.OutEdges[i]->Number);
+          write(0);  // no flags
+        }
+      }
+
+      // Emit lines for each block.
+      for (BasicBlock &I : *F)
+        getBlock(&I).writeOut();
+    }
+
+  private:
+    const DISubprogram *SP;
+    uint32_t Ident;
+    uint32_t FuncChecksum;
+    bool UseCfgChecksum;
+    uint32_t CfgChecksum;
+    DenseMap<BasicBlock *, GCOVBlock> Blocks;
+    GCOVBlock ReturnBlock;
+  };
+}
+
+// RegexesStr is a string containing different regexes separated by semicolons.
+// For example "foo\..*$;bar\..*$".
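+// Empty components (e.g. from a leading or trailing ';') are skipped; a
+// component that fails to compile is reported through the LLVMContext.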
+std::vector<Regex> GCOVProfiler::createRegexesFromString(StringRef RegexesStr) { + std::vector<Regex> Regexes; + while (!RegexesStr.empty()) { + std::pair<StringRef, StringRef> HeadTail = RegexesStr.split(';'); + if (!HeadTail.first.empty()) { + Regex Re(HeadTail.first); + std::string Err; + if (!Re.isValid(Err)) { + Ctx->emitError(Twine("Regex ") + HeadTail.first + + " is not valid: " + Err); + } + Regexes.emplace_back(std::move(Re)); + } + RegexesStr = HeadTail.second; + } + return Regexes; +} + +bool GCOVProfiler::doesFilenameMatchARegex(StringRef Filename, + std::vector<Regex> &Regexes) { + for (Regex &Re : Regexes) { + if (Re.match(Filename)) { + return true; + } + } + return false; +} + +bool GCOVProfiler::isFunctionInstrumented(const Function &F) { + if (FilterRe.empty() && ExcludeRe.empty()) { + return true; + } + SmallString<128> Filename = getFilename(F.getSubprogram()); + auto It = InstrumentedFiles.find(Filename); + if (It != InstrumentedFiles.end()) { + return It->second; + } + + SmallString<256> RealPath; + StringRef RealFilename; + + // Path can be + // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for + // such a case we must get the real_path. + if (sys::fs::real_path(Filename, RealPath)) { + // real_path can fail with path like "foo.c". + RealFilename = Filename; + } else { + RealFilename = RealPath; + } + + bool ShouldInstrument; + if (FilterRe.empty()) { + ShouldInstrument = !doesFilenameMatchARegex(RealFilename, ExcludeRe); + } else if (ExcludeRe.empty()) { + ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe); + } else { + ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe) && + !doesFilenameMatchARegex(RealFilename, ExcludeRe); + } + InstrumentedFiles[Filename] = ShouldInstrument; + return ShouldInstrument; +} + +std::string GCOVProfiler::mangleName(const DICompileUnit *CU, + GCovFileType OutputType) { + bool Notes = OutputType == GCovFileType::GCNO; + + if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) { + for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) { + MDNode *N = GCov->getOperand(i); + bool ThreeElement = N->getNumOperands() == 3; + if (!ThreeElement && N->getNumOperands() != 2) + continue; + if (dyn_cast<MDNode>(N->getOperand(ThreeElement ? 2 : 1)) != CU) + continue; + + if (ThreeElement) { + // These nodes have no mangling to apply, it's stored mangled in the + // bitcode. + MDString *NotesFile = dyn_cast<MDString>(N->getOperand(0)); + MDString *DataFile = dyn_cast<MDString>(N->getOperand(1)); + if (!NotesFile || !DataFile) + continue; + return Notes ? NotesFile->getString() : DataFile->getString(); + } + + MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0)); + if (!GCovFile) + continue; + + SmallString<128> Filename = GCovFile->getString(); + sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda"); + return Filename.str(); + } + } + + SmallString<128> Filename = CU->getFilename(); + sys::path::replace_extension(Filename, Notes ? 
"gcno" : "gcda"); + StringRef FName = sys::path::filename(Filename); + SmallString<128> CurPath; + if (sys::fs::current_path(CurPath)) return FName; + sys::path::append(CurPath, FName); + return CurPath.str(); +} + +bool GCOVProfiler::runOnModule( + Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) { + this->M = &M; + this->GetTLI = std::move(GetTLI); + Ctx = &M.getContext(); + + AddFlushBeforeForkAndExec(); + + FilterRe = createRegexesFromString(Options.Filter); + ExcludeRe = createRegexesFromString(Options.Exclude); + + if (Options.EmitNotes) emitProfileNotes(); + if (Options.EmitData) return emitProfileArcs(); + return false; +} + +PreservedAnalyses GCOVProfilerPass::run(Module &M, + ModuleAnalysisManager &AM) { + + GCOVProfiler Profiler(GCOVOpts); + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + + if (!Profiler.runOnModule(M, [&](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + })) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +static bool functionHasLines(Function &F) { + // Check whether this function actually has any source lines. Not only + // do these waste space, they also can crash gcov. + for (auto &BB : F) { + for (auto &I : BB) { + // Debug intrinsic locations correspond to the location of the + // declaration, not necessarily any statements or expressions. + if (isa<DbgInfoIntrinsic>(&I)) continue; + + const DebugLoc &Loc = I.getDebugLoc(); + if (!Loc) + continue; + + // Artificial lines such as calls to the global constructors. + if (Loc.getLine() == 0) continue; + + return true; + } + } + return false; +} + +static bool isUsingScopeBasedEH(Function &F) { + if (!F.hasPersonalityFn()) return false; + + EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); + return isScopedEHPersonality(Personality); +} + +static bool shouldKeepInEntry(BasicBlock::iterator It) { + if (isa<AllocaInst>(*It)) return true; + if (isa<DbgInfoIntrinsic>(*It)) return true; + if (auto *II = dyn_cast<IntrinsicInst>(It)) { + if (II->getIntrinsicID() == llvm::Intrinsic::localescape) return true; + } + + return false; +} + +void GCOVProfiler::AddFlushBeforeForkAndExec() { + SmallVector<Instruction *, 2> ForkAndExecs; + for (auto &F : M->functions()) { + auto *TLI = &GetTLI(F); + for (auto &I : instructions(F)) { + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + if (Function *Callee = CI->getCalledFunction()) { + LibFunc LF; + if (TLI->getLibFunc(*Callee, LF) && + (LF == LibFunc_fork || LF == LibFunc_execl || + LF == LibFunc_execle || LF == LibFunc_execlp || + LF == LibFunc_execv || LF == LibFunc_execvp || + LF == LibFunc_execve || LF == LibFunc_execvpe || + LF == LibFunc_execvP)) { + ForkAndExecs.push_back(&I); + } + } + } + } + } + + // We need to split the block after the fork/exec call + // because else the counters for the lines after will be + // the same as before the call. + for (auto I : ForkAndExecs) { + IRBuilder<> Builder(I); + FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false); + FunctionCallee GCOVFlush = M->getOrInsertFunction("__gcov_flush", FTy); + Builder.CreateCall(GCOVFlush); + I->getParent()->splitBasicBlock(I); + } +} + +void GCOVProfiler::emitProfileNotes() { + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (!CU_Nodes) return; + + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + // Each compile unit gets its own .gcno file. 
This means that whether we run + // this pass over the original .o's as they're produced, or run it after + // LTO, we'll generate the same .gcno files. + + auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); + + // Skip module skeleton (and module) CUs. + if (CU->getDWOId()) + continue; + + std::error_code EC; + raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, + sys::fs::OF_None); + if (EC) { + Ctx->emitError(Twine("failed to open coverage notes file for writing: ") + + EC.message()); + continue; + } + + std::string EdgeDestinations; + + unsigned FunctionIdent = 0; + for (auto &F : M->functions()) { + DISubprogram *SP = F.getSubprogram(); + if (!SP) continue; + if (!functionHasLines(F) || !isFunctionInstrumented(F)) + continue; + // TODO: Functions using scope-based EH are currently not supported. + if (isUsingScopeBasedEH(F)) continue; + + // gcov expects every function to start with an entry block that has a + // single successor, so split the entry block to make sure of that. + BasicBlock &EntryBlock = F.getEntryBlock(); + BasicBlock::iterator It = EntryBlock.begin(); + while (shouldKeepInEntry(It)) + ++It; + EntryBlock.splitBasicBlock(It); + + Funcs.push_back(std::make_unique<GCOVFunction>(SP, &F, &out, FunctionIdent++, + Options.UseCfgChecksum, + Options.ExitBlockBeforeBody)); + GCOVFunction &Func = *Funcs.back(); + + // Add the function line number to the lines of the entry block + // to have a counter for the function definition. + uint32_t Line = SP->getLine(); + auto Filename = getFilename(SP); + Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); + + for (auto &BB : F) { + GCOVBlock &Block = Func.getBlock(&BB); + Instruction *TI = BB.getTerminator(); + if (int successors = TI->getNumSuccessors()) { + for (int i = 0; i != successors; ++i) { + Block.addEdge(Func.getBlock(TI->getSuccessor(i))); + } + } else if (isa<ReturnInst>(TI)) { + Block.addEdge(Func.getReturnBlock()); + } + + for (auto &I : BB) { + // Debug intrinsic locations correspond to the location of the + // declaration, not necessarily any statements or expressions. + if (isa<DbgInfoIntrinsic>(&I)) continue; + + const DebugLoc &Loc = I.getDebugLoc(); + if (!Loc) + continue; + + // Artificial lines such as calls to the global constructors. + if (Loc.getLine() == 0 || Loc.isImplicitCode()) + continue; + + if (Line == Loc.getLine()) continue; + Line = Loc.getLine(); + if (SP != getDISubprogram(Loc.getScope())) + continue; + + GCOVLines &Lines = Block.getFile(Filename); + Lines.addLine(Loc.getLine()); + } + Line = 0; + } + EdgeDestinations += Func.getEdgeDestinations(); + } + + FileChecksums.push_back(hash_value(EdgeDestinations)); + out.write("oncg", 4); + out.write(ReversedVersion, 4); + out.write(reinterpret_cast<char*>(&FileChecksums.back()), 4); + + for (auto &Func : Funcs) { + Func->setCfgChecksum(FileChecksums.back()); + Func->writeOut(); + } + + out.write("\0\0\0\0\0\0\0\0", 8); // EOF + out.close(); + } +} + +bool GCOVProfiler::emitProfileArcs() { + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (!CU_Nodes) return false; + + bool Result = false; + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; + for (auto &F : M->functions()) { + DISubprogram *SP = F.getSubprogram(); + if (!SP) continue; + if (!functionHasLines(F) || !isFunctionInstrumented(F)) + continue; + // TODO: Functions using scope-based EH are currently not supported. 
+      if (isUsingScopeBasedEH(F)) continue;
+      if (!Result) Result = true;
+
+      DenseMap<std::pair<BasicBlock *, BasicBlock *>, unsigned> EdgeToCounter;
+      unsigned Edges = 0;
+      for (auto &BB : F) {
+        Instruction *TI = BB.getTerminator();
+        if (isa<ReturnInst>(TI)) {
+          EdgeToCounter[{&BB, nullptr}] = Edges++;
+        } else {
+          for (BasicBlock *Succ : successors(TI)) {
+            EdgeToCounter[{&BB, Succ}] = Edges++;
+          }
+        }
+      }
+
+      ArrayType *CounterTy =
+        ArrayType::get(Type::getInt64Ty(*Ctx), Edges);
+      GlobalVariable *Counters =
+        new GlobalVariable(*M, CounterTy, false,
+                           GlobalValue::InternalLinkage,
+                           Constant::getNullValue(CounterTy),
+                           "__llvm_gcov_ctr");
+      CountersBySP.push_back(std::make_pair(Counters, SP));
+
+      // If a BB has several predecessors, use a PHINode to select
+      // the correct counter.
+      for (auto &BB : F) {
+        const unsigned EdgeCount =
+            std::distance(pred_begin(&BB), pred_end(&BB));
+        if (EdgeCount) {
+          // The phi node must be at the beginning of the BB.
+          IRBuilder<> BuilderForPhi(&*BB.begin());
+          Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
+          PHINode *Phi = BuilderForPhi.CreatePHI(Int64PtrTy, EdgeCount);
+          for (BasicBlock *Pred : predecessors(&BB)) {
+            auto It = EdgeToCounter.find({Pred, &BB});
+            assert(It != EdgeToCounter.end());
+            const unsigned Edge = It->second;
+            Value *EdgeCounter = BuilderForPhi.CreateConstInBoundsGEP2_64(
+                Counters->getValueType(), Counters, 0, Edge);
+            Phi->addIncoming(EdgeCounter, Pred);
+          }
+
+          // Skip phis, landingpads.
+          IRBuilder<> Builder(&*BB.getFirstInsertionPt());
+          Value *Count = Builder.CreateLoad(Builder.getInt64Ty(), Phi);
+          Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+          Builder.CreateStore(Count, Phi);
+
+          Instruction *TI = BB.getTerminator();
+          if (isa<ReturnInst>(TI)) {
+            auto It = EdgeToCounter.find({&BB, nullptr});
+            assert(It != EdgeToCounter.end());
+            const unsigned Edge = It->second;
+            Value *Counter = Builder.CreateConstInBoundsGEP2_64(
+                Counters->getValueType(), Counters, 0, Edge);
+            Value *Count = Builder.CreateLoad(Builder.getInt64Ty(), Counter);
+            Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+            Builder.CreateStore(Count, Counter);
+          }
+        }
+      }
+    }
+
+    Function *WriteoutF = insertCounterWriteout(CountersBySP);
+    Function *FlushF = insertFlush(CountersBySP);
+
+    // Create a small bit of code that registers the "__llvm_gcov_writeout" to
+    // be executed at exit and the "__llvm_gcov_flush" function to be executed
+    // when "__gcov_flush" is called.
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+    Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
+                                   "__llvm_gcov_init", M);
+    F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+    F->setLinkage(GlobalValue::InternalLinkage);
+    F->addFnAttr(Attribute::NoInline);
+    if (Options.NoRedZone)
+      F->addFnAttr(Attribute::NoRedZone);
+
+    BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+    IRBuilder<> Builder(BB);
+
+    FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+    Type *Params[] = {
+      PointerType::get(FTy, 0),
+      PointerType::get(FTy, 0)
+    };
+    FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
+
+    // Initialize the environment and register the local writeout and flush
+    // functions.
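+    // The runtime entry point is assumed to have roughly this C signature
+    // (a sketch; the actual declaration lives in the profiling runtime):
+    //   void llvm_gcov_init(void (*writeout)(void), void (*flush)(void));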
+ FunctionCallee GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy); + Builder.CreateCall(GCOVInit, {WriteoutF, FlushF}); + Builder.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); + } + + return Result; +} + +FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) { + Type *Args[] = { + Type::getInt8PtrTy(*Ctx), // const char *orig_filename + Type::getInt8PtrTy(*Ctx), // const char version[4] + Type::getInt32Ty(*Ctx), // uint32_t checksum + }; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + AttributeList AL; + if (auto AK = TLI->getExtAttrForI32Param(false)) + AL = AL.addParamAttribute(*Ctx, 2, AK); + FunctionCallee Res = M->getOrInsertFunction("llvm_gcda_start_file", FTy, AL); + return Res; +} + +FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) { + Type *Args[] = { + Type::getInt32Ty(*Ctx), // uint32_t ident + Type::getInt8PtrTy(*Ctx), // const char *function_name + Type::getInt32Ty(*Ctx), // uint32_t func_checksum + Type::getInt8Ty(*Ctx), // uint8_t use_extra_checksum + Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum + }; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + AttributeList AL; + if (auto AK = TLI->getExtAttrForI32Param(false)) { + AL = AL.addParamAttribute(*Ctx, 0, AK); + AL = AL.addParamAttribute(*Ctx, 2, AK); + AL = AL.addParamAttribute(*Ctx, 3, AK); + AL = AL.addParamAttribute(*Ctx, 4, AK); + } + return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); +} + +FunctionCallee GCOVProfiler::getEmitArcsFunc(const TargetLibraryInfo *TLI) { + Type *Args[] = { + Type::getInt32Ty(*Ctx), // uint32_t num_counters + Type::getInt64PtrTy(*Ctx), // uint64_t *counters + }; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + AttributeList AL; + if (auto AK = TLI->getExtAttrForI32Param(false)) + AL = AL.addParamAttribute(*Ctx, 0, AK); + return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy, AL); +} + +FunctionCallee GCOVProfiler::getSummaryInfoFunc() { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_gcda_summary_info", FTy); +} + +FunctionCallee GCOVProfiler::getEndFileFunc() { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_gcda_end_file", FTy); +} + +Function *GCOVProfiler::insertCounterWriteout( + ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) { + FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *WriteoutF = M->getFunction("__llvm_gcov_writeout"); + if (!WriteoutF) + WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage, + "__llvm_gcov_writeout", M); + WriteoutF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + WriteoutF->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + WriteoutF->addFnAttr(Attribute::NoRedZone); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF); + IRBuilder<> Builder(BB); + + auto *TLI = &GetTLI(*WriteoutF); + + FunctionCallee StartFile = getStartFileFunc(TLI); + FunctionCallee EmitFunction = getEmitFunctionFunc(TLI); + FunctionCallee EmitArcs = getEmitArcsFunc(TLI); + FunctionCallee SummaryInfo = getSummaryInfoFunc(); + FunctionCallee EndFile = getEndFileFunc(); + + NamedMDNode *CUNodes = M->getNamedMetadata("llvm.dbg.cu"); + if (!CUNodes) { + Builder.CreateRetVoid(); + return WriteoutF; + } + + // Collect the relevant data into a large constant data structure that we can + // walk to write out 
everything. + StructType *StartFileCallArgsTy = StructType::create( + {Builder.getInt8PtrTy(), Builder.getInt8PtrTy(), Builder.getInt32Ty()}); + StructType *EmitFunctionCallArgsTy = StructType::create( + {Builder.getInt32Ty(), Builder.getInt8PtrTy(), Builder.getInt32Ty(), + Builder.getInt8Ty(), Builder.getInt32Ty()}); + StructType *EmitArcsCallArgsTy = StructType::create( + {Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()}); + StructType *FileInfoTy = + StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(), + EmitFunctionCallArgsTy->getPointerTo(), + EmitArcsCallArgsTy->getPointerTo()}); + + Constant *Zero32 = Builder.getInt32(0); + // Build an explicit array of two zeros for use in ConstantExpr GEP building. + Constant *TwoZero32s[] = {Zero32, Zero32}; + + SmallVector<Constant *, 8> FileInfos; + for (int i : llvm::seq<int>(0, CUNodes->getNumOperands())) { + auto *CU = cast<DICompileUnit>(CUNodes->getOperand(i)); + + // Skip module skeleton (and module) CUs. + if (CU->getDWOId()) + continue; + + std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA); + uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i]; + auto *StartFileCallArgs = ConstantStruct::get( + StartFileCallArgsTy, {Builder.CreateGlobalStringPtr(FilenameGcda), + Builder.CreateGlobalStringPtr(ReversedVersion), + Builder.getInt32(CfgChecksum)}); + + SmallVector<Constant *, 8> EmitFunctionCallArgsArray; + SmallVector<Constant *, 8> EmitArcsCallArgsArray; + for (int j : llvm::seq<int>(0, CountersBySP.size())) { + auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second); + uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum(); + EmitFunctionCallArgsArray.push_back(ConstantStruct::get( + EmitFunctionCallArgsTy, + {Builder.getInt32(j), + Options.FunctionNamesInData + ? Builder.CreateGlobalStringPtr(getFunctionName(SP)) + : Constant::getNullValue(Builder.getInt8PtrTy()), + Builder.getInt32(FuncChecksum), + Builder.getInt8(Options.UseCfgChecksum), + Builder.getInt32(CfgChecksum)})); + + GlobalVariable *GV = CountersBySP[j].first; + unsigned Arcs = cast<ArrayType>(GV->getValueType())->getNumElements(); + EmitArcsCallArgsArray.push_back(ConstantStruct::get( + EmitArcsCallArgsTy, + {Builder.getInt32(Arcs), ConstantExpr::getInBoundsGetElementPtr( + GV->getValueType(), GV, TwoZero32s)})); + } + // Create global arrays for the two emit calls. 
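+    // (In C terms, each element assembled here corresponds roughly to this
+    // illustrative sketch:
+    //   struct FileInfo {
+    //     StartFileCallArgs StartFile;     // {filename, version, checksum}
+    //     uint32_t NumCounters;
+    //     EmitFunctionCallArgs *Functions; // one element per function
+    //     EmitArcsCallArgs *Arcs;          // {num_counters, counters}
+    //   };)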
+ int CountersSize = CountersBySP.size(); + assert(CountersSize == (int)EmitFunctionCallArgsArray.size() && + "Mismatched array size!"); + assert(CountersSize == (int)EmitArcsCallArgsArray.size() && + "Mismatched array size!"); + auto *EmitFunctionCallArgsArrayTy = + ArrayType::get(EmitFunctionCallArgsTy, CountersSize); + auto *EmitFunctionCallArgsArrayGV = new GlobalVariable( + *M, EmitFunctionCallArgsArrayTy, /*isConstant*/ true, + GlobalValue::InternalLinkage, + ConstantArray::get(EmitFunctionCallArgsArrayTy, + EmitFunctionCallArgsArray), + Twine("__llvm_internal_gcov_emit_function_args.") + Twine(i)); + auto *EmitArcsCallArgsArrayTy = + ArrayType::get(EmitArcsCallArgsTy, CountersSize); + EmitFunctionCallArgsArrayGV->setUnnamedAddr( + GlobalValue::UnnamedAddr::Global); + auto *EmitArcsCallArgsArrayGV = new GlobalVariable( + *M, EmitArcsCallArgsArrayTy, /*isConstant*/ true, + GlobalValue::InternalLinkage, + ConstantArray::get(EmitArcsCallArgsArrayTy, EmitArcsCallArgsArray), + Twine("__llvm_internal_gcov_emit_arcs_args.") + Twine(i)); + EmitArcsCallArgsArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + FileInfos.push_back(ConstantStruct::get( + FileInfoTy, + {StartFileCallArgs, Builder.getInt32(CountersSize), + ConstantExpr::getInBoundsGetElementPtr(EmitFunctionCallArgsArrayTy, + EmitFunctionCallArgsArrayGV, + TwoZero32s), + ConstantExpr::getInBoundsGetElementPtr( + EmitArcsCallArgsArrayTy, EmitArcsCallArgsArrayGV, TwoZero32s)})); + } + + // If we didn't find anything to actually emit, bail on out. + if (FileInfos.empty()) { + Builder.CreateRetVoid(); + return WriteoutF; + } + + // To simplify code, we cap the number of file infos we write out to fit + // easily in a 32-bit signed integer. This gives consistent behavior between + // 32-bit and 64-bit systems without requiring (potentially very slow) 64-bit + // operations on 32-bit systems. It also seems unreasonable to try to handle + // more than 2 billion files. + if ((int64_t)FileInfos.size() > (int64_t)INT_MAX) + FileInfos.resize(INT_MAX); + + // Create a global for the entire data structure so we can walk it more + // easily. + auto *FileInfoArrayTy = ArrayType::get(FileInfoTy, FileInfos.size()); + auto *FileInfoArrayGV = new GlobalVariable( + *M, FileInfoArrayTy, /*isConstant*/ true, GlobalValue::InternalLinkage, + ConstantArray::get(FileInfoArrayTy, FileInfos), + "__llvm_internal_gcov_emit_file_info"); + FileInfoArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // Create the CFG for walking this data structure. + auto *FileLoopHeader = + BasicBlock::Create(*Ctx, "file.loop.header", WriteoutF); + auto *CounterLoopHeader = + BasicBlock::Create(*Ctx, "counter.loop.header", WriteoutF); + auto *FileLoopLatch = BasicBlock::Create(*Ctx, "file.loop.latch", WriteoutF); + auto *ExitBB = BasicBlock::Create(*Ctx, "exit", WriteoutF); + + // We always have at least one file, so just branch to the header. + Builder.CreateBr(FileLoopHeader); + + // The index into the files structure is our loop induction variable. 
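+  // The IR built below is roughly equivalent to this C sketch:
+  //   for (uint32_t i = 0; i < num_files; ++i) {
+  //     llvm_gcda_start_file(...);
+  //     for (uint32_t j = 0; j < file_info[i].NumCounters; ++j) {
+  //       llvm_gcda_emit_function(...);
+  //       llvm_gcda_emit_arcs(...);
+  //     }
+  //     llvm_gcda_summary_info();
+  //     llvm_gcda_end_file();
+  //   }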
+ Builder.SetInsertPoint(FileLoopHeader); + PHINode *IV = + Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2); + IV->addIncoming(Builder.getInt32(0), BB); + auto *FileInfoPtr = Builder.CreateInBoundsGEP( + FileInfoArrayTy, FileInfoArrayGV, {Builder.getInt32(0), IV}); + auto *StartFileCallArgsPtr = + Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 0); + auto *StartFileCall = Builder.CreateCall( + StartFile, + {Builder.CreateLoad(StartFileCallArgsTy->getElementType(0), + Builder.CreateStructGEP(StartFileCallArgsTy, + StartFileCallArgsPtr, 0)), + Builder.CreateLoad(StartFileCallArgsTy->getElementType(1), + Builder.CreateStructGEP(StartFileCallArgsTy, + StartFileCallArgsPtr, 1)), + Builder.CreateLoad(StartFileCallArgsTy->getElementType(2), + Builder.CreateStructGEP(StartFileCallArgsTy, + StartFileCallArgsPtr, 2))}); + if (auto AK = TLI->getExtAttrForI32Param(false)) + StartFileCall->addParamAttr(2, AK); + auto *NumCounters = + Builder.CreateLoad(FileInfoTy->getElementType(1), + Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 1)); + auto *EmitFunctionCallArgsArray = + Builder.CreateLoad(FileInfoTy->getElementType(2), + Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 2)); + auto *EmitArcsCallArgsArray = + Builder.CreateLoad(FileInfoTy->getElementType(3), + Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 3)); + auto *EnterCounterLoopCond = + Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters); + Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch); + + Builder.SetInsertPoint(CounterLoopHeader); + auto *JV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2); + JV->addIncoming(Builder.getInt32(0), FileLoopHeader); + auto *EmitFunctionCallArgsPtr = Builder.CreateInBoundsGEP( + EmitFunctionCallArgsTy, EmitFunctionCallArgsArray, JV); + auto *EmitFunctionCall = Builder.CreateCall( + EmitFunction, + {Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(0), + Builder.CreateStructGEP(EmitFunctionCallArgsTy, + EmitFunctionCallArgsPtr, 0)), + Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(1), + Builder.CreateStructGEP(EmitFunctionCallArgsTy, + EmitFunctionCallArgsPtr, 1)), + Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(2), + Builder.CreateStructGEP(EmitFunctionCallArgsTy, + EmitFunctionCallArgsPtr, 2)), + Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(3), + Builder.CreateStructGEP(EmitFunctionCallArgsTy, + EmitFunctionCallArgsPtr, 3)), + Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(4), + Builder.CreateStructGEP(EmitFunctionCallArgsTy, + EmitFunctionCallArgsPtr, + 4))}); + if (auto AK = TLI->getExtAttrForI32Param(false)) { + EmitFunctionCall->addParamAttr(0, AK); + EmitFunctionCall->addParamAttr(2, AK); + EmitFunctionCall->addParamAttr(3, AK); + EmitFunctionCall->addParamAttr(4, AK); + } + auto *EmitArcsCallArgsPtr = + Builder.CreateInBoundsGEP(EmitArcsCallArgsTy, EmitArcsCallArgsArray, JV); + auto *EmitArcsCall = Builder.CreateCall( + EmitArcs, + {Builder.CreateLoad( + EmitArcsCallArgsTy->getElementType(0), + Builder.CreateStructGEP(EmitArcsCallArgsTy, EmitArcsCallArgsPtr, 0)), + Builder.CreateLoad(EmitArcsCallArgsTy->getElementType(1), + Builder.CreateStructGEP(EmitArcsCallArgsTy, + EmitArcsCallArgsPtr, 1))}); + if (auto AK = TLI->getExtAttrForI32Param(false)) + EmitArcsCall->addParamAttr(0, AK); + auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1)); + auto *CounterLoopCond = Builder.CreateICmpSLT(NextJV, NumCounters); + Builder.CreateCondBr(CounterLoopCond, CounterLoopHeader, 
FileLoopLatch);
+  JV->addIncoming(NextJV, CounterLoopHeader);
+
+  Builder.SetInsertPoint(FileLoopLatch);
+  Builder.CreateCall(SummaryInfo, {});
+  Builder.CreateCall(EndFile, {});
+  auto *NextIV = Builder.CreateAdd(IV, Builder.getInt32(1));
+  auto *FileLoopCond =
+      Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size()));
+  Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB);
+  IV->addIncoming(NextIV, FileLoopLatch);
+
+  Builder.SetInsertPoint(ExitBB);
+  Builder.CreateRetVoid();
+
+  return WriteoutF;
+}
+
+Function *GCOVProfiler::
+insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  Function *FlushF = M->getFunction("__llvm_gcov_flush");
+  if (!FlushF)
+    FlushF = Function::Create(FTy, GlobalValue::InternalLinkage,
+                              "__llvm_gcov_flush", M);
+  else
+    FlushF->setLinkage(GlobalValue::InternalLinkage);
+  FlushF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  FlushF->addFnAttr(Attribute::NoInline);
+  if (Options.NoRedZone)
+    FlushF->addFnAttr(Attribute::NoRedZone);
+
+  BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF);
+
+  // Write out the current counters.
+  Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+  assert(WriteoutF && "Need to create the writeout function first!");
+
+  IRBuilder<> Builder(Entry);
+  Builder.CreateCall(WriteoutF, {});
+
+  // Zero out the counters.
+  for (const auto &I : CountersBySP) {
+    GlobalVariable *GV = I.first;
+    Constant *Null = Constant::getNullValue(GV->getValueType());
+    Builder.CreateStore(Null, GV);
+  }
+
+  Type *RetTy = FlushF->getReturnType();
+  if (RetTy == Type::getVoidTy(*Ctx))
+    Builder.CreateRetVoid();
+  else if (RetTy->isIntegerTy())
+    // Used if __llvm_gcov_flush was implicitly declared.
+    Builder.CreateRet(ConstantInt::get(RetTy, 0));
+  else
+    report_fatal_error("invalid return type for __llvm_gcov_flush");
+
+  return FlushF;
+}
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
new file mode 100644
index 000000000000..f87132ee4758
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -0,0 +1,1521 @@
+//===- HWAddressSanitizer.cpp - memory error detector ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file is a part of HWAddressSanitizer, an address sanity checker
+/// based on tagged addressing.
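+///
+/// A rough sketch of the scheme: memory is organized in 16-byte granules
+/// (1 << kDefaultShadowScale), each mapped to one shadow byte holding a tag;
+/// pointers carry a matching tag in their top byte (kPointerTagShift = 56),
+/// and instrumented accesses check that the pointer tag equals the memory tag.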
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hwasan"
+
+static const char *const kHwasanModuleCtorName = "hwasan.module_ctor";
+static const char *const kHwasanNoteName = "hwasan.note";
+static const char *const kHwasanInitName = "__hwasan_init";
+static const char *const kHwasanPersonalityThunkName =
+    "__hwasan_personality_thunk";
+
+static const char *const kHwasanShadowMemoryDynamicAddress =
+    "__hwasan_shadow_memory_dynamic_address";
+
+// Access sizes are powers of two: 1, 2, 4, 8, 16.
+static const size_t kNumberOfAccessSizes = 5;
+
+static const size_t kDefaultShadowScale = 4;
+static const uint64_t kDynamicShadowSentinel =
+    std::numeric_limits<uint64_t>::max();
+static const unsigned kPointerTagShift = 56;
+
+static const unsigned kShadowBaseAlignment = 32;
+
+static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
+    "hwasan-memory-access-callback-prefix",
+    cl::desc("Prefix for memory access callbacks"), cl::Hidden,
+    cl::init("__hwasan_"));
+
+static cl::opt<bool>
+    ClInstrumentWithCalls("hwasan-instrument-with-calls",
+                          cl::desc("instrument reads and writes with callbacks"),
+                          cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClInstrumentReads("hwasan-instrument-reads",
+                                       cl::desc("instrument read instructions"),
+                                       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentWrites(
+    "hwasan-instrument-writes", cl::desc("instrument write instructions"),
+    cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentAtomics(
+    "hwasan-instrument-atomics",
+    cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> ClRecover(
+    "hwasan-recover",
+    cl::desc("Enable recovery mode (continue-after-error)."),
+    cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
+                                       cl::desc("instrument stack (allocas)"),
+                                       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClUARRetagToZero(
+    "hwasan-uar-retag-to-zero",
+    cl::desc("Clear alloca tags before returning from the function to allow "
+             "mixing non-instrumented and instrumented function calls."
+             " When set to false, allocas are retagged before returning from "
+             "the function to detect use after return."),
+    cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClGenerateTagsWithCalls(
+    "hwasan-generate-tags-with-calls",
+    cl::desc("generate new tags with runtime library calls"), cl::Hidden,
+    cl::init(false));
+
+static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"),
+                               cl::Hidden, cl::init(false));
+
+static cl::opt<int> ClMatchAllTag(
+    "hwasan-match-all-tag",
+    cl::desc("don't report bad accesses via pointers with this tag"),
+    cl::Hidden, cl::init(-1));
+
+static cl::opt<bool> ClEnableKhwasan(
+    "hwasan-kernel",
+    cl::desc("Enable KernelHWAddressSanitizer instrumentation"),
+    cl::Hidden, cl::init(false));
+
+// These flags allow changing the shadow mapping and control how shadow memory
+// is accessed. The shadow mapping looks like:
+// Shadow = (Mem >> scale) + offset
+
+static cl::opt<uint64_t>
+    ClMappingOffset("hwasan-mapping-offset",
+                    cl::desc("HWASan shadow mapping offset [EXPERIMENTAL]"),
+                    cl::Hidden, cl::init(0));
+
+static cl::opt<bool>
+    ClWithIfunc("hwasan-with-ifunc",
+                cl::desc("Access dynamic shadow through an ifunc global on "
+                         "platforms that support this"),
+                cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClWithTls(
+    "hwasan-with-tls",
+    cl::desc("Access dynamic shadow through a thread-local pointer on "
+             "platforms that support this"),
+    cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+    ClRecordStackHistory("hwasan-record-stack-history",
+                         cl::desc("Record stack frames with tagged allocations "
+                                  "in a thread-local ring buffer"),
+                         cl::Hidden, cl::init(true));
+static cl::opt<bool>
+    ClInstrumentMemIntrinsics("hwasan-instrument-mem-intrinsics",
+                              cl::desc("instrument memory intrinsics"),
+                              cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+    ClInstrumentLandingPads("hwasan-instrument-landing-pads",
+                            cl::desc("instrument landing pads"), cl::Hidden,
+                            cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> ClUseShortGranules(
+    "hwasan-use-short-granules",
+    cl::desc("use short granules in allocas and outlined checks"), cl::Hidden,
+    cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> ClInstrumentPersonalityFunctions(
+    "hwasan-instrument-personality-functions",
+    cl::desc("instrument personality functions"), cl::Hidden, cl::init(false),
+    cl::ZeroOrMore);
+
+static cl::opt<bool> ClInlineAllChecks("hwasan-inline-all-checks",
+                                       cl::desc("inline all checks"),
+                                       cl::Hidden, cl::init(false));
+
+namespace {
+
+/// An instrumentation pass implementing detection of addressability bugs
+/// using tagged pointers.
+class HWAddressSanitizer {
+public:
+  explicit HWAddressSanitizer(Module &M, bool CompileKernel = false,
+                              bool Recover = false) : M(M) {
+    this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+    this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ? 
+ ClEnableKhwasan : CompileKernel; + + initializeModule(); + } + + bool sanitizeFunction(Function &F); + void initializeModule(); + + void initializeCallbacks(Module &M); + + Value *getDynamicShadowIfunc(IRBuilder<> &IRB); + Value *getDynamicShadowNonTls(IRBuilder<> &IRB); + + void untagPointerOperand(Instruction *I, Value *Addr); + Value *shadowBase(); + Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); + void instrumentMemAccessInline(Value *Ptr, bool IsWrite, + unsigned AccessSizeIndex, + Instruction *InsertBefore); + void instrumentMemIntrinsic(MemIntrinsic *MI); + bool instrumentMemAccess(Instruction *I); + Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, + uint64_t *TypeSize, unsigned *Alignment, + Value **MaybeMask); + + bool isInterestingAlloca(const AllocaInst &AI); + bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size); + Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag); + Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong); + bool instrumentStack( + SmallVectorImpl<AllocaInst *> &Allocas, + DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap, + SmallVectorImpl<Instruction *> &RetVec, Value *StackTag); + Value *readRegister(IRBuilder<> &IRB, StringRef Name); + bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec); + Value *getNextTagWithCall(IRBuilder<> &IRB); + Value *getStackBaseTag(IRBuilder<> &IRB); + Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, AllocaInst *AI, + unsigned AllocaNo); + Value *getUARTag(IRBuilder<> &IRB, Value *StackTag); + + Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty); + void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord); + + void instrumentGlobal(GlobalVariable *GV, uint8_t Tag); + void instrumentGlobals(); + + void instrumentPersonalityFunctions(); + +private: + LLVMContext *C; + Module &M; + Triple TargetTriple; + FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset; + FunctionCallee HWAsanHandleVfork; + + /// This struct defines the shadow mapping using the rule: + /// shadow = (mem >> Scale) + Offset. + /// If InGlobal is true, then + /// extern char __hwasan_shadow[]; + /// shadow = (mem >> Scale) + &__hwasan_shadow + /// If InTls is true, then + /// extern char *__hwasan_tls; + /// shadow = (mem>>Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment) + struct ShadowMapping { + int Scale; + uint64_t Offset; + bool InGlobal; + bool InTls; + + void init(Triple &TargetTriple); + unsigned getObjectAlignment() const { return 1U << Scale; } + }; + ShadowMapping Mapping; + + Type *VoidTy = Type::getVoidTy(M.getContext()); + Type *IntptrTy; + Type *Int8PtrTy; + Type *Int8Ty; + Type *Int32Ty; + Type *Int64Ty = Type::getInt64Ty(M.getContext()); + + bool CompileKernel; + bool Recover; + bool UseShortGranules; + bool InstrumentLandingPads; + + Function *HwasanCtorFunction; + + FunctionCallee HwasanMemoryAccessCallback[2][kNumberOfAccessSizes]; + FunctionCallee HwasanMemoryAccessCallbackSized[2]; + + FunctionCallee HwasanTagMemoryFunc; + FunctionCallee HwasanGenerateTagFunc; + FunctionCallee HwasanThreadEnterFunc; + + Constant *ShadowGlobal; + + Value *LocalDynamicShadow = nullptr; + Value *StackBaseTag = nullptr; + GlobalValue *ThreadPtrGlobal = nullptr; +}; + +class HWAddressSanitizerLegacyPass : public FunctionPass { +public: + // Pass identification, replacement for typeid. 
+ static char ID; + + explicit HWAddressSanitizerLegacyPass(bool CompileKernel = false, + bool Recover = false) + : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover) {} + + StringRef getPassName() const override { return "HWAddressSanitizer"; } + + bool doInitialization(Module &M) override { + HWASan = std::make_unique<HWAddressSanitizer>(M, CompileKernel, Recover); + return true; + } + + bool runOnFunction(Function &F) override { + return HWASan->sanitizeFunction(F); + } + + bool doFinalization(Module &M) override { + HWASan.reset(); + return false; + } + +private: + std::unique_ptr<HWAddressSanitizer> HWASan; + bool CompileKernel; + bool Recover; +}; + +} // end anonymous namespace + +char HWAddressSanitizerLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN( + HWAddressSanitizerLegacyPass, "hwasan", + "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, + false) +INITIALIZE_PASS_END( + HWAddressSanitizerLegacyPass, "hwasan", + "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, + false) + +FunctionPass *llvm::createHWAddressSanitizerLegacyPassPass(bool CompileKernel, + bool Recover) { + assert(!CompileKernel || Recover); + return new HWAddressSanitizerLegacyPass(CompileKernel, Recover); +} + +HWAddressSanitizerPass::HWAddressSanitizerPass(bool CompileKernel, bool Recover) + : CompileKernel(CompileKernel), Recover(Recover) {} + +PreservedAnalyses HWAddressSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { + HWAddressSanitizer HWASan(M, CompileKernel, Recover); + bool Modified = false; + for (Function &F : M) + Modified |= HWASan.sanitizeFunction(F); + if (Modified) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +/// Module-level initialization. +/// +/// inserts a call to __hwasan_init to the module's constructor list. +void HWAddressSanitizer::initializeModule() { + LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); + auto &DL = M.getDataLayout(); + + TargetTriple = Triple(M.getTargetTriple()); + + Mapping.init(TargetTriple); + + C = &(M.getContext()); + IRBuilder<> IRB(*C); + IntptrTy = IRB.getIntPtrTy(DL); + Int8PtrTy = IRB.getInt8PtrTy(); + Int8Ty = IRB.getInt8Ty(); + Int32Ty = IRB.getInt32Ty(); + + HwasanCtorFunction = nullptr; + + // Older versions of Android do not have the required runtime support for + // short granules, global or personality function instrumentation. On other + // platforms we currently require using the latest version of the runtime. + bool NewRuntime = + !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30); + + UseShortGranules = + ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime; + + // If we don't have personality function support, fall back to landing pads. + InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences() + ? ClInstrumentLandingPads + : !NewRuntime; + + if (!CompileKernel) { + std::tie(HwasanCtorFunction, std::ignore) = + getOrCreateSanitizerCtorAndInitFunctions( + M, kHwasanModuleCtorName, kHwasanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { + Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName); + Ctor->setComdat(CtorComdat); + appendToGlobalCtors(M, Ctor, 0, Ctor); + }); + + bool InstrumentGlobals = + ClGlobals.getNumOccurrences() ? 
ClGlobals : NewRuntime; + if (InstrumentGlobals) + instrumentGlobals(); + + bool InstrumentPersonalityFunctions = + ClInstrumentPersonalityFunctions.getNumOccurrences() + ? ClInstrumentPersonalityFunctions + : NewRuntime; + if (InstrumentPersonalityFunctions) + instrumentPersonalityFunctions(); + } + + if (!TargetTriple.isAndroid()) { + Constant *C = M.getOrInsertGlobal("__hwasan_tls", IntptrTy, [&] { + auto *GV = new GlobalVariable(M, IntptrTy, /*isConstant=*/false, + GlobalValue::ExternalLinkage, nullptr, + "__hwasan_tls", nullptr, + GlobalVariable::InitialExecTLSModel); + appendToCompilerUsed(M, GV); + return GV; + }); + ThreadPtrGlobal = cast<GlobalVariable>(C); + } +} + +void HWAddressSanitizer::initializeCallbacks(Module &M) { + IRBuilder<> IRB(*C); + for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { + const std::string TypeStr = AccessIsWrite ? "store" : "load"; + const std::string EndingStr = Recover ? "_noabort" : ""; + + HwasanMemoryAccessCallbackSized[AccessIsWrite] = M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + TypeStr + "N" + EndingStr, + FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false)); + + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] = + M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + TypeStr + + itostr(1ULL << AccessSizeIndex) + EndingStr, + FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false)); + } + } + + HwasanTagMemoryFunc = M.getOrInsertFunction( + "__hwasan_tag_memory", IRB.getVoidTy(), Int8PtrTy, Int8Ty, IntptrTy); + HwasanGenerateTagFunc = + M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty); + + ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow", + ArrayType::get(IRB.getInt8Ty(), 0)); + + const std::string MemIntrinCallbackPrefix = + CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix; + HWAsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove", + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy); + HWAsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy", + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy); + HWAsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset", + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt32Ty(), IntptrTy); + + HWAsanHandleVfork = + M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy); + + HwasanThreadEnterFunc = + M.getOrInsertFunction("__hwasan_thread_enter", IRB.getVoidTy()); +} + +Value *HWAddressSanitizer::getDynamicShadowIfunc(IRBuilder<> &IRB) { + // An empty inline asm with input reg == output reg. + // An opaque no-op cast, basically. + InlineAsm *Asm = InlineAsm::get( + FunctionType::get(Int8PtrTy, {ShadowGlobal->getType()}, false), + StringRef(""), StringRef("=r,0"), + /*hasSideEffects=*/false); + return IRB.CreateCall(Asm, {ShadowGlobal}, ".hwasan.shadow"); +} + +Value *HWAddressSanitizer::getDynamicShadowNonTls(IRBuilder<> &IRB) { + // Generate code only when dynamic addressing is needed. 
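+  // (Mapping.Offset equals kDynamicShadowSentinel only when the shadow base
+  // cannot be computed at compile time; any other value is emitted as a
+  // constant by shadowBase().)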
+ if (Mapping.Offset != kDynamicShadowSentinel) + return nullptr; + + if (Mapping.InGlobal) { + return getDynamicShadowIfunc(IRB); + } else { + Value *GlobalDynamicAddress = + IRB.GetInsertBlock()->getParent()->getParent()->getOrInsertGlobal( + kHwasanShadowMemoryDynamicAddress, Int8PtrTy); + return IRB.CreateLoad(Int8PtrTy, GlobalDynamicAddress); + } +} + +Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I, + bool *IsWrite, + uint64_t *TypeSize, + unsigned *Alignment, + Value **MaybeMask) { + // Skip memory accesses inserted by another instrumentation. + if (I->hasMetadata("nosanitize")) return nullptr; + + // Do not instrument the load fetching the dynamic shadow address. + if (LocalDynamicShadow == I) + return nullptr; + + Value *PtrOperand = nullptr; + const DataLayout &DL = I->getModule()->getDataLayout(); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (!ClInstrumentReads) return nullptr; + *IsWrite = false; + *TypeSize = DL.getTypeStoreSizeInBits(LI->getType()); + *Alignment = LI->getAlignment(); + PtrOperand = LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (!ClInstrumentWrites) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()); + *Alignment = SI->getAlignment(); + PtrOperand = SI->getPointerOperand(); + } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + if (!ClInstrumentAtomics) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType()); + *Alignment = 0; + PtrOperand = RMW->getPointerOperand(); + } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { + if (!ClInstrumentAtomics) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType()); + *Alignment = 0; + PtrOperand = XCHG->getPointerOperand(); + } + + if (PtrOperand) { + // Do not instrument accesses from different address spaces; we cannot deal + // with them. + Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType()); + if (PtrTy->getPointerAddressSpace() != 0) + return nullptr; + + // Ignore swifterror addresses. + // swifterror memory addresses are mem2reg promoted by instruction + // selection. As such they cannot have regular uses like an instrumentation + // function and it makes no sense to track them as memory. 
+ if (PtrOperand->isSwiftError()) + return nullptr; + } + + return PtrOperand; +} + +static unsigned getPointerOperandIndex(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerOperandIndex(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperandIndex(); + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) + return RMW->getPointerOperandIndex(); + if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) + return XCHG->getPointerOperandIndex(); + report_fatal_error("Unexpected instruction"); + return -1; +} + +static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { + size_t Res = countTrailingZeros(TypeSize / 8); + assert(Res < kNumberOfAccessSizes); + return Res; +} + +void HWAddressSanitizer::untagPointerOperand(Instruction *I, Value *Addr) { + if (TargetTriple.isAArch64()) + return; + + IRBuilder<> IRB(I); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + Value *UntaggedPtr = + IRB.CreateIntToPtr(untagPointer(IRB, AddrLong), Addr->getType()); + I->setOperand(getPointerOperandIndex(I), UntaggedPtr); +} + +Value *HWAddressSanitizer::shadowBase() { + if (LocalDynamicShadow) + return LocalDynamicShadow; + return ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, Mapping.Offset), + Int8PtrTy); +} + +Value *HWAddressSanitizer::memToShadow(Value *Mem, IRBuilder<> &IRB) { + // Mem >> Scale + Value *Shadow = IRB.CreateLShr(Mem, Mapping.Scale); + if (Mapping.Offset == 0) + return IRB.CreateIntToPtr(Shadow, Int8PtrTy); + // (Mem >> Scale) + Offset + return IRB.CreateGEP(Int8Ty, shadowBase(), Shadow); +} + +void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, + unsigned AccessSizeIndex, + Instruction *InsertBefore) { + const int64_t AccessInfo = Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex; + IRBuilder<> IRB(InsertBefore); + + if (!ClInlineAllChecks && TargetTriple.isAArch64() && + TargetTriple.isOSBinFormatELF() && !Recover) { + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy); + IRB.CreateCall(Intrinsic::getDeclaration( + M, UseShortGranules + ? Intrinsic::hwasan_check_memaccess_shortgranules + : Intrinsic::hwasan_check_memaccess), + {shadowBase(), Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); + return; + } + + Value *PtrLong = IRB.CreatePointerCast(Ptr, IntptrTy); + Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift), + IRB.getInt8Ty()); + Value *AddrLong = untagPointer(IRB, PtrLong); + Value *Shadow = memToShadow(AddrLong, IRB); + Value *MemTag = IRB.CreateLoad(Int8Ty, Shadow); + Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag); + + int matchAllTag = ClMatchAllTag.getNumOccurrences() > 0 ? + ClMatchAllTag : (CompileKernel ? 
0xFF : -1); + if (matchAllTag != -1) { + Value *TagNotIgnored = IRB.CreateICmpNE(PtrTag, + ConstantInt::get(PtrTag->getType(), matchAllTag)); + TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored); + } + + Instruction *CheckTerm = + SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false, + MDBuilder(*C).createBranchWeights(1, 100000)); + + IRB.SetInsertPoint(CheckTerm); + Value *OutOfShortGranuleTagRange = + IRB.CreateICmpUGT(MemTag, ConstantInt::get(Int8Ty, 15)); + Instruction *CheckFailTerm = + SplitBlockAndInsertIfThen(OutOfShortGranuleTagRange, CheckTerm, !Recover, + MDBuilder(*C).createBranchWeights(1, 100000)); + + IRB.SetInsertPoint(CheckTerm); + Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(PtrLong, 15), Int8Ty); + PtrLowBits = IRB.CreateAdd( + PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1)); + Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, MemTag); + SplitBlockAndInsertIfThen(PtrLowBitsOOB, CheckTerm, false, + MDBuilder(*C).createBranchWeights(1, 100000), + nullptr, nullptr, CheckFailTerm->getParent()); + + IRB.SetInsertPoint(CheckTerm); + Value *InlineTagAddr = IRB.CreateOr(AddrLong, 15); + InlineTagAddr = IRB.CreateIntToPtr(InlineTagAddr, Int8PtrTy); + Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr); + Value *InlineTagMismatch = IRB.CreateICmpNE(PtrTag, InlineTag); + SplitBlockAndInsertIfThen(InlineTagMismatch, CheckTerm, false, + MDBuilder(*C).createBranchWeights(1, 100000), + nullptr, nullptr, CheckFailTerm->getParent()); + + IRB.SetInsertPoint(CheckFailTerm); + InlineAsm *Asm; + switch (TargetTriple.getArch()) { + case Triple::x86_64: + // The signal handler will find the data address in rdi. + Asm = InlineAsm::get( + FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false), + "int3\nnopl " + itostr(0x40 + AccessInfo) + "(%rax)", + "{rdi}", + /*hasSideEffects=*/true); + break; + case Triple::aarch64: + case Triple::aarch64_be: + // The signal handler will find the data address in x0. + Asm = InlineAsm::get( + FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false), + "brk #" + itostr(0x900 + AccessInfo), + "{x0}", + /*hasSideEffects=*/true); + break; + default: + report_fatal_error("unsupported architecture"); + } + IRB.CreateCall(Asm, PtrLong); + if (Recover) + cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent()); +} + +void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { + IRBuilder<> IRB(MI); + if (isa<MemTransferInst>(MI)) { + IRB.CreateCall( + isa<MemMoveInst>(MI) ? 
HWAsanMemmove : HWAsanMemcpy, + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); + } else if (isa<MemSetInst>(MI)) { + IRB.CreateCall( + HWAsanMemset, + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); + } + MI->eraseFromParent(); +} + +bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) { + LLVM_DEBUG(dbgs() << "Instrumenting: " << *I << "\n"); + bool IsWrite = false; + unsigned Alignment = 0; + uint64_t TypeSize = 0; + Value *MaybeMask = nullptr; + + if (ClInstrumentMemIntrinsics && isa<MemIntrinsic>(I)) { + instrumentMemIntrinsic(cast<MemIntrinsic>(I)); + return true; + } + + Value *Addr = + isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment, &MaybeMask); + + if (!Addr) + return false; + + if (MaybeMask) + return false; //FIXME + + IRBuilder<> IRB(I); + if (isPowerOf2_64(TypeSize) && + (TypeSize / 8 <= (1UL << (kNumberOfAccessSizes - 1))) && + (Alignment >= (1UL << Mapping.Scale) || Alignment == 0 || + Alignment >= TypeSize / 8)) { + size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); + if (ClInstrumentWithCalls) { + IRB.CreateCall(HwasanMemoryAccessCallback[IsWrite][AccessSizeIndex], + IRB.CreatePointerCast(Addr, IntptrTy)); + } else { + instrumentMemAccessInline(Addr, IsWrite, AccessSizeIndex, I); + } + } else { + IRB.CreateCall(HwasanMemoryAccessCallbackSized[IsWrite], + {IRB.CreatePointerCast(Addr, IntptrTy), + ConstantInt::get(IntptrTy, TypeSize / 8)}); + } + untagPointerOperand(I, Addr); + + return true; +} + +static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) { + uint64_t ArraySize = 1; + if (AI.isArrayAllocation()) { + const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize()); + assert(CI && "non-constant array size"); + ArraySize = CI->getZExtValue(); + } + Type *Ty = AI.getAllocatedType(); + uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty); + return SizeInBytes * ArraySize; +} + +bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, + Value *Tag, size_t Size) { + size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + if (!UseShortGranules) + Size = AlignedSize; + + Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty()); + if (ClInstrumentWithCalls) { + IRB.CreateCall(HwasanTagMemoryFunc, + {IRB.CreatePointerCast(AI, Int8PtrTy), JustTag, + ConstantInt::get(IntptrTy, AlignedSize)}); + } else { + size_t ShadowSize = Size >> Mapping.Scale; + Value *ShadowPtr = memToShadow(IRB.CreatePointerCast(AI, IntptrTy), IRB); + // If this memset is not inlined, it will be intercepted in the hwasan + // runtime library. That's OK, because the interceptor skips the checks if + // the address is in the shadow region. + // FIXME: the interceptor is not as fast as real memset. Consider lowering + // llvm.memset right here into either a sequence of stores, or a call to + // hwasan_tag_memory. 
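+    // Worked example (a sketch, assuming 16-byte granules, i.e. Scale == 4):
+    // an 18-byte alloca has AlignedSize == 32 and ShadowSize == 18 >> 4 == 1,
+    // so one full shadow byte is memset to the tag below; the second shadow
+    // byte then receives the short granule size 18 % 16 == 2, and the real
+    // tag is stored in the last byte of the granule itself (offset 31), which
+    // is the byte the inline short granule check loads and compares.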
+ if (ShadowSize) + IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1); + if (Size != AlignedSize) { + IRB.CreateStore( + ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()), + IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize)); + IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32( + Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy), + AlignedSize - 1)); + } + } + return true; +} + +static unsigned RetagMask(unsigned AllocaNo) { + // A list of 8-bit numbers that have at most one run of non-zero bits. + // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for these + // masks. + // The list does not include the value 255, which is used for UAR. + // + // Because we are more likely to use earlier elements of this list than later + // ones, it is sorted in increasing order of probability of collision with a + // mask allocated (temporally) nearby. The program that generated this list + // can be found at: + // https://github.com/google/sanitizers/blob/master/hwaddress-sanitizer/sort_masks.py + static unsigned FastMasks[] = {0, 128, 64, 192, 32, 96, 224, 112, 240, + 48, 16, 120, 248, 56, 24, 8, 124, 252, + 60, 28, 12, 4, 126, 254, 62, 30, 14, + 6, 2, 127, 63, 31, 15, 7, 3, 1}; + return FastMasks[AllocaNo % (sizeof(FastMasks) / sizeof(FastMasks[0]))]; +} + +Value *HWAddressSanitizer::getNextTagWithCall(IRBuilder<> &IRB) { + return IRB.CreateZExt(IRB.CreateCall(HwasanGenerateTagFunc), IntptrTy); +} + +Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) { + if (ClGenerateTagsWithCalls) + return getNextTagWithCall(IRB); + if (StackBaseTag) + return StackBaseTag; + // FIXME: use addressofreturnaddress (but implement it in aarch64 backend + // first). + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); + Value *StackPointer = IRB.CreateCall( + GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())}); + + // Extract some entropy from the stack pointer for the tags. + // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ + // between functions). + Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy); + Value *StackTag = + IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20), + "hwasan.stack.base.tag"); + return StackTag; +} + +Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag, + AllocaInst *AI, unsigned AllocaNo) { + if (ClGenerateTagsWithCalls) + return getNextTagWithCall(IRB); + return IRB.CreateXor(StackTag, + ConstantInt::get(IntptrTy, RetagMask(AllocaNo))); +} + +Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB, Value *StackTag) { + if (ClUARRetagToZero) + return ConstantInt::get(IntptrTy, 0); + if (ClGenerateTagsWithCalls) + return getNextTagWithCall(IRB); + return IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU)); +} + +// Add a tag to an address. +Value *HWAddressSanitizer::tagPointer(IRBuilder<> &IRB, Type *Ty, + Value *PtrLong, Value *Tag) { + Value *TaggedPtrLong; + if (CompileKernel) { + // Kernel addresses have 0xFF in the most significant byte. 
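+    // Worked example: with kPointerTagShift == 56 and Tag == 0xAB, ShiftedTag
+    // is 0xABFFFFFFFFFFFFFF, so ANDing it into a kernel pointer of the form
+    // 0xFFxxxxxxxxxxxxxx replaces the all-ones top byte with 0xAB while
+    // leaving the low 56 address bits intact.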
+ Value *ShiftedTag = IRB.CreateOr( + IRB.CreateShl(Tag, kPointerTagShift), + ConstantInt::get(IntptrTy, (1ULL << kPointerTagShift) - 1)); + TaggedPtrLong = IRB.CreateAnd(PtrLong, ShiftedTag); + } else { + // Userspace can simply do OR (tag << 56); + Value *ShiftedTag = IRB.CreateShl(Tag, kPointerTagShift); + TaggedPtrLong = IRB.CreateOr(PtrLong, ShiftedTag); + } + return IRB.CreateIntToPtr(TaggedPtrLong, Ty); +} + +// Remove tag from an address. +Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) { + Value *UntaggedPtrLong; + if (CompileKernel) { + // Kernel addresses have 0xFF in the most significant byte. + UntaggedPtrLong = IRB.CreateOr(PtrLong, + ConstantInt::get(PtrLong->getType(), 0xFFULL << kPointerTagShift)); + } else { + // Userspace addresses have 0x00. + UntaggedPtrLong = IRB.CreateAnd(PtrLong, + ConstantInt::get(PtrLong->getType(), ~(0xFFULL << kPointerTagShift))); + } + return UntaggedPtrLong; +} + +Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) { + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + if (TargetTriple.isAArch64() && TargetTriple.isAndroid()) { + // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER + // in Bionic's libc/private/bionic_tls.h. + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Value *SlotPtr = IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.getInt8Ty(), + IRB.CreateCall(ThreadPointerFunc), 0x30), + Ty->getPointerTo(0)); + return SlotPtr; + } + if (ThreadPtrGlobal) + return ThreadPtrGlobal; + + + return nullptr; +} + +void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { + if (!Mapping.InTls) { + LocalDynamicShadow = getDynamicShadowNonTls(IRB); + return; + } + + if (!WithFrameRecord && TargetTriple.isAndroid()) { + LocalDynamicShadow = getDynamicShadowIfunc(IRB); + return; + } + + Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy); + assert(SlotPtr); + + Instruction *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); + + Function *F = IRB.GetInsertBlock()->getParent(); + if (F->getFnAttribute("hwasan-abi").getValueAsString() == "interceptor") { + Value *ThreadLongEqZero = + IRB.CreateICmpEQ(ThreadLong, ConstantInt::get(IntptrTy, 0)); + auto *Br = cast<BranchInst>(SplitBlockAndInsertIfThen( + ThreadLongEqZero, cast<Instruction>(ThreadLongEqZero)->getNextNode(), + false, MDBuilder(*C).createBranchWeights(1, 100000))); + + IRB.SetInsertPoint(Br); + // FIXME: This should call a new runtime function with a custom calling + // convention to avoid needing to spill all arguments here. + IRB.CreateCall(HwasanThreadEnterFunc); + LoadInst *ReloadThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); + + IRB.SetInsertPoint(&*Br->getSuccessor(0)->begin()); + PHINode *ThreadLongPhi = IRB.CreatePHI(IntptrTy, 2); + ThreadLongPhi->addIncoming(ThreadLong, ThreadLong->getParent()); + ThreadLongPhi->addIncoming(ReloadThreadLong, ReloadThreadLong->getParent()); + ThreadLong = ThreadLongPhi; + } + + // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI. + Value *ThreadLongMaybeUntagged = + TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong); + + if (WithFrameRecord) { + StackBaseTag = IRB.CreateAShr(ThreadLong, 3); + + // Prepare ring buffer data. 
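+    // Each frame record is a single 64-bit slot in the per-thread ring
+    // buffer; the PC/SP mix stored below is what allows the runtime to
+    // reconstruct an approximate stack for use-after-return reports.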
+ Value *PC; + if (TargetTriple.getArch() == Triple::aarch64) + PC = readRegister(IRB, "pc"); + else + PC = IRB.CreatePtrToInt(F, IntptrTy); + Module *M = F->getParent(); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); + Value *SP = IRB.CreatePtrToInt( + IRB.CreateCall(GetStackPointerFn, + {Constant::getNullValue(IRB.getInt32Ty())}), + IntptrTy); + // Mix SP and PC. + // Assumptions: + // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero) + // SP is 0xsssssssssssSSSS0 (4 lower bits are zero) + // We only really need ~20 lower non-zero bits (SSSS), so we mix like this: + // 0xSSSSPPPPPPPPPPPP + SP = IRB.CreateShl(SP, 44); + + // Store data to ring buffer. + Value *RecordPtr = + IRB.CreateIntToPtr(ThreadLongMaybeUntagged, IntptrTy->getPointerTo(0)); + IRB.CreateStore(IRB.CreateOr(PC, SP), RecordPtr); + + // Update the ring buffer. Top byte of ThreadLong defines the size of the + // buffer in pages, it must be a power of two, and the start of the buffer + // must be aligned by twice that much. Therefore wrap around of the ring + // buffer is simply Addr &= ~((ThreadLong >> 56) << 12). + // The use of AShr instead of LShr is due to + // https://bugs.llvm.org/show_bug.cgi?id=39030 + // Runtime library makes sure not to use the highest bit. + Value *WrapMask = IRB.CreateXor( + IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true), + ConstantInt::get(IntptrTy, (uint64_t)-1)); + Value *ThreadLongNew = IRB.CreateAnd( + IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask); + IRB.CreateStore(ThreadLongNew, SlotPtr); + } + + // Get shadow base address by aligning RecordPtr up. + // Note: this is not correct if the pointer is already aligned. + // Runtime library will make sure this never happens. + LocalDynamicShadow = IRB.CreateAdd( + IRB.CreateOr( + ThreadLongMaybeUntagged, + ConstantInt::get(IntptrTy, (1ULL << kShadowBaseAlignment) - 1)), + ConstantInt::get(IntptrTy, 1), "hwasan.shadow"); + LocalDynamicShadow = IRB.CreateIntToPtr(LocalDynamicShadow, Int8PtrTy); +} + +Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) { + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ReadRegister = + Intrinsic::getDeclaration(M, Intrinsic::read_register, IntptrTy); + MDNode *MD = MDNode::get(*C, {MDString::get(*C, Name)}); + Value *Args[] = {MetadataAsValue::get(*C, MD)}; + return IRB.CreateCall(ReadRegister, Args); +} + +bool HWAddressSanitizer::instrumentLandingPads( + SmallVectorImpl<Instruction *> &LandingPadVec) { + for (auto *LP : LandingPadVec) { + IRBuilder<> IRB(LP->getNextNode()); + IRB.CreateCall( + HWAsanHandleVfork, + {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp" + : "sp")}); + } + return true; +} + +bool HWAddressSanitizer::instrumentStack( + SmallVectorImpl<AllocaInst *> &Allocas, + DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap, + SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) { + // Ideally, we want to calculate tagged stack base pointer, and rewrite all + // alloca addresses using that. Unfortunately, offsets are not known yet + // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a + // temp, shift-OR it into each alloca address and xor with the retag mask. + // This generates one extra instruction per alloca use. 
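+  // For example (a sketch): for alloca #1 with stack base tag T, we emit
+  //   %tag = xor T, 128                       ; RetagMask(1) == 128
+  //   %a.hwasan = inttoptr (or (ptrtoint %a), shl %tag, 56)
+  // and every use of %a other than the ptrtoint itself is redirected to
+  // %a.hwasan.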
+ for (unsigned N = 0; N < Allocas.size(); ++N) { + auto *AI = Allocas[N]; + IRBuilder<> IRB(AI->getNextNode()); + + // Replace uses of the alloca with tagged address. + Value *Tag = getAllocaTag(IRB, StackTag, AI, N); + Value *AILong = IRB.CreatePointerCast(AI, IntptrTy); + Value *Replacement = tagPointer(IRB, AI->getType(), AILong, Tag); + std::string Name = + AI->hasName() ? AI->getName().str() : "alloca." + itostr(N); + Replacement->setName(Name + ".hwasan"); + + AI->replaceUsesWithIf(Replacement, + [AILong](Use &U) { return U.getUser() != AILong; }); + + for (auto *DDI : AllocaDeclareMap.lookup(AI)) { + DIExpression *OldExpr = DDI->getExpression(); + DIExpression *NewExpr = DIExpression::append( + OldExpr, {dwarf::DW_OP_LLVM_tag_offset, RetagMask(N)}); + DDI->setArgOperand(2, MetadataAsValue::get(*C, NewExpr)); + } + + size_t Size = getAllocaSizeInBytes(*AI); + tagAlloca(IRB, AI, Tag, Size); + + for (auto RI : RetVec) { + IRB.SetInsertPoint(RI); + + // Re-tag alloca memory with the special UAR tag. + Value *Tag = getUARTag(IRB, StackTag); + tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment())); + } + } + + return true; +} + +bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { + return (AI.getAllocatedType()->isSized() && + // FIXME: instrument dynamic allocas, too + AI.isStaticAlloca() && + // alloca() may be called with 0 size, ignore it. + getAllocaSizeInBytes(AI) > 0 && + // We are only interested in allocas not promotable to registers. + // Promotable allocas are common under -O0. + !isAllocaPromotable(&AI) && + // inalloca allocas are not treated as static, and we don't want + // dynamic alloca instrumentation for them as well. + !AI.isUsedWithInAlloca() && + // swifterror allocas are register promoted by ISel + !AI.isSwiftError()); +} + +bool HWAddressSanitizer::sanitizeFunction(Function &F) { + if (&F == HwasanCtorFunction) + return false; + + if (!F.hasFnAttribute(Attribute::SanitizeHWAddress)) + return false; + + LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n"); + + SmallVector<Instruction*, 16> ToInstrument; + SmallVector<AllocaInst*, 8> AllocasToInstrument; + SmallVector<Instruction*, 8> RetVec; + SmallVector<Instruction*, 8> LandingPadVec; + DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> AllocaDeclareMap; + for (auto &BB : F) { + for (auto &Inst : BB) { + if (ClInstrumentStack) + if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) { + if (isInterestingAlloca(*AI)) + AllocasToInstrument.push_back(AI); + continue; + } + + if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) || + isa<CleanupReturnInst>(Inst)) + RetVec.push_back(&Inst); + + if (auto *DDI = dyn_cast<DbgDeclareInst>(&Inst)) + if (auto *Alloca = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) + AllocaDeclareMap[Alloca].push_back(DDI); + + if (InstrumentLandingPads && isa<LandingPadInst>(Inst)) + LandingPadVec.push_back(&Inst); + + Value *MaybeMask = nullptr; + bool IsWrite; + unsigned Alignment; + uint64_t TypeSize; + Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize, + &Alignment, &MaybeMask); + if (Addr || isa<MemIntrinsic>(Inst)) + ToInstrument.push_back(&Inst); + } + } + + initializeCallbacks(*F.getParent()); + + if (!LandingPadVec.empty()) + instrumentLandingPads(LandingPadVec); + + if (AllocasToInstrument.empty() && F.hasPersonalityFn() && + F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) { + // __hwasan_personality_thunk is a no-op for functions without an + // instrumented stack, so we can drop it. 
+ F.setPersonalityFn(nullptr); + } + + if (AllocasToInstrument.empty() && ToInstrument.empty()) + return false; + + assert(!LocalDynamicShadow); + + Instruction *InsertPt = &*F.getEntryBlock().begin(); + IRBuilder<> EntryIRB(InsertPt); + emitPrologue(EntryIRB, + /*WithFrameRecord*/ ClRecordStackHistory && + !AllocasToInstrument.empty()); + + bool Changed = false; + if (!AllocasToInstrument.empty()) { + Value *StackTag = + ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB); + Changed |= instrumentStack(AllocasToInstrument, AllocaDeclareMap, RetVec, + StackTag); + } + + // Pad and align each of the allocas that we instrumented to stop small + // uninteresting allocas from hiding in instrumented alloca's padding and so + // that we have enough space to store real tags for short granules. + DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap; + for (AllocaInst *AI : AllocasToInstrument) { + uint64_t Size = getAllocaSizeInBytes(*AI); + uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + AI->setAlignment( + MaybeAlign(std::max(AI->getAlignment(), Mapping.getObjectAlignment()))); + if (Size != AlignedSize) { + Type *AllocatedType = AI->getAllocatedType(); + if (AI->isArrayAllocation()) { + uint64_t ArraySize = + cast<ConstantInt>(AI->getArraySize())->getZExtValue(); + AllocatedType = ArrayType::get(AllocatedType, ArraySize); + } + Type *TypeWithPadding = StructType::get( + AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size)); + auto *NewAI = new AllocaInst( + TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI); + NewAI->takeName(AI); + NewAI->setAlignment(MaybeAlign(AI->getAlignment())); + NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca()); + NewAI->setSwiftError(AI->isSwiftError()); + NewAI->copyMetadata(*AI); + auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI); + AI->replaceAllUsesWith(Bitcast); + AllocaToPaddedAllocaMap[AI] = NewAI; + } + } + + if (!AllocaToPaddedAllocaMap.empty()) { + for (auto &BB : F) + for (auto &Inst : BB) + if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst)) + if (auto *AI = + dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation())) + if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI)) + DVI->setArgOperand( + 0, MetadataAsValue::get(*C, LocalAsMetadata::get(NewAI))); + for (auto &P : AllocaToPaddedAllocaMap) + P.first->eraseFromParent(); + } + + // If we split the entry block, move any allocas that were originally in the + // entry block back into the entry block so that they aren't treated as + // dynamic allocas. + if (EntryIRB.GetInsertBlock() != &F.getEntryBlock()) { + InsertPt = &*F.getEntryBlock().begin(); + for (auto II = EntryIRB.GetInsertBlock()->begin(), + IE = EntryIRB.GetInsertBlock()->end(); + II != IE;) { + Instruction *I = &*II++; + if (auto *AI = dyn_cast<AllocaInst>(I)) + if (isa<ConstantInt>(AI->getArraySize())) + I->moveBefore(InsertPt); + } + } + + for (auto Inst : ToInstrument) + Changed |= instrumentMemAccess(Inst); + + LocalDynamicShadow = nullptr; + StackBaseTag = nullptr; + + return Changed; +} + +void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) { + Constant *Initializer = GV->getInitializer(); + uint64_t SizeInBytes = + M.getDataLayout().getTypeAllocSize(Initializer->getType()); + uint64_t NewSize = alignTo(SizeInBytes, Mapping.getObjectAlignment()); + if (SizeInBytes != NewSize) { + // Pad the initializer out to the next multiple of 16 bytes and add the + // required short granule tag. 
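+    // E.g. a 13-byte global grows to 16 bytes: three bytes of zero padding,
+    // with the final byte set to the tag so that the short granule check
+    // (which reads the last byte of the granule) passes for in-bounds
+    // accesses.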
+ std::vector<uint8_t> Init(NewSize - SizeInBytes, 0); + Init.back() = Tag; + Constant *Padding = ConstantDataArray::get(*C, Init); + Initializer = ConstantStruct::getAnon({Initializer, Padding}); + } + + auto *NewGV = new GlobalVariable(M, Initializer->getType(), GV->isConstant(), + GlobalValue::ExternalLinkage, Initializer, + GV->getName() + ".hwasan"); + NewGV->copyAttributesFrom(GV); + NewGV->setLinkage(GlobalValue::PrivateLinkage); + NewGV->copyMetadata(GV, 0); + NewGV->setAlignment( + MaybeAlign(std::max(GV->getAlignment(), Mapping.getObjectAlignment()))); + + // It is invalid to ICF two globals that have different tags. In the case + // where the size of the global is a multiple of the tag granularity the + // contents of the globals may be the same but the tags (i.e. symbol values) + // may be different, and the symbols are not considered during ICF. In the + // case where the size is not a multiple of the granularity, the short granule + // tags would discriminate two globals with different tags, but there would + // otherwise be nothing stopping such a global from being incorrectly ICF'd + // with an uninstrumented (i.e. tag 0) global that happened to have the short + // granule tag in the last byte. + NewGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None); + + // Descriptor format (assuming little-endian): + // bytes 0-3: relative address of global + // bytes 4-6: size of global (16MB ought to be enough for anyone, but in case + // it isn't, we create multiple descriptors) + // byte 7: tag + auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty); + const uint64_t MaxDescriptorSize = 0xfffff0; + for (uint64_t DescriptorPos = 0; DescriptorPos < SizeInBytes; + DescriptorPos += MaxDescriptorSize) { + auto *Descriptor = + new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage, + nullptr, GV->getName() + ".hwasan.descriptor"); + auto *GVRelPtr = ConstantExpr::getTrunc( + ConstantExpr::getAdd( + ConstantExpr::getSub( + ConstantExpr::getPtrToInt(NewGV, Int64Ty), + ConstantExpr::getPtrToInt(Descriptor, Int64Ty)), + ConstantInt::get(Int64Ty, DescriptorPos)), + Int32Ty); + uint32_t Size = std::min(SizeInBytes - DescriptorPos, MaxDescriptorSize); + auto *SizeAndTag = ConstantInt::get(Int32Ty, Size | (uint32_t(Tag) << 24)); + Descriptor->setComdat(NewGV->getComdat()); + Descriptor->setInitializer(ConstantStruct::getAnon({GVRelPtr, SizeAndTag})); + Descriptor->setSection("hwasan_globals"); + Descriptor->setMetadata(LLVMContext::MD_associated, + MDNode::get(*C, ValueAsMetadata::get(NewGV))); + appendToCompilerUsed(M, Descriptor); + } + + Constant *Aliasee = ConstantExpr::getIntToPtr( + ConstantExpr::getAdd( + ConstantExpr::getPtrToInt(NewGV, Int64Ty), + ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift)), + GV->getType()); + auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(), + GV->getLinkage(), "", Aliasee, &M); + Alias->setVisibility(GV->getVisibility()); + Alias->takeName(GV); + GV->replaceAllUsesWith(Alias); + GV->eraseFromParent(); +} + +void HWAddressSanitizer::instrumentGlobals() { + // Start by creating a note that contains pointers to the list of global + // descriptors. Adding a note to the output file will cause the linker to + // create a PT_NOTE program header pointing to the note that we can use to + // find the descriptor list starting from the program headers. A function + // provided by the runtime initializes the shadow memory for the globals by + // accessing the descriptor list via the note. 
The dynamic loader needs to + // call this function whenever a library is loaded. + // + // The reason why we use a note for this instead of a more conventional + // approach of having a global constructor pass a descriptor list pointer to + // the runtime is because of an order of initialization problem. With + // constructors we can encounter the following problematic scenario: + // + // 1) library A depends on library B and also interposes one of B's symbols + // 2) B's constructors are called before A's (as required for correctness) + // 3) during construction, B accesses one of its "own" globals (actually + // interposed by A) and triggers a HWASAN failure due to the initialization + // for A not having happened yet + // + // Even without interposition it is possible to run into similar situations in + // cases where two libraries mutually depend on each other. + // + // We only need one note per binary, so put everything for the note in a + // comdat. + Comdat *NoteComdat = M.getOrInsertComdat(kHwasanNoteName); + + Type *Int8Arr0Ty = ArrayType::get(Int8Ty, 0); + auto Start = + new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage, + nullptr, "__start_hwasan_globals"); + Start->setVisibility(GlobalValue::HiddenVisibility); + Start->setDSOLocal(true); + auto Stop = + new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage, + nullptr, "__stop_hwasan_globals"); + Stop->setVisibility(GlobalValue::HiddenVisibility); + Stop->setDSOLocal(true); + + // Null-terminated so actually 8 bytes, which are required in order to align + // the note properly. + auto *Name = ConstantDataArray::get(*C, "LLVM\0\0\0"); + + auto *NoteTy = StructType::get(Int32Ty, Int32Ty, Int32Ty, Name->getType(), + Int32Ty, Int32Ty); + auto *Note = + new GlobalVariable(M, NoteTy, /*isConstantGlobal=*/true, + GlobalValue::PrivateLinkage, nullptr, kHwasanNoteName); + Note->setSection(".note.hwasan.globals"); + Note->setComdat(NoteComdat); + Note->setAlignment(Align(4)); + Note->setDSOLocal(true); + + // The pointers in the note need to be relative so that the note ends up being + // placed in rodata, which is the standard location for notes. + auto CreateRelPtr = [&](Constant *Ptr) { + return ConstantExpr::getTrunc( + ConstantExpr::getSub(ConstantExpr::getPtrToInt(Ptr, Int64Ty), + ConstantExpr::getPtrToInt(Note, Int64Ty)), + Int32Ty); + }; + Note->setInitializer(ConstantStruct::getAnon( + {ConstantInt::get(Int32Ty, 8), // n_namesz + ConstantInt::get(Int32Ty, 8), // n_descsz + ConstantInt::get(Int32Ty, ELF::NT_LLVM_HWASAN_GLOBALS), // n_type + Name, CreateRelPtr(Start), CreateRelPtr(Stop)})); + appendToCompilerUsed(M, Note); + + // Create a zero-length global in hwasan_globals so that the linker will + // always create start and stop symbols. + auto Dummy = new GlobalVariable( + M, Int8Arr0Ty, /*isConstantGlobal*/ true, GlobalVariable::PrivateLinkage, + Constant::getNullValue(Int8Arr0Ty), "hwasan.dummy.global"); + Dummy->setSection("hwasan_globals"); + Dummy->setComdat(NoteComdat); + Dummy->setMetadata(LLVMContext::MD_associated, + MDNode::get(*C, ValueAsMetadata::get(Note))); + appendToCompilerUsed(M, Dummy); + + std::vector<GlobalVariable *> Globals; + for (GlobalVariable &GV : M.globals()) { + if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") || + GV.isThreadLocal()) + continue; + + // Common symbols can't have aliases point to them, so they can't be tagged. 
+ if (GV.hasCommonLinkage()) + continue; + + // Globals with custom sections may be used in __start_/__stop_ enumeration, + // which would be broken both by adding tags and potentially by the extra + // padding/alignment that we insert. + if (GV.hasSection()) + continue; + + Globals.push_back(&GV); + } + + MD5 Hasher; + Hasher.update(M.getSourceFileName()); + MD5::MD5Result Hash; + Hasher.final(Hash); + uint8_t Tag = Hash[0]; + + for (GlobalVariable *GV : Globals) { + // Skip tag 0 in order to avoid collisions with untagged memory. + if (Tag == 0) + Tag = 1; + instrumentGlobal(GV, Tag++); + } +} + +void HWAddressSanitizer::instrumentPersonalityFunctions() { + // We need to untag stack frames as we unwind past them. That is the job of + // the personality function wrapper, which either wraps an existing + // personality function or acts as a personality function on its own. Each + // function that has a personality function or that can be unwound past has + // its personality function changed to a thunk that calls the personality + // function wrapper in the runtime. + MapVector<Constant *, std::vector<Function *>> PersonalityFns; + for (Function &F : M) { + if (F.isDeclaration() || !F.hasFnAttribute(Attribute::SanitizeHWAddress)) + continue; + + if (F.hasPersonalityFn()) { + PersonalityFns[F.getPersonalityFn()->stripPointerCasts()].push_back(&F); + } else if (!F.hasFnAttribute(Attribute::NoUnwind)) { + PersonalityFns[nullptr].push_back(&F); + } + } + + if (PersonalityFns.empty()) + return; + + FunctionCallee HwasanPersonalityWrapper = M.getOrInsertFunction( + "__hwasan_personality_wrapper", Int32Ty, Int32Ty, Int32Ty, Int64Ty, + Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy); + FunctionCallee UnwindGetGR = M.getOrInsertFunction("_Unwind_GetGR", VoidTy); + FunctionCallee UnwindGetCFA = M.getOrInsertFunction("_Unwind_GetCFA", VoidTy); + + for (auto &P : PersonalityFns) { + std::string ThunkName = kHwasanPersonalityThunkName; + if (P.first) + ThunkName += ("." + P.first->getName()).str(); + FunctionType *ThunkFnTy = FunctionType::get( + Int32Ty, {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int8PtrTy}, false); + bool IsLocal = P.first && (!isa<GlobalValue>(P.first) || + cast<GlobalValue>(P.first)->hasLocalLinkage()); + auto *ThunkFn = Function::Create(ThunkFnTy, + IsLocal ? GlobalValue::InternalLinkage + : GlobalValue::LinkOnceODRLinkage, + ThunkName, &M); + if (!IsLocal) { + ThunkFn->setVisibility(GlobalValue::HiddenVisibility); + ThunkFn->setComdat(M.getOrInsertComdat(ThunkName)); + } + + auto *BB = BasicBlock::Create(*C, "entry", ThunkFn); + IRBuilder<> IRB(BB); + CallInst *WrapperCall = IRB.CreateCall( + HwasanPersonalityWrapper, + {ThunkFn->getArg(0), ThunkFn->getArg(1), ThunkFn->getArg(2), + ThunkFn->getArg(3), ThunkFn->getArg(4), + P.first ? 
IRB.CreateBitCast(P.first, Int8PtrTy) + : Constant::getNullValue(Int8PtrTy), + IRB.CreateBitCast(UnwindGetGR.getCallee(), Int8PtrTy), + IRB.CreateBitCast(UnwindGetCFA.getCallee(), Int8PtrTy)}); + WrapperCall->setTailCall(); + IRB.CreateRet(WrapperCall); + + for (Function *F : P.second) + F->setPersonalityFn(ThunkFn); + } +} + +void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) { + Scale = kDefaultShadowScale; + if (ClMappingOffset.getNumOccurrences() > 0) { + InGlobal = false; + InTls = false; + Offset = ClMappingOffset; + } else if (ClEnableKhwasan || ClInstrumentWithCalls) { + InGlobal = false; + InTls = false; + Offset = 0; + } else if (ClWithIfunc) { + InGlobal = true; + InTls = false; + Offset = kDynamicShadowSentinel; + } else if (ClWithTls) { + InGlobal = false; + InTls = true; + Offset = kDynamicShadowSentinel; + } else { + InGlobal = false; + InTls = false; + Offset = kDynamicShadowSentinel; + } +} diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp new file mode 100644 index 000000000000..74d6e76eceb6 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -0,0 +1,443 @@ +//===- IndirectCallPromotion.cpp - Optimizations based on value profiling -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the transformation that promotes indirect calls to +// conditional direct calls when the indirect-call value profile metadata is +// available. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-icall-prom"
+
+STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
+STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
+
+// Command line option to disable indirect-call promotion, defaulting to
+// false. For debug use only.
+static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
+                                cl::desc("Disable indirect call promotion"));
+
+// Set the cutoff value for the promotion. If the value is non-zero, we stop
+// the transformation once the total number of promotions reaches the cutoff
+// value.
+// For debug use only.
+static cl::opt<unsigned>
+    ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+              cl::desc("Max number of promotions for this compilation"));
+
+// If ICPCSSkip is non-zero, the first ICPCSSkip callsites will be skipped.
+// For debug use only.
+static cl::opt<unsigned>
+    ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+              cl::desc("Skip callsites up to this number for this compilation"));
+
+// Set if the pass is called in LTO optimization. The difference for LTO mode
+// is that the pass won't prefix the source module name to the internal linkage
+// symbols.
+static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
+                                cl::desc("Run indirect-call promotion in LTO "
+                                         "mode"));
+
+// Set if the pass is called in SamplePGO mode. The difference for SamplePGO
+// mode is that it will add prof metadata to the created direct call.
+static cl::opt<bool>
+    ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
+                     cl::desc("Run indirect-call promotion in SamplePGO mode"));
+
+// If the option is set to true, only call instructions will be considered
+// for transformation -- invoke instructions will be ignored.
+static cl::opt<bool>
+    ICPCallOnly("icp-call-only", cl::init(false), cl::Hidden,
+                cl::desc("Run indirect-call promotion for call instructions "
+                         "only"));
+
+// If the option is set to true, only invoke instructions will be considered
+// for transformation -- call instructions will be ignored.
+static cl::opt<bool> ICPInvokeOnly("icp-invoke-only", cl::init(false),
+                                   cl::Hidden,
+                                   cl::desc("Run indirect-call promotion for "
+                                            "invoke instructions only"));
+
+// Dump the function-level IR if the transformation happened in this
+// function. For debug use only.
+static cl::opt<bool>
+    ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
+                 cl::desc("Dump IR after transformation happens"));
+
+namespace {
+
+class PGOIndirectCallPromotionLegacyPass : public ModulePass {
+public:
+  static char ID;
+
+  PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
+      : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
+    initializePGOIndirectCallPromotionLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+  }
+
+  StringRef getPassName() const override { return "PGOIndirectCallPromotion"; }
+
+private:
+  bool runOnModule(Module &M) override;
+
+  // If this pass is called in LTO, we need to handle the PGOFuncName for the
+  // static variables specially due to LTO's internalization.
+  bool InLTO;
+
+  // If this pass is called in SamplePGO, we need to add the prof metadata to
+  // the promoted direct call.
+  bool SamplePGO;
+};
+
+} // end anonymous namespace
+
+char PGOIndirectCallPromotionLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
+                      "Use PGO instrumentation profile to promote indirect "
+                      "calls to direct calls.",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
+                    "Use PGO instrumentation profile to promote indirect "
+                    "calls to direct calls.",
+                    false, false)
+
+ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
+                                                           bool SamplePGO) {
+  return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
+}
+
+namespace {
+
+// The class for the main data structure used to promote indirect calls to
+// conditional direct calls.
+class ICallPromotionFunc {
+private:
+  Function &F;
+  Module *M;
+
+  // Symtab that maps indirect call profile values to function names and
+  // definitions.
+  InstrProfSymtab *Symtab;
+
+  bool SamplePGO;
+
+  OptimizationRemarkEmitter &ORE;
+
+  // A struct that records the direct target and its call count.
+  struct PromotionCandidate {
+    Function *TargetFunction;
+    uint64_t Count;
+
+    PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
+  };
+
+  // Check if the indirect-call call site should be promoted. Return the number
+  // of promotions. Inst is the candidate indirect call, ValueDataRef
+  // contains the array of value profile data for profiled targets,
+  // TotalCount is the total profiled count of call executions, and
+  // NumCandidates is the number of candidate entries in ValueDataRef.
+  std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
+      Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
+      uint64_t TotalCount, uint32_t NumCandidates);
+
+  // Promote a list of targets for one indirect-call callsite. Return
+  // the number of promotions.
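+  // Conceptually, each promotion rewrites an indirect call
+  //     %t = call %fp(...)
+  // into a guarded direct call (a sketch; the actual rewrite is done by
+  // promoteCallWithIfThenElse):
+  //     if (%fp == @target)          ; weights Count : TotalCount - Count
+  //       %t1 = call @target(...)
+  //     else
+  //       %t2 = call %fp(...)
+  //     %t = phi [%t1, %t2]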
+ uint32_t tryToPromote(Instruction *Inst, + const std::vector<PromotionCandidate> &Candidates, + uint64_t &TotalCount); + +public: + ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab, + bool SamplePGO, OptimizationRemarkEmitter &ORE) + : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO), ORE(ORE) {} + ICallPromotionFunc(const ICallPromotionFunc &) = delete; + ICallPromotionFunc &operator=(const ICallPromotionFunc &) = delete; + + bool processFunction(ProfileSummaryInfo *PSI); +}; + +} // end anonymous namespace + +// Indirect-call promotion heuristic. The direct targets are sorted based on +// the count. Stop at the first target that is not promoted. +std::vector<ICallPromotionFunc::PromotionCandidate> +ICallPromotionFunc::getPromotionCandidatesForCallSite( + Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef, + uint64_t TotalCount, uint32_t NumCandidates) { + std::vector<PromotionCandidate> Ret; + + LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst + << " Num_targets: " << ValueDataRef.size() + << " Num_candidates: " << NumCandidates << "\n"); + NumOfPGOICallsites++; + if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) { + LLVM_DEBUG(dbgs() << " Skip: User options.\n"); + return Ret; + } + + for (uint32_t I = 0; I < NumCandidates; I++) { + uint64_t Count = ValueDataRef[I].Count; + assert(Count <= TotalCount); + uint64_t Target = ValueDataRef[I].Value; + LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count + << " Target_func: " << Target << "\n"); + + if (ICPInvokeOnly && isa<CallInst>(Inst)) { + LLVM_DEBUG(dbgs() << " Not promote: User options.\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", Inst) + << " Not promote: User options"; + }); + break; + } + if (ICPCallOnly && isa<InvokeInst>(Inst)) { + LLVM_DEBUG(dbgs() << " Not promote: User option.\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", Inst) + << " Not promote: User options"; + }); + break; + } + if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) { + LLVM_DEBUG(dbgs() << " Not promote: Cutoff reached.\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", Inst) + << " Not promote: Cutoff reached"; + }); + break; + } + + Function *TargetFunction = Symtab->getFunction(Target); + if (TargetFunction == nullptr) { + LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", Inst) + << "Cannot promote indirect call: target with md5sum " + << ore::NV("target md5sum", Target) << " not found"; + }); + break; + } + + const char *Reason = nullptr; + if (!isLegalToPromote(CallSite(Inst), TargetFunction, &Reason)) { + using namespace ore; + + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", Inst) + << "Cannot promote indirect call to " + << NV("TargetFunction", TargetFunction) << " with count of " + << NV("Count", Count) << ": " << Reason; + }); + break; + } + + Ret.push_back(PromotionCandidate(TargetFunction, Count)); + TotalCount -= Count; + } + return Ret; +} + +Instruction *llvm::pgo::promoteIndirectCall(Instruction *Inst, + Function *DirectCallee, + uint64_t Count, uint64_t TotalCount, + bool AttachProfToDirectCall, + OptimizationRemarkEmitter *ORE) { + + uint64_t ElseCount = TotalCount - Count; + uint64_t MaxCount = (Count >= ElseCount ? 
Count : ElseCount); + uint64_t Scale = calculateCountScale(MaxCount); + MDBuilder MDB(Inst->getContext()); + MDNode *BranchWeights = MDB.createBranchWeights( + scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale)); + + Instruction *NewInst = + promoteCallWithIfThenElse(CallSite(Inst), DirectCallee, BranchWeights); + + if (AttachProfToDirectCall) { + MDBuilder MDB(NewInst->getContext()); + NewInst->setMetadata( + LLVMContext::MD_prof, + MDB.createBranchWeights({static_cast<uint32_t>(Count)})); + } + + using namespace ore; + + if (ORE) + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Promoted", Inst) + << "Promote indirect call to " << NV("DirectCallee", DirectCallee) + << " with count " << NV("Count", Count) << " out of " + << NV("TotalCount", TotalCount); + }); + return NewInst; +} + +// Promote indirect-call to conditional direct-call for one callsite. +uint32_t ICallPromotionFunc::tryToPromote( + Instruction *Inst, const std::vector<PromotionCandidate> &Candidates, + uint64_t &TotalCount) { + uint32_t NumPromoted = 0; + + for (auto &C : Candidates) { + uint64_t Count = C.Count; + pgo::promoteIndirectCall(Inst, C.TargetFunction, Count, TotalCount, + SamplePGO, &ORE); + assert(TotalCount >= Count); + TotalCount -= Count; + NumOfPGOICallPromotion++; + NumPromoted++; + } + return NumPromoted; +} + +// Traverse all the indirect-call callsite and get the value profile +// annotation to perform indirect-call promotion. +bool ICallPromotionFunc::processFunction(ProfileSummaryInfo *PSI) { + bool Changed = false; + ICallPromotionAnalysis ICallAnalysis; + for (auto &I : findIndirectCalls(F)) { + uint32_t NumVals, NumCandidates; + uint64_t TotalCount; + auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction( + I, NumVals, TotalCount, NumCandidates); + if (!NumCandidates || + (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount))) + continue; + auto PromotionCandidates = getPromotionCandidatesForCallSite( + I, ICallProfDataRef, TotalCount, NumCandidates); + uint32_t NumPromoted = tryToPromote(I, PromotionCandidates, TotalCount); + if (NumPromoted == 0) + continue; + + Changed = true; + // Adjust the MD.prof metadata. First delete the old one. + I->setMetadata(LLVMContext::MD_prof, nullptr); + // If all promoted, we don't need the MD.prof metadata. + if (TotalCount == 0 || NumPromoted == NumVals) + continue; + // Otherwise we need update with the un-promoted records back. + annotateValueSite(*M, *I, ICallProfDataRef.slice(NumPromoted), TotalCount, + IPVK_IndirectCallTarget, NumCandidates); + } + return Changed; +} + +// A wrapper function that does the actual work. 
+static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, + bool InLTO, bool SamplePGO, + ModuleAnalysisManager *AM = nullptr) { + if (DisableICP) + return false; + InstrProfSymtab Symtab; + if (Error E = Symtab.create(M, InLTO)) { + std::string SymtabFailure = toString(std::move(E)); + LLVM_DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n"); + (void)SymtabFailure; + return false; + } + bool Changed = false; + for (auto &F : M) { + if (F.isDeclaration() || F.hasOptNone()) + continue; + + std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; + OptimizationRemarkEmitter *ORE; + if (AM) { + auto &FAM = + AM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + } else { + OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F); + ORE = OwnedORE.get(); + } + + ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO, *ORE); + bool FuncChanged = ICallPromotion.processFunction(PSI); + if (ICPDUMPAFTER && FuncChanged) { + LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs())); + LLVM_DEBUG(dbgs() << "\n"); + } + Changed |= FuncChanged; + if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) { + LLVM_DEBUG(dbgs() << " Stop: Cutoff reached.\n"); + break; + } + } + return Changed; +} + +bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) { + ProfileSummaryInfo *PSI = + &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + + // Command-line option has the priority for InLTO. + return promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode, + SamplePGO | ICPSamplePGOMode); +} + +PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, + ModuleAnalysisManager &AM) { + ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); + + if (!promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode, + SamplePGO | ICPSamplePGOMode, &AM)) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp new file mode 100644 index 000000000000..93d3a8a14d5c --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp @@ -0,0 +1,212 @@ +//===- InstrOrderFile.cpp ---- Late IR instrumentation for order file ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Instrumentation/InstrOrderFile.h" +#include <fstream> +#include <map> +#include <mutex> +#include <set> +#include <sstream> + +using namespace llvm; +#define DEBUG_TYPE "instrorderfile" + +static cl::opt<std::string> ClOrderFileWriteMapping( + "orderfile-write-mapping", cl::init(""), + cl::desc( + "Dump functions and their MD5 hash to deobfuscate profile data"), + cl::Hidden); + +namespace { + +// We need a global bitmap to tell if a function is executed. We also +// need a global variable to save the order of functions. We can use a +// fixed-size buffer that saves the MD5 hash of the function. We need +// a global variable to save the index into the buffer. + +std::mutex MappingMutex; + +struct InstrOrderFile { +private: + GlobalVariable *OrderFileBuffer; + GlobalVariable *BufferIdx; + GlobalVariable *BitMap; + ArrayType *BufferTy; + ArrayType *MapTy; + +public: + InstrOrderFile() {} + + void createOrderFileData(Module &M) { + LLVMContext &Ctx = M.getContext(); + int NumFunctions = 0; + for (Function &F : M) { + if (!F.isDeclaration()) + NumFunctions++; + } + + BufferTy = + ArrayType::get(Type::getInt64Ty(Ctx), INSTR_ORDER_FILE_BUFFER_SIZE); + Type *IdxTy = Type::getInt32Ty(Ctx); + MapTy = ArrayType::get(Type::getInt8Ty(Ctx), NumFunctions); + + // Create the global variables. + std::string SymbolName = INSTR_PROF_ORDERFILE_BUFFER_NAME_STR; + OrderFileBuffer = new GlobalVariable(M, BufferTy, false, GlobalValue::LinkOnceODRLinkage, + Constant::getNullValue(BufferTy), SymbolName); + Triple TT = Triple(M.getTargetTriple()); + OrderFileBuffer->setSection( + getInstrProfSectionName(IPSK_orderfile, TT.getObjectFormat())); + + std::string IndexName = INSTR_PROF_ORDERFILE_BUFFER_IDX_NAME_STR; + BufferIdx = new GlobalVariable(M, IdxTy, false, GlobalValue::LinkOnceODRLinkage, + Constant::getNullValue(IdxTy), IndexName); + + std::string BitMapName = "bitmap_0"; + BitMap = new GlobalVariable(M, MapTy, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(MapTy), BitMapName); + } + + // Generate the code sequence in the entry block of each function to + // update the buffer. 
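+  // The emitted sequence is roughly (a sketch in pseudo-IR):
+  //   order_file_entry:
+  //     %flag = load i8, bitmap[FuncId]
+  //     store i8 1, bitmap[FuncId]
+  //     br (%flag == 0), order_file_set, orig_entry
+  //   order_file_set:
+  //     %idx = atomicrmw add %buffer_idx, 1
+  //     store i64 MD5(fn_name), buffer[%idx & mask]
+  //     br orig_entry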
+ void generateCodeSequence(Module &M, Function &F, int FuncId) { + if (!ClOrderFileWriteMapping.empty()) { + std::lock_guard<std::mutex> LogLock(MappingMutex); + std::error_code EC; + llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC, + llvm::sys::fs::OF_Append); + if (EC) { + report_fatal_error(Twine("Failed to open ") + ClOrderFileWriteMapping + + " to save mapping file for order file instrumentation\n"); + } else { + std::stringstream stream; + stream << std::hex << MD5Hash(F.getName()); + std::string singleLine = "MD5 " + stream.str() + " " + + std::string(F.getName()) + '\n'; + OS << singleLine; + } + } + + BasicBlock *OrigEntry = &F.getEntryBlock(); + + LLVMContext &Ctx = M.getContext(); + IntegerType *Int32Ty = Type::getInt32Ty(Ctx); + IntegerType *Int8Ty = Type::getInt8Ty(Ctx); + + // Create a new entry block for instrumentation. We will check the bitmap + // in this basic block. + BasicBlock *NewEntry = + BasicBlock::Create(M.getContext(), "order_file_entry", &F, OrigEntry); + IRBuilder<> entryB(NewEntry); + // Create a basic block for updating the circular buffer. + BasicBlock *UpdateOrderFileBB = + BasicBlock::Create(M.getContext(), "order_file_set", &F, OrigEntry); + IRBuilder<> updateB(UpdateOrderFileBB); + + // Check the bitmap, if it is already 1, do nothing. + // Otherwise, set the bit, grab the index, update the buffer. + Value *IdxFlags[] = {ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, FuncId)}; + Value *MapAddr = entryB.CreateGEP(MapTy, BitMap, IdxFlags, ""); + LoadInst *loadBitMap = entryB.CreateLoad(Int8Ty, MapAddr, ""); + entryB.CreateStore(ConstantInt::get(Int8Ty, 1), MapAddr); + Value *IsNotExecuted = + entryB.CreateICmpEQ(loadBitMap, ConstantInt::get(Int8Ty, 0)); + entryB.CreateCondBr(IsNotExecuted, UpdateOrderFileBB, OrigEntry); + + // Fill up UpdateOrderFileBB: grab the index, update the buffer! + Value *IdxVal = updateB.CreateAtomicRMW( + AtomicRMWInst::Add, BufferIdx, ConstantInt::get(Int32Ty, 1), + AtomicOrdering::SequentiallyConsistent); + // We need to wrap around the index to fit it inside the buffer. 
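+    // Since the buffer size is a power of two, the AND below is equivalent to
+    // a modulo: e.g. if INSTR_ORDER_FILE_BUFFER_SIZE were 2^17 entries, the
+    // mask would be 0x1FFFF and index 131072 would wrap back to slot 0.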
+ Value *WrappedIdx = updateB.CreateAnd( + IdxVal, ConstantInt::get(Int32Ty, INSTR_ORDER_FILE_BUFFER_MASK)); + Value *BufferGEPIdx[] = {ConstantInt::get(Int32Ty, 0), WrappedIdx}; + Value *BufferAddr = + updateB.CreateGEP(BufferTy, OrderFileBuffer, BufferGEPIdx, ""); + updateB.CreateStore(ConstantInt::get(Type::getInt64Ty(Ctx), MD5Hash(F.getName())), + BufferAddr); + updateB.CreateBr(OrigEntry); + } + + bool run(Module &M) { + createOrderFileData(M); + + int FuncId = 0; + for (Function &F : M) { + if (F.isDeclaration()) + continue; + generateCodeSequence(M, F, FuncId); + ++FuncId; + } + + return true; + } + +}; // End of InstrOrderFile struct + +class InstrOrderFileLegacyPass : public ModulePass { +public: + static char ID; + + InstrOrderFileLegacyPass() : ModulePass(ID) { + initializeInstrOrderFileLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // End anonymous namespace + +bool InstrOrderFileLegacyPass::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + return InstrOrderFile().run(M); +} + +PreservedAnalyses +InstrOrderFilePass::run(Module &M, ModuleAnalysisManager &AM) { + if (InstrOrderFile().run(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +INITIALIZE_PASS_BEGIN(InstrOrderFileLegacyPass, "instrorderfile", + "Instrumentation for Order File", false, false) +INITIALIZE_PASS_END(InstrOrderFileLegacyPass, "instrorderfile", + "Instrumentation for Order File", false, false) + +char InstrOrderFileLegacyPass::ID = 0; + +ModulePass *llvm::createInstrOrderFilePass() { + return new InstrOrderFileLegacyPass(); +} diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp new file mode 100644 index 000000000000..1f092a5f3103 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -0,0 +1,1048 @@ +//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling. +// It also builds the data structures and initialization code needed for +// updating execution counts and emitting the profile at runtime. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/InstrProfiling.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <string> + +using namespace llvm; + +#define DEBUG_TYPE "instrprof" + +// The start and end values of precise value profile range for memory +// intrinsic sizes +cl::opt<std::string> MemOPSizeRange( + "memop-size-range", + cl::desc("Set the range of size in memory intrinsic calls to be profiled " + "precisely, in a format of <start_val>:<end_val>"), + cl::init("")); + +// The value that considered to be large value in memory intrinsic. +cl::opt<unsigned> MemOPSizeLarge( + "memop-size-large", + cl::desc("Set large value thresthold in memory intrinsic size profiling. " + "Value of 0 disables the large value profiling."), + cl::init(8192)); + +namespace { + +cl::opt<bool> DoNameCompression("enable-name-compression", + cl::desc("Enable name string compression"), + cl::init(true)); + +cl::opt<bool> DoHashBasedCounterSplit( + "hash-based-counter-split", + cl::desc("Rename counter variable of a comdat function based on cfg hash"), + cl::init(true)); + +cl::opt<bool> ValueProfileStaticAlloc( + "vp-static-alloc", + cl::desc("Do static counter allocation for value profiler"), + cl::init(true)); + +cl::opt<double> NumCountersPerValueSite( + "vp-counters-per-site", + cl::desc("The average number of profile counters allocated " + "per value profiling site."), + // This is set to a very small value because in real programs, only + // a very small percentage of value sites have non-zero targets, e.g, 1/30. + // For those sites with non-zero profile, the average number of targets + // is usually smaller than 2. 
+    cl::init(1.0));
+
+cl::opt<bool> AtomicCounterUpdateAll(
+    "instrprof-atomic-counter-update-all", cl::ZeroOrMore,
+    cl::desc("Make all profile counter updates atomic (for testing only)"),
+    cl::init(false));
+
+cl::opt<bool> AtomicCounterUpdatePromoted(
+    "atomic-counter-update-promoted", cl::ZeroOrMore,
+    cl::desc("Do counter update using atomic fetch add "
+             "for promoted counters only"),
+    cl::init(false));
+
+// If the option is not specified, the default behavior about whether
+// counter promotion is done depends on how the instrumentation lowering
+// pipeline is set up, i.e., the default value of true for this option
+// does not mean the promotion will be done by default. Explicitly
+// setting this option can override the default behavior.
+cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore,
+                                 cl::desc("Do counter register promotion"),
+                                 cl::init(false));
+cl::opt<unsigned> MaxNumOfPromotionsPerLoop(
+    cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
+    cl::desc("Max number of counter promotions per loop to avoid "
+             "increasing register pressure too much"));
+
+// A debug option
+cl::opt<int>
+    MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
+                       cl::desc("Max number of allowed counter promotions"));
+
+cl::opt<unsigned> SpeculativeCounterPromotionMaxExiting(
+    cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
+    cl::desc("The max number of exiting blocks of a loop to allow "
+             "speculative counter promotion"));
+
+cl::opt<bool> SpeculativeCounterPromotionToLoop(
+    cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
+    cl::desc("When the option is false, if the target block is in a loop, "
+             "the promotion will be disallowed unless the promoted counter "
+             "update can be further/iteratively promoted into an acyclic "
+             "region."));
+
+cl::opt<bool> IterativeCounterPromotion(
+    cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
+    cl::desc("Allow counter promotion across the whole loop nest."));
+
+class InstrProfilingLegacyPass : public ModulePass {
+  InstrProfiling InstrProf;
+
+public:
+  static char ID;
+
+  InstrProfilingLegacyPass() : ModulePass(ID) {}
+  InstrProfilingLegacyPass(const InstrProfOptions &Options, bool IsCS = false)
+      : ModulePass(ID), InstrProf(Options, IsCS) {}
+
+  StringRef getPassName() const override {
+    return "Frontend instrumentation-based coverage lowering";
+  }
+
+  bool runOnModule(Module &M) override {
+    auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+    };
+    return InstrProf.run(M, GetTLI);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+  }
+};
+
+///
+/// A helper class to promote one counter RMW operation in the loop
+/// into a register update.
+///
+/// The RMW update for the counter will be sunk out of the loop after
+/// the transformation.
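+///
+/// As an illustrative sketch (hypothetical IR, not taken from a test): a
+/// per-iteration update in the loop body such as
+///   %pgocount = load i64, i64* @__profc_foo
+///   %inc = add i64 %pgocount, 1
+///   store i64 %inc, i64* @__profc_foo
+/// is rewritten to increment an SSA value, and each exit block gets a single
+/// load/add/store of the accumulated live-in count instead.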
+///
+class PGOCounterPromoterHelper : public LoadAndStorePromoter {
+public:
+  PGOCounterPromoterHelper(
+      Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
+      BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks,
+      ArrayRef<Instruction *> InsertPts,
+      DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+      LoopInfo &LI)
+      : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
+        InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
+    assert(isa<LoadInst>(L));
+    assert(isa<StoreInst>(S));
+    SSA.AddAvailableValue(PH, Init);
+  }
+
+  void doExtraRewritesBeforeFinalDeletion() override {
+    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+      BasicBlock *ExitBlock = ExitBlocks[i];
+      Instruction *InsertPos = InsertPts[i];
+      // Get the live-in value into the ExitBlock. If there are multiple
+      // predecessors, the value is defined by a PHI node in this
+      // block.
+      Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+      Value *Addr = cast<StoreInst>(Store)->getPointerOperand();
+      Type *Ty = LiveInValue->getType();
+      IRBuilder<> Builder(InsertPos);
+      if (AtomicCounterUpdatePromoted)
+        // An atomic update can currently only be promoted across the current
+        // loop, not the whole loop nest.
+        Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
+                                AtomicOrdering::SequentiallyConsistent);
+      else {
+        LoadInst *OldVal = Builder.CreateLoad(Ty, Addr, "pgocount.promoted");
+        auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
+        auto *NewStore = Builder.CreateStore(NewVal, Addr);
+
+        // Now update the parent loop's candidate list:
+        if (IterativeCounterPromotion) {
+          auto *TargetLoop = LI.getLoopFor(ExitBlock);
+          if (TargetLoop)
+            LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
+        }
+      }
+    }
+  }
+
+private:
+  Instruction *Store;
+  ArrayRef<BasicBlock *> ExitBlocks;
+  ArrayRef<Instruction *> InsertPts;
+  DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+  LoopInfo &LI;
+};
+
+/// A helper class to do register promotion for all profile counter
+/// updates in a loop.
+///
+class PGOCounterPromoter {
+public:
+  PGOCounterPromoter(
+      DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+      Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI)
+      : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
+        LI(LI), BFI(BFI) {
+
+    SmallVector<BasicBlock *, 8> LoopExitBlocks;
+    SmallPtrSet<BasicBlock *, 8> BlockSet;
+    L.getExitBlocks(LoopExitBlocks);
+
+    for (BasicBlock *ExitBlock : LoopExitBlocks) {
+      if (BlockSet.insert(ExitBlock).second) {
+        ExitBlocks.push_back(ExitBlock);
+        InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+      }
+    }
+  }
+
+  bool run(int64_t *NumPromoted) {
+    // Skip 'infinite' loops:
+    if (ExitBlocks.size() == 0)
+      return false;
+    unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
+    if (MaxProm == 0)
+      return false;
+
+    unsigned Promoted = 0;
+    for (auto &Cand : LoopToCandidates[&L]) {
+
+      SmallVector<PHINode *, 4> NewPHIs;
+      SSAUpdater SSA(&NewPHIs);
+      Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
+
+      // If BFI is set, we will use it to guide the promotions.
+      if (BFI) {
+        auto *BB = Cand.first->getParent();
+        auto InstrCount = BFI->getBlockProfileCount(BB);
+        if (!InstrCount)
+          continue;
+        auto PreheaderCount = BFI->getBlockProfileCount(L.getLoopPreheader());
+        // If the average loop trip count is not greater than 1.5, we skip
+        // promotion.
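+        // (PreheaderCount * 3 >= InstrCount * 2 below is the integer form of
+        // InstrCount / PreheaderCount <= 1.5; e.g. a loop entered 100 times
+        // whose counter block runs 150 times averages exactly 1.5 trips and
+        // is skipped.)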
+        if (PreheaderCount &&
+            (PreheaderCount.getValue() * 3) >= (InstrCount.getValue() * 2))
+          continue;
+      }
+
+      PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
+                                        L.getLoopPreheader(), ExitBlocks,
+                                        InsertPts, LoopToCandidates, LI);
+      Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second}));
+      Promoted++;
+      if (Promoted >= MaxProm)
+        break;
+
+      (*NumPromoted)++;
+      if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
+        break;
+    }
+
+    LLVM_DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
+                      << L.getLoopDepth() << ")\n");
+    return Promoted != 0;
+  }
+
+private:
+  bool allowSpeculativeCounterPromotion(Loop *LP) {
+    SmallVector<BasicBlock *, 8> ExitingBlocks;
+    L.getExitingBlocks(ExitingBlocks);
+    // Not considered speculative.
+    if (ExitingBlocks.size() == 1)
+      return true;
+    if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+      return false;
+    return true;
+  }
+
+  // Returns the max number of counter promotions for LP.
+  unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
+    // We can't insert into a catchswitch.
+    SmallVector<BasicBlock *, 8> LoopExitBlocks;
+    LP->getExitBlocks(LoopExitBlocks);
+    if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
+          return isa<CatchSwitchInst>(Exit->getTerminator());
+        }))
+      return 0;
+
+    if (!LP->hasDedicatedExits())
+      return 0;
+
+    BasicBlock *PH = LP->getLoopPreheader();
+    if (!PH)
+      return 0;
+
+    SmallVector<BasicBlock *, 8> ExitingBlocks;
+    LP->getExitingBlocks(ExitingBlocks);
+
+    // If BFI is set, we do more aggressive promotions based on BFI.
+    if (BFI)
+      return (unsigned)-1;
+
+    // Not considered speculative.
+    if (ExitingBlocks.size() == 1)
+      return MaxNumOfPromotionsPerLoop;
+
+    if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+      return 0;
+
+    // Whether the target block is in a loop does not matter:
+    if (SpeculativeCounterPromotionToLoop)
+      return MaxNumOfPromotionsPerLoop;
+
+    // Now check the target block:
+    unsigned MaxProm = MaxNumOfPromotionsPerLoop;
+    for (auto *TargetBlock : LoopExitBlocks) {
+      auto *TargetLoop = LI.getLoopFor(TargetBlock);
+      if (!TargetLoop)
+        continue;
+      unsigned MaxPromForTarget = getMaxNumOfPromotionsInLoop(TargetLoop);
+      unsigned PendingCandsInTarget = LoopToCandidates[TargetLoop].size();
+      MaxProm =
+          std::min(MaxProm, std::max(MaxPromForTarget, PendingCandsInTarget) -
+                                PendingCandsInTarget);
+    }
+    return MaxProm;
+  }
+
+  DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+  SmallVector<BasicBlock *, 8> ExitBlocks;
+  SmallVector<Instruction *, 8> InsertPts;
+  Loop &L;
+  LoopInfo &LI;
+  BlockFrequencyInfo *BFI;
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+    return FAM.getResult<TargetLibraryAnalysis>(F);
+  };
+  if (!run(M, GetTLI))
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
+
+char InstrProfilingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(
+    InstrProfilingLegacyPass, "instrprof",
+    "Frontend instrumentation-based coverage lowering.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+    InstrProfilingLegacyPass, "instrprof",
+    "Frontend instrumentation-based coverage lowering.", false, false)
+
+ModulePass *
+llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
+                                     bool IsCS) {
+  
return new InstrProfilingLegacyPass(Options, IsCS); +} + +static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) { + InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr); + if (Inc) + return Inc; + return dyn_cast<InstrProfIncrementInst>(Instr); +} + +bool InstrProfiling::lowerIntrinsics(Function *F) { + bool MadeChange = false; + PromotionCandidates.clear(); + for (BasicBlock &BB : *F) { + for (auto I = BB.begin(), E = BB.end(); I != E;) { + auto Instr = I++; + InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr); + if (Inc) { + lowerIncrement(Inc); + MadeChange = true; + } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { + lowerValueProfileInst(Ind); + MadeChange = true; + } + } + } + + if (!MadeChange) + return false; + + promoteCounterLoadStores(F); + return true; +} + +bool InstrProfiling::isCounterPromotionEnabled() const { + if (DoCounterPromotion.getNumOccurrences() > 0) + return DoCounterPromotion; + + return Options.DoCounterPromotion; +} + +void InstrProfiling::promoteCounterLoadStores(Function *F) { + if (!isCounterPromotionEnabled()) + return; + + DominatorTree DT(*F); + LoopInfo LI(DT); + DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates; + + std::unique_ptr<BlockFrequencyInfo> BFI; + if (Options.UseBFIInPromotion) { + std::unique_ptr<BranchProbabilityInfo> BPI; + BPI.reset(new BranchProbabilityInfo(*F, LI, &GetTLI(*F))); + BFI.reset(new BlockFrequencyInfo(*F, *BPI, LI)); + } + + for (const auto &LoadStore : PromotionCandidates) { + auto *CounterLoad = LoadStore.first; + auto *CounterStore = LoadStore.second; + BasicBlock *BB = CounterLoad->getParent(); + Loop *ParentLoop = LI.getLoopFor(BB); + if (!ParentLoop) + continue; + LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore); + } + + SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder(); + + // Do a post-order traversal of the loops so that counter updates can be + // iteratively hoisted outside the loop nest. + for (auto *Loop : llvm::reverse(Loops)) { + PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI, BFI.get()); + Promoter.run(&TotalCountersPromoted); + } +} + +/// Check if the module contains uses of any profiling intrinsics. +static bool containsProfilingIntrinsics(Module &M) { + if (auto *F = M.getFunction( + Intrinsic::getName(llvm::Intrinsic::instrprof_increment))) + if (!F->use_empty()) + return true; + if (auto *F = M.getFunction( + Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step))) + if (!F->use_empty()) + return true; + if (auto *F = M.getFunction( + Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile))) + if (!F->use_empty()) + return true; + return false; +} + +bool InstrProfiling::run( + Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) { + this->M = &M; + this->GetTLI = std::move(GetTLI); + NamesVar = nullptr; + NamesSize = 0; + ProfileDataMap.clear(); + UsedVars.clear(); + getMemOPSizeRangeFromOption(MemOPSizeRange, MemOPSizeRangeStart, + MemOPSizeRangeLast); + TT = Triple(M.getTargetTriple()); + + // Emit the runtime hook even if no counters are present. + bool MadeChange = emitRuntimeHook(); + + // Improve compile time by avoiding linear scans when there is no work. + GlobalVariable *CoverageNamesVar = + M.getNamedGlobal(getCoverageUnusedNamesVarName()); + if (!containsProfilingIntrinsics(M) && !CoverageNamesVar) + return MadeChange; + + // We did not know how many value sites there would be inside + // the instrumented function. 
The loop below counts
+  // the instrumented target value sites and records the number as a field
+  // in the profile data variable.
+  for (Function &F : M) {
+    InstrProfIncrementInst *FirstProfIncInst = nullptr;
+    for (BasicBlock &BB : F)
+      for (auto I = BB.begin(), E = BB.end(); I != E; I++)
+        if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
+          computeNumValueSiteCounts(Ind);
+        else if (FirstProfIncInst == nullptr)
+          FirstProfIncInst = dyn_cast<InstrProfIncrementInst>(I);
+
+    // Value profiling intrinsic lowering requires the per-function profile
+    // data variable to be created first.
+    if (FirstProfIncInst != nullptr)
+      static_cast<void>(getOrCreateRegionCounters(FirstProfIncInst));
+  }
+
+  for (Function &F : M)
+    MadeChange |= lowerIntrinsics(&F);
+
+  if (CoverageNamesVar) {
+    lowerCoverageData(CoverageNamesVar);
+    MadeChange = true;
+  }
+
+  if (!MadeChange)
+    return false;
+
+  emitVNodes();
+  emitNameData();
+  emitRegistration();
+  emitUses();
+  emitInitialization();
+  return true;
+}
+
+static FunctionCallee
+getOrInsertValueProfilingCall(Module &M, const TargetLibraryInfo &TLI,
+                              bool IsRange = false) {
+  LLVMContext &Ctx = M.getContext();
+  auto *ReturnTy = Type::getVoidTy(M.getContext());
+
+  AttributeList AL;
+  if (auto AK = TLI.getExtAttrForI32Param(false))
+    AL = AL.addParamAttribute(M.getContext(), 2, AK);
+
+  if (!IsRange) {
+    Type *ParamTypes[] = {
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
+    };
+    auto *ValueProfilingCallTy =
+        FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
+    return M.getOrInsertFunction(getInstrProfValueProfFuncName(),
+                                 ValueProfilingCallTy, AL);
+  } else {
+    Type *RangeParamTypes[] = {
+#define VALUE_RANGE_PROF 1
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
+#undef VALUE_RANGE_PROF
+    };
+    auto *ValueRangeProfilingCallTy =
+        FunctionType::get(ReturnTy, makeArrayRef(RangeParamTypes), false);
+    return M.getOrInsertFunction(getInstrProfValueRangeProfFuncName(),
+                                 ValueRangeProfilingCallTy, AL);
+  }
+}
+
+void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
+  GlobalVariable *Name = Ind->getName();
+  uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
+  uint64_t Index = Ind->getIndex()->getZExtValue();
+  auto It = ProfileDataMap.find(Name);
+  if (It == ProfileDataMap.end()) {
+    PerFunctionProfileData PD;
+    PD.NumValueSites[ValueKind] = Index + 1;
+    ProfileDataMap[Name] = PD;
+  } else if (It->second.NumValueSites[ValueKind] <= Index)
+    It->second.NumValueSites[ValueKind] = Index + 1;
+}
+
+void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
+  GlobalVariable *Name = Ind->getName();
+  auto It = ProfileDataMap.find(Name);
+  assert(It != ProfileDataMap.end() && It->second.DataVar &&
+         "value profiling detected in function with no counter increment");
+
+  GlobalVariable *DataVar = It->second.DataVar;
+  uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
+  uint64_t Index = Ind->getIndex()->getZExtValue();
+  for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind)
+    Index += It->second.NumValueSites[Kind];
+
+  IRBuilder<> Builder(Ind);
+  bool IsRange = (Ind->getValueKind()->getZExtValue() ==
+                  llvm::InstrProfValueKind::IPVK_MemOPSize);
+  CallInst *Call = nullptr;
+  auto *TLI = &GetTLI(*Ind->getFunction());
+  if (!IsRange) {
+    Value *Args[3] = {Ind->getTargetValue(),
+                      Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+                      
Builder.getInt32(Index)}; + Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args); + } else { + Value *Args[6] = { + Ind->getTargetValue(), + Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), + Builder.getInt32(Index), + Builder.getInt64(MemOPSizeRangeStart), + Builder.getInt64(MemOPSizeRangeLast), + Builder.getInt64(MemOPSizeLarge == 0 ? INT64_MIN : MemOPSizeLarge)}; + Call = + Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI, true), Args); + } + if (auto AK = TLI->getExtAttrForI32Param(false)) + Call->addParamAttr(2, AK); + Ind->replaceAllUsesWith(Call); + Ind->eraseFromParent(); +} + +void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { + GlobalVariable *Counters = getOrCreateRegionCounters(Inc); + + IRBuilder<> Builder(Inc); + uint64_t Index = Inc->getIndex()->getZExtValue(); + Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters->getValueType(), + Counters, 0, Index); + + if (Options.Atomic || AtomicCounterUpdateAll) { + Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(), + AtomicOrdering::Monotonic); + } else { + Value *IncStep = Inc->getStep(); + Value *Load = Builder.CreateLoad(IncStep->getType(), Addr, "pgocount"); + auto *Count = Builder.CreateAdd(Load, Inc->getStep()); + auto *Store = Builder.CreateStore(Count, Addr); + if (isCounterPromotionEnabled()) + PromotionCandidates.emplace_back(cast<Instruction>(Load), Store); + } + Inc->eraseFromParent(); +} + +void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) { + ConstantArray *Names = + cast<ConstantArray>(CoverageNamesVar->getInitializer()); + for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) { + Constant *NC = Names->getOperand(I); + Value *V = NC->stripPointerCasts(); + assert(isa<GlobalVariable>(V) && "Missing reference to function name"); + GlobalVariable *Name = cast<GlobalVariable>(V); + + Name->setLinkage(GlobalValue::PrivateLinkage); + ReferencedNames.push_back(Name); + NC->dropAllReferences(); + } + CoverageNamesVar->eraseFromParent(); +} + +/// Get the name of a profiling variable for a particular function. +static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) { + StringRef NamePrefix = getInstrProfNameVarPrefix(); + StringRef Name = Inc->getName()->getName().substr(NamePrefix.size()); + Function *F = Inc->getParent()->getParent(); + Module *M = F->getParent(); + if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) || + !canRenameComdatFunc(*F)) + return (Prefix + Name).str(); + uint64_t FuncHash = Inc->getHash()->getZExtValue(); + SmallVector<char, 24> HashPostfix; + if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix))) + return (Prefix + Name).str(); + return (Prefix + Name + "." + Twine(FuncHash)).str(); +} + +static inline bool shouldRecordFunctionAddr(Function *F) { + // Check the linkage + bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage(); + if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() && + !HasAvailableExternallyLinkage) + return true; + + // A function marked 'alwaysinline' with available_externally linkage can't + // have its address taken. Doing so would create an undefined external ref to + // the function, which would fail to link. + if (HasAvailableExternallyLinkage && + F->hasFnAttribute(Attribute::AlwaysInline)) + return false; + + // Prohibit function address recording if the function is both internal and + // COMDAT. This avoids the profile data variable referencing internal symbols + // in COMDAT. 
+  if (F->hasLocalLinkage() && F->hasComdat())
+    return false;
+
+  // Check for uses of this function other than direct calls or invokes to it.
+  // Inline virtual functions have linkOnceODR linkage. When a key method
+  // exists, the vtable will only be emitted in the TU where the key method
+  // is defined. In a TU where the vtable is not available, the function won't
+  // be 'addresstaken'. If its address is not recorded here, the profile data
+  // with the missing address may be picked by the linker, leading to missing
+  // indirect call target info.
+  return F->hasAddressTaken() || F->hasLinkOnceLinkage();
+}
+
+static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
+  // Don't do this for Darwin. compiler-rt uses linker magic.
+  if (TT.isOSDarwin())
+    return false;
+  // Use linker script magic to get data/cnts/name start/end.
+  if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() ||
+      TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() ||
+      TT.isOSWindows())
+    return false;
+
+  return true;
+}
+
+GlobalVariable *
+InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
+  GlobalVariable *NamePtr = Inc->getName();
+  auto It = ProfileDataMap.find(NamePtr);
+  PerFunctionProfileData PD;
+  if (It != ProfileDataMap.end()) {
+    if (It->second.RegionCounters)
+      return It->second.RegionCounters;
+    PD = It->second;
+  }
+
+  // Match the linkage and visibility of the name global. COFF supports using
+  // comdats with internal symbols, so do that if we can.
+  Function *Fn = Inc->getParent()->getParent();
+  GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage();
+  GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility();
+  if (TT.isOSBinFormatCOFF()) {
+    Linkage = GlobalValue::InternalLinkage;
+    Visibility = GlobalValue::DefaultVisibility;
+  }
+
+  // Move the name variable to the right section. Place it in a COMDAT group
+  // if the associated function is a COMDAT. This will make sure that only one
+  // copy of counters of the COMDAT function will be emitted after linking. Keep
+  // in mind that this pass may run before the inliner, so we need to create a
+  // new comdat group for the counters and profiling data. If we use the comdat
+  // of the parent function, that will result in relocations against discarded
+  // sections.
+  bool NeedComdat = needsComdatForCounter(*Fn, *M);
+  if (NeedComdat) {
+    if (TT.isOSBinFormatCOFF()) {
+      // For COFF, put the counters, data, and values each into their own
+      // comdats. We can't use a group because the Visual C++ linker will
+      // report duplicate symbol errors if there are multiple external symbols
+      // with the same name marked IMAGE_COMDAT_SELECT_ASSOCIATIVE.
+      Linkage = GlobalValue::LinkOnceODRLinkage;
+      Visibility = GlobalValue::HiddenVisibility;
+    }
+  }
+  auto MaybeSetComdat = [=](GlobalVariable *GV) {
+    if (NeedComdat)
+      GV->setComdat(M->getOrInsertComdat(GV->getName()));
+  };
+
+  uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+  LLVMContext &Ctx = M->getContext();
+  ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
+
+  // Create the counters variable.
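+  // A sketch of the emitted globals (assuming the usual "__profc_"/"__profd_"
+  // prefixes returned by the helpers used below; section names vary by object
+  // format), for a function foo with two counters:
+  //   @__profc_foo = ... global [2 x i64] zeroinitializer,
+  //                  section "__llvm_prf_cnts", align 8
+  //   @__profd_foo = ... global { ... }, section "__llvm_prf_data"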
+ auto *CounterPtr = + new GlobalVariable(*M, CounterTy, false, Linkage, + Constant::getNullValue(CounterTy), + getVarName(Inc, getInstrProfCountersVarPrefix())); + CounterPtr->setVisibility(Visibility); + CounterPtr->setSection( + getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat())); + CounterPtr->setAlignment(Align(8)); + MaybeSetComdat(CounterPtr); + CounterPtr->setLinkage(Linkage); + + auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); + // Allocate statically the array of pointers to value profile nodes for + // the current function. + Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy); + if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(TT)) { + uint64_t NS = 0; + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + NS += PD.NumValueSites[Kind]; + if (NS) { + ArrayType *ValuesTy = ArrayType::get(Type::getInt64Ty(Ctx), NS); + + auto *ValuesVar = + new GlobalVariable(*M, ValuesTy, false, Linkage, + Constant::getNullValue(ValuesTy), + getVarName(Inc, getInstrProfValuesVarPrefix())); + ValuesVar->setVisibility(Visibility); + ValuesVar->setSection( + getInstrProfSectionName(IPSK_vals, TT.getObjectFormat())); + ValuesVar->setAlignment(Align(8)); + MaybeSetComdat(ValuesVar); + ValuesPtrExpr = + ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); + } + } + + // Create data variable. + auto *Int16Ty = Type::getInt16Ty(Ctx); + auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1); + Type *DataTypes[] = { +#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, +#include "llvm/ProfileData/InstrProfData.inc" + }; + auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes)); + + Constant *FunctionAddr = shouldRecordFunctionAddr(Fn) + ? ConstantExpr::getBitCast(Fn, Int8PtrTy) + : ConstantPointerNull::get(Int8PtrTy); + + Constant *Int16ArrayVals[IPVK_Last + 1]; + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + + Constant *DataVals[] = { +#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init, +#include "llvm/ProfileData/InstrProfData.inc" + }; + auto *Data = new GlobalVariable(*M, DataTy, false, Linkage, + ConstantStruct::get(DataTy, DataVals), + getVarName(Inc, getInstrProfDataVarPrefix())); + Data->setVisibility(Visibility); + Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat())); + Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); + MaybeSetComdat(Data); + Data->setLinkage(Linkage); + + PD.RegionCounters = CounterPtr; + PD.DataVar = Data; + ProfileDataMap[NamePtr] = PD; + + // Mark the data variable as used so that it isn't stripped out. + UsedVars.push_back(Data); + // Now that the linkage set by the FE has been passed to the data and counter + // variables, reset Name variable's linkage and visibility to private so that + // it can be removed later by the compiler. + NamePtr->setLinkage(GlobalValue::PrivateLinkage); + // Collect the referenced names to be used by emitNameData. + ReferencedNames.push_back(NamePtr); + + return CounterPtr; +} + +void InstrProfiling::emitVNodes() { + if (!ValueProfileStaticAlloc) + return; + + // For now only support this on platforms that do + // not require runtime registration to discover + // named section start/end. 
+ if (needsRuntimeRegistrationOfSectionRange(TT)) + return; + + size_t TotalNS = 0; + for (auto &PD : ProfileDataMap) { + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + TotalNS += PD.second.NumValueSites[Kind]; + } + + if (!TotalNS) + return; + + uint64_t NumCounters = TotalNS * NumCountersPerValueSite; +// Heuristic for small programs with very few total value sites. +// The default value of vp-counters-per-site is chosen based on +// the observation that large apps usually have a low percentage +// of value sites that actually have any profile data, and thus +// the average number of counters per site is low. For small +// apps with very few sites, this may not be true. Bump up the +// number of counters in this case. +#define INSTR_PROF_MIN_VAL_COUNTS 10 + if (NumCounters < INSTR_PROF_MIN_VAL_COUNTS) + NumCounters = std::max(INSTR_PROF_MIN_VAL_COUNTS, (int)NumCounters * 2); + + auto &Ctx = M->getContext(); + Type *VNodeTypes[] = { +#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Init) LLVMType, +#include "llvm/ProfileData/InstrProfData.inc" + }; + auto *VNodeTy = StructType::get(Ctx, makeArrayRef(VNodeTypes)); + + ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters); + auto *VNodesVar = new GlobalVariable( + *M, VNodesTy, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName()); + VNodesVar->setSection( + getInstrProfSectionName(IPSK_vnodes, TT.getObjectFormat())); + UsedVars.push_back(VNodesVar); +} + +void InstrProfiling::emitNameData() { + std::string UncompressedData; + + if (ReferencedNames.empty()) + return; + + std::string CompressedNameStr; + if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr, + DoNameCompression)) { + report_fatal_error(toString(std::move(E)), false); + } + + auto &Ctx = M->getContext(); + auto *NamesVal = ConstantDataArray::getString( + Ctx, StringRef(CompressedNameStr), false); + NamesVar = new GlobalVariable(*M, NamesVal->getType(), true, + GlobalValue::PrivateLinkage, NamesVal, + getInstrProfNamesVarName()); + NamesSize = CompressedNameStr.size(); + NamesVar->setSection( + getInstrProfSectionName(IPSK_name, TT.getObjectFormat())); + // On COFF, it's important to reduce the alignment down to 1 to prevent the + // linker from inserting padding before the start of the names section or + // between names entries. + NamesVar->setAlignment(Align::None()); + UsedVars.push_back(NamesVar); + + for (auto *NamePtr : ReferencedNames) + NamePtr->eraseFromParent(); +} + +void InstrProfiling::emitRegistration() { + if (!needsRuntimeRegistrationOfSectionRange(TT)) + return; + + // Construct the function. 
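+  // A sketch of the function built below (names come from the
+  // getInstrProfReg*Name() helpers; signatures abbreviated):
+  //   void __llvm_profile_register_functions() {
+  //     __llvm_profile_register_function(&__profd_foo); // one call per var
+  //     __llvm_profile_register_names_function(Names, NamesSize);
+  //   }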
+  auto *VoidTy = Type::getVoidTy(M->getContext());
+  auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());
+  auto *Int64Ty = Type::getInt64Ty(M->getContext());
+  auto *RegisterFTy = FunctionType::get(VoidTy, false);
+  auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage,
+                                     getInstrProfRegFuncsName(), M);
+  RegisterF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  if (Options.NoRedZone)
+    RegisterF->addFnAttr(Attribute::NoRedZone);
+
+  auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);
+  auto *RuntimeRegisterF =
+      Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage,
+                       getInstrProfRegFuncName(), M);
+
+  IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));
+  for (Value *Data : UsedVars)
+    if (Data != NamesVar && !isa<Function>(Data))
+      IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
+
+  if (NamesVar) {
+    Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
+    auto *NamesRegisterTy =
+        FunctionType::get(VoidTy, makeArrayRef(ParamTypes), false);
+    auto *NamesRegisterF =
+        Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
+                         getInstrProfNamesRegFuncName(), M);
+    IRB.CreateCall(NamesRegisterF, {IRB.CreateBitCast(NamesVar, VoidPtrTy),
+                                    IRB.getInt64(NamesSize)});
+  }
+
+  IRB.CreateRetVoid();
+}
+
+bool InstrProfiling::emitRuntimeHook() {
+  // We expect the linker to be invoked with the -u<hook_var> flag on Linux,
+  // in which case there is no need to emit the user function.
+  if (TT.isOSLinux())
+    return false;
+
+  // If the module has provided its own runtime, we don't need to do anything.
+  if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
+    return false;
+
+  // Declare an external variable that will pull in the runtime initialization.
+  auto *Int32Ty = Type::getInt32Ty(M->getContext());
+  auto *Var =
+      new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+                         nullptr, getInstrProfRuntimeHookVarName());
+
+  // Make a function that uses it.
+  auto *User = Function::Create(FunctionType::get(Int32Ty, false),
+                                GlobalValue::LinkOnceODRLinkage,
+                                getInstrProfRuntimeHookVarUseFuncName(), M);
+  User->addFnAttr(Attribute::NoInline);
+  if (Options.NoRedZone)
+    User->addFnAttr(Attribute::NoRedZone);
+  User->setVisibility(GlobalValue::HiddenVisibility);
+  if (TT.supportsCOMDAT())
+    User->setComdat(M->getOrInsertComdat(User->getName()));
+
+  IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
+  auto *Load = IRB.CreateLoad(Int32Ty, Var);
+  IRB.CreateRet(Load);
+
+  // Mark the user variable as used so that it isn't stripped out.
+  UsedVars.push_back(User);
+  return true;
+}
+
+void InstrProfiling::emitUses() {
+  if (!UsedVars.empty())
+    appendToUsed(*M, UsedVars);
+}
+
+void InstrProfiling::emitInitialization() {
+  // Create the ProfileFileName variable. Don't do this for the
+  // context-sensitive instrumentation lowering: that lowering happens after
+  // LTO/ThinLTO linking. Pass PGOInstrumentationGenCreateVar should
+  // have already created the variable before LTO/ThinLTO linking.
+  if (!IsCS)
+    createProfileFileNameVar(*M, Options.InstrProfileOutput);
+  Function *RegisterF = M->getFunction(getInstrProfRegFuncsName());
+  if (!RegisterF)
+    return;
+
+  // Create the initialization function.
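+  // A sketch of the result: an internal, noinline initializer
+  //   void __llvm_profile_init() { __llvm_profile_register_functions(); }
+  // (the name comes from getInstrProfInitFuncName()) is appended to
+  // llvm.global_ctors at priority 0 via appendToGlobalCtors() below.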
+ auto *VoidTy = Type::getVoidTy(M->getContext()); + auto *F = Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::InternalLinkage, + getInstrProfInitFuncName(), M); + F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + F->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + F->addFnAttr(Attribute::NoRedZone); + + // Add the basic block and the necessary calls. + IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F)); + IRB.CreateCall(RegisterF, {}); + IRB.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); +} diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp new file mode 100644 index 000000000000..a6c2c9b464b6 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -0,0 +1,128 @@ +//===-- Instrumentation.cpp - TransformUtils Infrastructure ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the common initialization infrastructure for the +// Instrumentation library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm-c/Initialization.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" + +using namespace llvm; + +/// Moves I before IP. Returns new insert point. +static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) { + // If I is IP, move the insert point down. + if (I == IP) { + ++IP; + } else { + // Otherwise, move I before IP and return IP. + I->moveBefore(&*IP); + } + return IP; +} + +/// Instrumentation passes often insert conditional checks into entry blocks. +/// Call this function before splitting the entry block to move instructions +/// that must remain in the entry block up before the split point. Static +/// allocas and llvm.localescape calls, for example, must remain in the entry +/// block. +BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB, + BasicBlock::iterator IP) { + assert(&BB.getParent()->getEntryBlock() == &BB); + for (auto I = IP, E = BB.end(); I != E; ++I) { + bool KeepInEntry = false; + if (auto *AI = dyn_cast<AllocaInst>(I)) { + if (AI->isStaticAlloca()) + KeepInEntry = true; + } else if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == llvm::Intrinsic::localescape) + KeepInEntry = true; + } + if (KeepInEntry) + IP = moveBeforeInsertPoint(I, IP); + } + return IP; +} + +// Create a constant for Str so that we can pass it to the run-time lib. +GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str, + bool AllowMerging, + const char *NamePrefix) { + Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); + // We use private linkage for module-local strings. If they can be merged + // with another one, we set the unnamed_addr attribute. 
+  GlobalVariable *GV =
+      new GlobalVariable(M, StrConst->getType(), true,
+                         GlobalValue::PrivateLinkage, StrConst, NamePrefix);
+  if (AllowMerging)
+    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  GV->setAlignment(Align::None()); // Strings may not be merged w/o setting
+                                   // alignment explicitly.
+  return GV;
+}
+
+Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
+                                        const std::string &ModuleId) {
+  if (auto Comdat = F.getComdat()) return Comdat;
+  assert(F.hasName());
+  Module *M = F.getParent();
+  std::string Name = F.getName();
+
+  // Make a unique comdat name for internal linkage things on ELF. On COFF, the
+  // name of the comdat group identifies the leader symbol of the comdat group.
+  // The linkage of the leader symbol is considered during comdat resolution,
+  // and internal symbols with the same name from different objects will not be
+  // merged.
+  if (T.isOSBinFormatELF() && F.hasLocalLinkage()) {
+    if (ModuleId.empty())
+      return nullptr;
+    Name += ModuleId;
+  }
+
+  // Make a new comdat for the function. Use the "no duplicates" selection kind
+  // for non-weak symbols if the object file format supports it.
+  Comdat *C = M->getOrInsertComdat(Name);
+  if (T.isOSBinFormatCOFF() && !F.isWeakForLinker())
+    C->setSelectionKind(Comdat::NoDuplicates);
+  F.setComdat(C);
+  return C;
+}
+
+/// initializeInstrumentation - Initialize all passes in the Instrumentation
+/// library.
+void llvm::initializeInstrumentation(PassRegistry &Registry) {
+  initializeAddressSanitizerLegacyPassPass(Registry);
+  initializeModuleAddressSanitizerLegacyPassPass(Registry);
+  initializeBoundsCheckingLegacyPassPass(Registry);
+  initializeControlHeightReductionLegacyPassPass(Registry);
+  initializeGCOVProfilerLegacyPassPass(Registry);
+  initializePGOInstrumentationGenLegacyPassPass(Registry);
+  initializePGOInstrumentationUseLegacyPassPass(Registry);
+  initializePGOIndirectCallPromotionLegacyPassPass(Registry);
+  initializePGOMemOPSizeOptLegacyPassPass(Registry);
+  initializeInstrOrderFileLegacyPassPass(Registry);
+  initializeInstrProfilingLegacyPassPass(Registry);
+  initializeMemorySanitizerLegacyPassPass(Registry);
+  initializeHWAddressSanitizerLegacyPassPass(Registry);
+  initializeThreadSanitizerLegacyPassPass(Registry);
+  initializeModuleSanitizerCoverageLegacyPassPass(Registry);
+  initializeDataFlowSanitizerPass(Registry);
+}
+
+/// LLVMInitializeInstrumentation - C binding for
+/// initializeInstrumentation.
+void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) {
+  initializeInstrumentation(*unwrap(R));
+}
diff --git a/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
new file mode 100644
index 000000000000..892a6a26da91
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -0,0 +1,109 @@
+//===- llvm/Analysis/MaximumSpanningTree.h - Interface ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This module provides means for calculating a maximum spanning tree for a
+// given set of weighted edges. The type parameter T is the type of a node.
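+//
+// Example usage (a sketch; BasicBlock is the typical node type here):
+//   MaximumSpanningTree<BasicBlock>::EdgeWeights Edges;
+//   Edges.push_back({{&EntryBB, &LoopBB}, 10.0});
+//   Edges.push_back({{&LoopBB, &ExitBB}, 2.0});
+//   MaximumSpanningTree<BasicBlock> MST(Edges); // heavier edges chosen first
+//   for (auto E : MST) { /* edges selected for the tree */ }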
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H +#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H + +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/IR/BasicBlock.h" +#include <algorithm> +#include <vector> + +namespace llvm { + + /// MaximumSpanningTree - A MST implementation. + /// The type parameter T determines the type of the nodes of the graph. + template <typename T> + class MaximumSpanningTree { + public: + typedef std::pair<const T*, const T*> Edge; + typedef std::pair<Edge, double> EdgeWeight; + typedef std::vector<EdgeWeight> EdgeWeights; + protected: + typedef std::vector<Edge> MaxSpanTree; + + MaxSpanTree MST; + + private: + // A comparing class for comparing weighted edges. + struct EdgeWeightCompare { + static bool getBlockSize(const T *X) { + const BasicBlock *BB = dyn_cast_or_null<BasicBlock>(X); + return BB ? BB->size() : 0; + } + + bool operator()(EdgeWeight X, EdgeWeight Y) const { + if (X.second > Y.second) return true; + if (X.second < Y.second) return false; + + // Equal edge weights: break ties by comparing block sizes. + size_t XSizeA = getBlockSize(X.first.first); + size_t YSizeA = getBlockSize(Y.first.first); + if (XSizeA > YSizeA) return true; + if (XSizeA < YSizeA) return false; + + size_t XSizeB = getBlockSize(X.first.second); + size_t YSizeB = getBlockSize(Y.first.second); + if (XSizeB > YSizeB) return true; + if (XSizeB < YSizeB) return false; + + return false; + } + }; + + public: + static char ID; // Class identification, replacement for typeinfo + + /// MaximumSpanningTree() - Takes a vector of weighted edges and returns a + /// spanning tree. + MaximumSpanningTree(EdgeWeights &EdgeVector) { + llvm::stable_sort(EdgeVector, EdgeWeightCompare()); + + // Create spanning tree, Forest contains a special data structure + // that makes checking if two nodes are already in a common (sub-)tree + // fast and cheap. + EquivalenceClasses<const T*> Forest; + for (typename EdgeWeights::iterator EWi = EdgeVector.begin(), + EWe = EdgeVector.end(); EWi != EWe; ++EWi) { + Edge e = (*EWi).first; + + Forest.insert(e.first); + Forest.insert(e.second); + } + + // Iterate over the sorted edges, biggest first. + for (typename EdgeWeights::iterator EWi = EdgeVector.begin(), + EWe = EdgeVector.end(); EWi != EWe; ++EWi) { + Edge e = (*EWi).first; + + if (Forest.findLeader(e.first) != Forest.findLeader(e.second)) { + Forest.unionSets(e.first, e.second); + // So we know now that the edge is not already in a subtree, so we push + // the edge to the MST. + MST.push_back(e); + } + } + } + + typename MaxSpanTree::iterator begin() { + return MST.begin(); + } + + typename MaxSpanTree::iterator end() { + return MST.end(); + } + }; + +} // End llvm namespace + +#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp new file mode 100644 index 000000000000..69c9020e060b --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -0,0 +1,4602 @@ +//===- MemorySanitizer.cpp - detector of uninitialized reads --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file is a part of MemorySanitizer, a detector of uninitialized
+/// reads.
+///
+/// The algorithm of the tool is similar to Memcheck
+/// (http://goo.gl/QKbem). We associate a few shadow bits with every
+/// byte of the application memory, poison the shadow of the malloc-ed
+/// or alloca-ed memory, load the shadow bits on every memory read,
+/// propagate the shadow bits through some of the arithmetic
+/// instructions (including MOV), store the shadow bits on every memory
+/// write, report a bug on some other instructions (e.g. JMP) if the
+/// associated shadow is poisoned.
+///
+/// But there are differences too. The first and the major one:
+/// compiler instrumentation instead of binary instrumentation. This
+/// gives us much better register allocation, possible compiler
+/// optimizations and a fast start-up. But this also brings a major
+/// issue: msan needs to see all program events, including system
+/// calls and reads/writes in system libraries, so we either need to
+/// compile *everything* with msan or use a binary translation
+/// component (e.g. DynamoRIO) to instrument pre-built libraries.
+/// Another difference from Memcheck is that we use 8 shadow bits per
+/// byte of application memory and use a direct shadow mapping. This
+/// greatly simplifies the instrumentation code and avoids races on
+/// shadow updates (Memcheck is single-threaded so races are not a
+/// concern there. Memcheck uses 2 shadow bits per byte with a slow
+/// path storage that uses 8 bits per byte).
+///
+/// The default value of shadow is 0, which means "clean" (not poisoned).
+///
+/// Every module initializer should call __msan_init to ensure that the
+/// shadow memory is ready. On error, __msan_warning is called. Since
+/// parameters and return values may be passed via registers, we have a
+/// specialized thread-local shadow for return values
+/// (__msan_retval_tls) and parameters (__msan_param_tls).
+///
+/// Origin tracking.
+///
+/// MemorySanitizer can track origins (allocation points) of all uninitialized
+/// values. This behavior is controlled with a flag (msan-track-origins) and is
+/// disabled by default.
+///
+/// Origins are 4-byte values created and interpreted by the runtime library.
+/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
+/// of application memory. Propagation of origins is basically a bunch of
+/// "select" instructions that pick the origin of a dirty argument, if an
+/// instruction has one.
+///
+/// Every 4 aligned, consecutive bytes of application memory have one origin
+/// value associated with them. If these bytes contain uninitialized data
+/// coming from 2 different allocations, the last store wins. Because of this,
+/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
+/// practice.
+///
+/// Origins are meaningless for fully initialized values, so MemorySanitizer
+/// avoids storing origins to memory when a fully initialized value is stored.
+/// This way it avoids needless overwriting of the origin of the 4-byte region
+/// on a short (i.e. 1-byte) clean store, and it is also good for performance.
+///
+/// Atomic handling.
+///
+/// Ideally, every atomic store of an application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, an atomic
+/// store to two disjoint locations cannot be done without severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, shadow store and load are correctly
+/// ordered such that the load will get either the value that was stored, or
+/// some later value (which is always clean).
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. The current
+/// implementation ignores the load aspect of CAS/RMW, always returning a clean
+/// value. It implements the store part as a simple atomic store by storing a
+/// clean shadow.
+///
+/// Instrumenting inline assembly.
+///
+/// For inline assembly code LLVM has little idea about which memory locations
+/// become initialized depending on the arguments. It may be possible to figure
+/// out which arguments are meant to point to inputs and outputs, but the
+/// actual semantics may only be visible at runtime. In the Linux kernel it's
+/// also possible that the arguments only indicate the offset for a base taken
+/// from a segment register, so it's dangerous to treat any asm() arguments as
+/// pointers. We take a conservative approach generating calls to
+///   __msan_instrument_asm_store(ptr, size),
+/// which defers the memory unpoisoning to the runtime library.
+/// The latter can perform more complex address checks to figure out whether
+/// it's safe to touch the shadow memory.
+/// Like with atomic operations, we call __msan_instrument_asm_store() before
+/// the assembly call, so that changes to the shadow memory will be seen by
+/// other threads together with main memory initialization.
+///
+/// KernelMemorySanitizer (KMSAN) implementation.
+///
+/// The major differences between KMSAN and MSan instrumentation are:
+///  - KMSAN always tracks the origins and implies msan-keep-going=true;
+///  - KMSAN allocates shadow and origin memory for each page separately, so
+///    there are no explicit accesses to shadow and origin in the
+///    instrumentation.
+///    Shadow and origin values for a particular X-byte memory location
+///    (X=1,2,4,8) are accessed through pointers obtained via the
+///      __msan_metadata_ptr_for_load_X(ptr)
+///      __msan_metadata_ptr_for_store_X(ptr)
+///    functions. The corresponding functions check that the X-byte accesses
+///    are possible and return the pointers to shadow and origin memory.
+///    Arbitrary-sized accesses are handled with:
+///      __msan_metadata_ptr_for_load_n(ptr, size)
+///      __msan_metadata_ptr_for_store_n(ptr, size);
+///  - TLS variables are stored in a single per-task struct. 
A call to a +/// function __msan_get_context_state() returning a pointer to that struct +/// is inserted into every instrumented function before the entry block; +/// - __msan_warning() takes a 32-bit origin parameter; +/// - local variables are poisoned with __msan_poison_alloca() upon function +/// entry and unpoisoned with __msan_unpoison_alloca() before leaving the +/// function; +/// - the pass doesn't declare any global variables or add global constructors +/// to the translation unit. +/// +/// Also, KMSAN currently ignores uninitialized memory passed into inline asm +/// calls, making sure we're on the safe side wrt. possible false positives. +/// +/// KernelMemorySanitizer only supports X86_64 at the moment. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/MemorySanitizer.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Pass.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <string> +#include <tuple> + +using namespace llvm; + +#define DEBUG_TYPE "msan" + +static const unsigned kOriginSize = 4; +static const unsigned kMinOriginAlignment = 4; +static const unsigned kShadowTLSAlignment = 8; + +// These constants must be kept in sync with the ones in msan.h. +static const unsigned kParamTLSSize = 800; +static const unsigned kRetvalTLSSize = 800; + +// Accesses sizes are powers of two: 1, 2, 4, 8. +static const size_t kNumberOfAccessSizes = 4; + +/// Track origins of uninitialized values. +/// +/// Adds a section to MemorySanitizer report that points to the allocation +/// (stack or heap) the uninitialized bits came from originally. 
+static cl::opt<int> ClTrackOrigins("msan-track-origins",
+       cl::desc("Track origins (allocation sites) of poisoned memory"),
+       cl::Hidden, cl::init(0));
+
+static cl::opt<bool> ClKeepGoing("msan-keep-going",
+       cl::desc("keep going after reporting a UMR"),
+       cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClPoisonStack("msan-poison-stack",
+       cl::desc("poison uninitialized stack variables"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
+       cl::desc("poison uninitialized stack variables with a call"),
+       cl::Hidden, cl::init(false));
+
+static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
+       cl::desc("poison uninitialized stack variables with the given pattern"),
+       cl::Hidden, cl::init(0xff));
+
+static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
+       cl::desc("poison undef temps"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
+       cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
+       cl::desc("exact handling of relational integer ICmp"),
+       cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClHandleLifetimeIntrinsics(
+    "msan-handle-lifetime-intrinsics",
+    cl::desc(
+        "when possible, poison scoped variables at the beginning of the scope "
+        "(slower, but more precise)"),
+    cl::Hidden, cl::init(true));
+
+// When compiling the Linux kernel, we sometimes see false positives related to
+// MSan being unable to understand that inline assembly calls may initialize
+// local variables.
+// This flag makes the compiler conservatively unpoison every memory location
+// passed into an assembly call. Note that this may cause false positives.
+// Because it's impossible to figure out the array sizes, we can only unpoison
+// the first sizeof(type) bytes for each type* pointer.
+// The instrumentation is only enabled in KMSAN builds, and only if
+// -msan-handle-asm-conservative is on. This is done because we may want to
+// quickly disable assembly instrumentation when it breaks.
+static cl::opt<bool> ClHandleAsmConservative(
+    "msan-handle-asm-conservative",
+    cl::desc("conservative handling of inline assembly"), cl::Hidden,
+    cl::init(true));
+
+// This flag controls whether we check the shadow of the address
+// operand of a load or store. Such bugs are very rare, since a load from
+// a garbage address typically results in SEGV, but they still happen
+// (e.g. only the lower bits of the address are garbage, or the access happens
+// early at program startup where malloc-ed memory is more likely to
+// be zeroed). As of 2012-08-28 this flag adds a 20% slowdown.
+static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
+       cl::desc("report accesses through a pointer which has poisoned shadow"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
+       cl::desc("print out instructions with default strict semantics"),
+       cl::Hidden, cl::init(false));
+
+static cl::opt<int> ClInstrumentationWithCallThreshold(
+    "msan-instrumentation-with-call-threshold",
+    cl::desc(
+        "If the function being instrumented requires more than "
+        "this number of checks and origin stores, use callbacks instead of "
+        "inline checks (-1 means never use callbacks)."),
+    cl::Hidden, cl::init(3500));
+
+static cl::opt<bool>
+    ClEnableKmsan("msan-kernel",
+                  cl::desc("Enable KernelMemorySanitizer instrumentation"),
+                  cl::Hidden, cl::init(false));
+
+// This is an experiment to enable handling of cases where shadow is a non-zero
+// compile-time constant. For some unexplained reason such values were silently
+// ignored in the instrumentation.
+static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
+       cl::desc("Insert checks for constant shadow values"),
+       cl::Hidden, cl::init(false));
+
+// This is off by default because of a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool> ClWithComdat("msan-with-comdat",
+       cl::desc("Place MSan constructors in comdat sections"),
+       cl::Hidden, cl::init(false));
+
+// These options allow specifying custom memory map parameters.
+// See MemoryMapParams for details.
+static cl::opt<uint64_t> ClAndMask("msan-and-mask",
+                                   cl::desc("Define custom MSan AndMask"),
+                                   cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClXorMask("msan-xor-mask",
+                                   cl::desc("Define custom MSan XorMask"),
+                                   cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClShadowBase("msan-shadow-base",
+                                      cl::desc("Define custom MSan ShadowBase"),
+                                      cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClOriginBase("msan-origin-base",
+                                      cl::desc("Define custom MSan OriginBase"),
+                                      cl::Hidden, cl::init(0));
+
+static const char *const kMsanModuleCtorName = "msan.module_ctor";
+static const char *const kMsanInitName = "__msan_init";
+
+namespace {
+
+// Memory map parameters used in application-to-shadow address calculation.
+// Offset = (Addr & ~AndMask) ^ XorMask +// Shadow = ShadowBase + Offset +// Origin = OriginBase + Offset +struct MemoryMapParams { + uint64_t AndMask; + uint64_t XorMask; + uint64_t ShadowBase; + uint64_t OriginBase; +}; + +struct PlatformMemoryMapParams { + const MemoryMapParams *bits32; + const MemoryMapParams *bits64; +}; + +} // end anonymous namespace + +// i386 Linux +static const MemoryMapParams Linux_I386_MemoryMapParams = { + 0x000080000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x000040000000, // OriginBase +}; + +// x86_64 Linux +static const MemoryMapParams Linux_X86_64_MemoryMapParams = { +#ifdef MSAN_LINUX_X86_64_OLD_MAPPING + 0x400000000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x200000000000, // OriginBase +#else + 0, // AndMask (not used) + 0x500000000000, // XorMask + 0, // ShadowBase (not used) + 0x100000000000, // OriginBase +#endif +}; + +// mips64 Linux +static const MemoryMapParams Linux_MIPS64_MemoryMapParams = { + 0, // AndMask (not used) + 0x008000000000, // XorMask + 0, // ShadowBase (not used) + 0x002000000000, // OriginBase +}; + +// ppc64 Linux +static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = { + 0xE00000000000, // AndMask + 0x100000000000, // XorMask + 0x080000000000, // ShadowBase + 0x1C0000000000, // OriginBase +}; + +// aarch64 Linux +static const MemoryMapParams Linux_AArch64_MemoryMapParams = { + 0, // AndMask (not used) + 0x06000000000, // XorMask + 0, // ShadowBase (not used) + 0x01000000000, // OriginBase +}; + +// i386 FreeBSD +static const MemoryMapParams FreeBSD_I386_MemoryMapParams = { + 0x000180000000, // AndMask + 0x000040000000, // XorMask + 0x000020000000, // ShadowBase + 0x000700000000, // OriginBase +}; + +// x86_64 FreeBSD +static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = { + 0xc00000000000, // AndMask + 0x200000000000, // XorMask + 0x100000000000, // ShadowBase + 0x380000000000, // OriginBase +}; + +// x86_64 NetBSD +static const MemoryMapParams NetBSD_X86_64_MemoryMapParams = { + 0, // AndMask + 0x500000000000, // XorMask + 0, // ShadowBase + 0x100000000000, // OriginBase +}; + +static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = { + &Linux_I386_MemoryMapParams, + &Linux_X86_64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = { + nullptr, + &Linux_MIPS64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = { + nullptr, + &Linux_PowerPC64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = { + nullptr, + &Linux_AArch64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = { + &FreeBSD_I386_MemoryMapParams, + &FreeBSD_X86_64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams NetBSD_X86_MemoryMapParams = { + nullptr, + &NetBSD_X86_64_MemoryMapParams, +}; + +namespace { + +/// Instrument functions of a module to detect uninitialized reads. +/// +/// Instantiating MemorySanitizer inserts the msan runtime library API function +/// declarations into the module if they don't exist already. Instantiating +/// ensures the __msan_init function is in the list of global constructors for +/// the module. +class MemorySanitizer { +public: + MemorySanitizer(Module &M, MemorySanitizerOptions Options) + : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins), + Recover(Options.Recover) { + initializeModule(M); + } + + // MSan cannot be moved or copied because of MapParams. 
+  MemorySanitizer(MemorySanitizer &&) = delete;
+  MemorySanitizer &operator=(MemorySanitizer &&) = delete;
+  MemorySanitizer(const MemorySanitizer &) = delete;
+  MemorySanitizer &operator=(const MemorySanitizer &) = delete;
+
+  bool sanitizeFunction(Function &F, TargetLibraryInfo &TLI);
+
+private:
+  friend struct MemorySanitizerVisitor;
+  friend struct VarArgAMD64Helper;
+  friend struct VarArgMIPS64Helper;
+  friend struct VarArgAArch64Helper;
+  friend struct VarArgPowerPC64Helper;
+
+  void initializeModule(Module &M);
+  void initializeCallbacks(Module &M);
+  void createKernelApi(Module &M);
+  void createUserspaceApi(Module &M);
+
+  /// True if we're compiling the Linux kernel.
+  bool CompileKernel;
+  /// Track origins (allocation points) of uninitialized values.
+  int TrackOrigins;
+  bool Recover;
+
+  LLVMContext *C;
+  Type *IntptrTy;
+  Type *OriginTy;
+
+  // XxxTLS variables represent the per-thread state in MSan and per-task state
+  // in KMSAN.
+  // In userspace these point to thread-local globals. In the kernel they point
+  // to the members of a per-task struct obtained via a call to
+  // __msan_get_context_state().
+
+  /// Thread-local shadow storage for function parameters.
+  Value *ParamTLS;
+
+  /// Thread-local origin storage for function parameters.
+  Value *ParamOriginTLS;
+
+  /// Thread-local shadow storage for function return value.
+  Value *RetvalTLS;
+
+  /// Thread-local origin storage for function return value.
+  Value *RetvalOriginTLS;
+
+  /// Thread-local shadow storage for in-register va_arg function
+  /// parameters (x86_64-specific).
+  Value *VAArgTLS;
+
+  /// Thread-local origin storage for in-register va_arg function
+  /// parameters (x86_64-specific).
+  Value *VAArgOriginTLS;
+
+  /// Thread-local storage for the size of the va_arg overflow area
+  /// (x86_64-specific).
+  Value *VAArgOverflowSizeTLS;
+
+  /// Thread-local space used to pass origin value to the UMR reporting
+  /// function.
+  Value *OriginTLS;
+
+  /// Are the instrumentation callbacks set up?
+  bool CallbacksInitialized = false;
+
+  /// The run-time callback to print a warning.
+  FunctionCallee WarningFn;
+
+  // These arrays are indexed by log2(AccessSize).
+  FunctionCallee MaybeWarningFn[kNumberOfAccessSizes];
+  FunctionCallee MaybeStoreOriginFn[kNumberOfAccessSizes];
+
+  /// Run-time helper that generates a new origin value for a stack
+  /// allocation.
+  FunctionCallee MsanSetAllocaOrigin4Fn;
+
+  /// Run-time helper that poisons stack on function entry.
+  FunctionCallee MsanPoisonStackFn;
+
+  /// Run-time helper that records a store (or any event) of an
+  /// uninitialized value and returns an updated origin id encoding this info.
+  FunctionCallee MsanChainOriginFn;
+
+  /// MSan runtime replacements for memmove, memcpy and memset.
+  FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
+
+  /// KMSAN context state type, and the callback used to obtain the per-task
+  /// state that holds function argument shadow.
+  StructType *MsanContextStateTy;
+  FunctionCallee MsanGetContextStateFn;
+
+  /// Functions for poisoning/unpoisoning local variables.
+  FunctionCallee MsanPoisonAllocaFn, MsanUnpoisonAllocaFn;
+
+  /// Each of the MsanMetadataPtrXxx functions returns a pair of shadow/origin
+  /// pointers.
+  FunctionCallee MsanMetadataPtrForLoadN, MsanMetadataPtrForStoreN;
+  FunctionCallee MsanMetadataPtrForLoad_1_8[4];
+  FunctionCallee MsanMetadataPtrForStore_1_8[4];
+  FunctionCallee MsanInstrumentAsmStoreFn;
+
+  /// Helper to choose between different MsanMetadataPtrXxx().
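+  ///
+  /// For example (illustrative): a naturally-sized 8-byte access resolves to
+  /// __msan_metadata_ptr_for_{load,store}_8, while an odd size such as 3
+  /// bytes has no per-size callback, so the caller falls back to
+  /// MsanMetadataPtrForLoadN/MsanMetadataPtrForStoreN with an explicit size
+  /// argument.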
+  FunctionCallee getKmsanShadowOriginAccessFn(bool isStore, int size);
+
+  /// Memory map parameters used in application-to-shadow calculation.
+  const MemoryMapParams *MapParams;
+
+  /// Custom memory map parameters used when -msan-shadow-base or
+  /// -msan-origin-base is provided.
+  MemoryMapParams CustomMapParams;
+
+  MDNode *ColdCallWeights;
+
+  /// Branch weights for origin store.
+  MDNode *OriginStoreWeights;
+
+  /// An empty volatile inline asm that prevents callback merge.
+  InlineAsm *EmptyAsm;
+};
+
+void insertModuleCtor(Module &M) {
+  getOrCreateSanitizerCtorAndInitFunctions(
+      M, kMsanModuleCtorName, kMsanInitName,
+      /*InitArgTypes=*/{},
+      /*InitArgs=*/{},
+      // This callback is invoked when the functions are created the first
+      // time. Hook them into the global ctors list in that case:
+      [&](Function *Ctor, FunctionCallee) {
+        if (!ClWithComdat) {
+          appendToGlobalCtors(M, Ctor, 0);
+          return;
+        }
+        Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName);
+        Ctor->setComdat(MsanCtorComdat);
+        appendToGlobalCtors(M, Ctor, 0, Ctor);
+      });
+}
+
+/// A legacy function pass for msan instrumentation.
+///
+/// Instruments functions to detect uninitialized reads.
+struct MemorySanitizerLegacyPass : public FunctionPass {
+  // Pass identification, replacement for typeid.
+  static char ID;
+
+  MemorySanitizerLegacyPass(MemorySanitizerOptions Options = {})
+      : FunctionPass(ID), Options(Options) {}
+  StringRef getPassName() const override {
+    return "MemorySanitizerLegacyPass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    return MSan->sanitizeFunction(
+        F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F));
+  }
+  bool doInitialization(Module &M) override;
+
+  Optional<MemorySanitizer> MSan;
+  MemorySanitizerOptions Options;
+};
+
+template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
+  return (Opt.getNumOccurrences() > 0) ? Opt : Default;
+}
+
+} // end anonymous namespace
+
+MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K)
+    : Kernel(getOptOrDefault(ClEnableKmsan, K)),
+      TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)),
+      Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {}
+
+PreservedAnalyses MemorySanitizerPass::run(Function &F,
+                                           FunctionAnalysisManager &FAM) {
+  MemorySanitizer Msan(*F.getParent(), Options);
+  if (Msan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+PreservedAnalyses MemorySanitizerPass::run(Module &M,
+                                           ModuleAnalysisManager &AM) {
+  if (Options.Kernel)
+    return PreservedAnalyses::all();
+  insertModuleCtor(M);
+  return PreservedAnalyses::none();
+}
+
+char MemorySanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan",
+                      "MemorySanitizer: detects uninitialized reads.", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MemorySanitizerLegacyPass, "msan",
+                    "MemorySanitizer: detects uninitialized reads.", false,
+                    false)
+
+FunctionPass *
+llvm::createMemorySanitizerLegacyPassPass(MemorySanitizerOptions Options) {
+  return new MemorySanitizerLegacyPass(Options);
+}
+
+/// Create a non-const global initialized with the given string.
+///
+/// Creates a writable global for Str so that we can pass it to the
+/// run-time lib. The runtime uses the first 4 bytes of the string to store
+/// the frame ID, so the string needs to be mutable.
+static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
+                                                            StringRef Str) {
+  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+  return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
+                            GlobalValue::PrivateLinkage, StrConst, "");
+}
+
+/// Create KMSAN API callbacks.
+void MemorySanitizer::createKernelApi(Module &M) {
+  IRBuilder<> IRB(*C);
+
+  // These will be initialized in insertKmsanPrologue().
+  RetvalTLS = nullptr;
+  RetvalOriginTLS = nullptr;
+  ParamTLS = nullptr;
+  ParamOriginTLS = nullptr;
+  VAArgTLS = nullptr;
+  VAArgOriginTLS = nullptr;
+  VAArgOverflowSizeTLS = nullptr;
+  // OriginTLS is unused in the kernel.
+  OriginTLS = nullptr;
+
+  // __msan_warning() in the kernel takes an origin.
+  WarningFn = M.getOrInsertFunction("__msan_warning", IRB.getVoidTy(),
+                                    IRB.getInt32Ty());
+  MsanContextStateTy = StructType::get(
+      ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
+      ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8),
+      ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
+      ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), /* va_arg_origin */
+      IRB.getInt64Ty(), ArrayType::get(OriginTy, kParamTLSSize / 4), OriginTy,
+      OriginTy);
+  // Requests the per-task context state (kmsan_context_state*) from the
+  // runtime library.
+  MsanGetContextStateFn = M.getOrInsertFunction(
+      "__msan_get_context_state", PointerType::get(MsanContextStateTy, 0));
+
+  Type *RetTy = StructType::get(PointerType::get(IRB.getInt8Ty(), 0),
+                                PointerType::get(IRB.getInt32Ty(), 0));
+
+  for (int ind = 0, size = 1; ind < 4; ind++, size <<= 1) {
+    std::string name_load =
+        "__msan_metadata_ptr_for_load_" + std::to_string(size);
+    std::string name_store =
+        "__msan_metadata_ptr_for_store_" + std::to_string(size);
+    MsanMetadataPtrForLoad_1_8[ind] = M.getOrInsertFunction(
+        name_load, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
+    MsanMetadataPtrForStore_1_8[ind] = M.getOrInsertFunction(
+        name_store, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
+  }
+
+  MsanMetadataPtrForLoadN = M.getOrInsertFunction(
+      "__msan_metadata_ptr_for_load_n", RetTy,
+      PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
+  MsanMetadataPtrForStoreN = M.getOrInsertFunction(
+      "__msan_metadata_ptr_for_store_n", RetTy,
+      PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
+
+  // Functions for poisoning and unpoisoning memory.
+  MsanPoisonAllocaFn =
+      M.getOrInsertFunction("__msan_poison_alloca", IRB.getVoidTy(),
+                            IRB.getInt8PtrTy(), IntptrTy, IRB.getInt8PtrTy());
+  MsanUnpoisonAllocaFn = M.getOrInsertFunction(
+      "__msan_unpoison_alloca", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy);
+}
+
+static Constant *getOrInsertGlobal(Module &M, StringRef Name, Type *Ty) {
+  return M.getOrInsertGlobal(Name, Ty, [&] {
+    return new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+                              nullptr, Name, nullptr,
+                              GlobalVariable::InitialExecTLSModel);
+  });
+}
+
+/// Insert declarations for userspace-specific functions and globals.
+void MemorySanitizer::createUserspaceApi(Module &M) {
+  IRBuilder<> IRB(*C);
+  // Create the warning callback.
+  // FIXME: this function should have "Cold" calling conv,
+  // which is not yet implemented.
+  StringRef WarningFnName = Recover ? "__msan_warning"
+                                    : "__msan_warning_noreturn";
+  WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy());
+
+  // Create the global TLS variables.
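+  // Rough sketch of the resulting layout, assuming the default TLS sizes
+  // declared earlier in this file: for a function f(i32 a, i64 b), the
+  // shadow of 'a' occupies bytes [0, 4) of __msan_param_tls and the shadow
+  // of 'b' bytes [8, 16), since each argument slot is aligned to
+  // kShadowTLSAlignment.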
+  RetvalTLS =
+      getOrInsertGlobal(M, "__msan_retval_tls",
+                        ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8));
+
+  RetvalOriginTLS = getOrInsertGlobal(M, "__msan_retval_origin_tls", OriginTy);
+
+  ParamTLS =
+      getOrInsertGlobal(M, "__msan_param_tls",
+                        ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
+
+  ParamOriginTLS =
+      getOrInsertGlobal(M, "__msan_param_origin_tls",
+                        ArrayType::get(OriginTy, kParamTLSSize / 4));
+
+  VAArgTLS =
+      getOrInsertGlobal(M, "__msan_va_arg_tls",
+                        ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
+
+  VAArgOriginTLS =
+      getOrInsertGlobal(M, "__msan_va_arg_origin_tls",
+                        ArrayType::get(OriginTy, kParamTLSSize / 4));
+
+  VAArgOverflowSizeTLS =
+      getOrInsertGlobal(M, "__msan_va_arg_overflow_size_tls", IRB.getInt64Ty());
+  OriginTLS = getOrInsertGlobal(M, "__msan_origin_tls", IRB.getInt32Ty());
+
+  for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+       AccessSizeIndex++) {
+    unsigned AccessSize = 1 << AccessSizeIndex;
+    std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
+    MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
+        FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
+        IRB.getInt32Ty());
+
+    FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
+    MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
+        FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
+        IRB.getInt8PtrTy(), IRB.getInt32Ty());
+  }
+
+  MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
+      "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(),
+      IntptrTy, IRB.getInt8PtrTy(), IntptrTy);
+  MsanPoisonStackFn =
+      M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
+                            IRB.getInt8PtrTy(), IntptrTy);
+}
+
+/// Insert extern declarations of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+  // Only do this once.
+  if (CallbacksInitialized)
+    return;
+
+  IRBuilder<> IRB(*C);
+  // Initialize callbacks that are common for kernel and userspace
+  // instrumentation.
+  MsanChainOriginFn = M.getOrInsertFunction(
+      "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
+  MemmoveFn = M.getOrInsertFunction(
+      "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IntptrTy);
+  MemcpyFn = M.getOrInsertFunction(
+      "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IntptrTy);
+  MemsetFn = M.getOrInsertFunction(
+      "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+      IRB.getInt32Ty(), IntptrTy);
+  // We insert an empty inline asm after __msan_warning* calls to avoid
+  // callback merge.
+  EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+                            StringRef(""), StringRef(""),
+                            /*hasSideEffects=*/true);
+
+  MsanInstrumentAsmStoreFn =
+      M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(),
+                            PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
+
+  if (CompileKernel) {
+    createKernelApi(M);
+  } else {
+    createUserspaceApi(M);
+  }
+  CallbacksInitialized = true;
+}
+
+FunctionCallee MemorySanitizer::getKmsanShadowOriginAccessFn(bool isStore,
+                                                             int size) {
+  FunctionCallee *Fns =
+      isStore ? MsanMetadataPtrForStore_1_8 : MsanMetadataPtrForLoad_1_8;
+  switch (size) {
+  case 1:
+    return Fns[0];
+  case 2:
+    return Fns[1];
+  case 4:
+    return Fns[2];
+  case 8:
+    return Fns[3];
+  default:
+    return nullptr;
+  }
+}
+
+/// Module-level initialization.
+///
+/// Inserts a call to __msan_init into the module's constructor list.
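+///
+/// For illustration only, a hypothetical invocation overriding the mapping:
+///   -msan-shadow-base=0x200000000000 -msan-and-mask=0 -msan-xor-mask=0
+/// makes the pass use CustomMapParams instead of the per-platform tables
+/// chosen below.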
+void MemorySanitizer::initializeModule(Module &M) { + auto &DL = M.getDataLayout(); + + bool ShadowPassed = ClShadowBase.getNumOccurrences() > 0; + bool OriginPassed = ClOriginBase.getNumOccurrences() > 0; + // Check the overrides first + if (ShadowPassed || OriginPassed) { + CustomMapParams.AndMask = ClAndMask; + CustomMapParams.XorMask = ClXorMask; + CustomMapParams.ShadowBase = ClShadowBase; + CustomMapParams.OriginBase = ClOriginBase; + MapParams = &CustomMapParams; + } else { + Triple TargetTriple(M.getTargetTriple()); + switch (TargetTriple.getOS()) { + case Triple::FreeBSD: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = FreeBSD_X86_MemoryMapParams.bits64; + break; + case Triple::x86: + MapParams = FreeBSD_X86_MemoryMapParams.bits32; + break; + default: + report_fatal_error("unsupported architecture"); + } + break; + case Triple::NetBSD: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = NetBSD_X86_MemoryMapParams.bits64; + break; + default: + report_fatal_error("unsupported architecture"); + } + break; + case Triple::Linux: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = Linux_X86_MemoryMapParams.bits64; + break; + case Triple::x86: + MapParams = Linux_X86_MemoryMapParams.bits32; + break; + case Triple::mips64: + case Triple::mips64el: + MapParams = Linux_MIPS_MemoryMapParams.bits64; + break; + case Triple::ppc64: + case Triple::ppc64le: + MapParams = Linux_PowerPC_MemoryMapParams.bits64; + break; + case Triple::aarch64: + case Triple::aarch64_be: + MapParams = Linux_ARM_MemoryMapParams.bits64; + break; + default: + report_fatal_error("unsupported architecture"); + } + break; + default: + report_fatal_error("unsupported operating system"); + } + } + + C = &(M.getContext()); + IRBuilder<> IRB(*C); + IntptrTy = IRB.getIntPtrTy(DL); + OriginTy = IRB.getInt32Ty(); + + ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000); + OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000); + + if (!CompileKernel) { + if (TrackOrigins) + M.getOrInsertGlobal("__msan_track_origins", IRB.getInt32Ty(), [&] { + return new GlobalVariable( + M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(TrackOrigins), "__msan_track_origins"); + }); + + if (Recover) + M.getOrInsertGlobal("__msan_keep_going", IRB.getInt32Ty(), [&] { + return new GlobalVariable(M, IRB.getInt32Ty(), true, + GlobalValue::WeakODRLinkage, + IRB.getInt32(Recover), "__msan_keep_going"); + }); +} +} + +bool MemorySanitizerLegacyPass::doInitialization(Module &M) { + if (!Options.Kernel) + insertModuleCtor(M); + MSan.emplace(M, Options); + return true; +} + +namespace { + +/// A helper class that handles instrumentation of VarArg +/// functions on a particular platform. +/// +/// Implementations are expected to insert the instrumentation +/// necessary to propagate argument shadow through VarArg function +/// calls. Visit* methods are called during an InstVisitor pass over +/// the function, and should avoid creating new basic blocks. A new +/// instance of this class is created for each instrumented function. +struct VarArgHelper { + virtual ~VarArgHelper() = default; + + /// Visit a CallSite. + virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0; + + /// Visit a va_start call. + virtual void visitVAStartInst(VAStartInst &I) = 0; + + /// Visit a va_copy call. + virtual void visitVACopyInst(VACopyInst &I) = 0; + + /// Finalize function instrumentation. 
+ /// + /// This method is called after visiting all interesting (see above) + /// instructions in a function. + virtual void finalizeInstrumentation() = 0; +}; + +struct MemorySanitizerVisitor; + +} // end anonymous namespace + +static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, + MemorySanitizerVisitor &Visitor); + +static unsigned TypeSizeToSizeIndex(unsigned TypeSize) { + if (TypeSize <= 8) return 0; + return Log2_32_Ceil((TypeSize + 7) / 8); +} + +namespace { + +/// This class does all the work for a given function. Store and Load +/// instructions store and load corresponding shadow and origin +/// values. Most instructions propagate shadow from arguments to their +/// return values. Certain instructions (most importantly, BranchInst) +/// test their argument shadow and print reports (with a runtime call) if it's +/// non-zero. +struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { + Function &F; + MemorySanitizer &MS; + SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes; + ValueMap<Value*, Value*> ShadowMap, OriginMap; + std::unique_ptr<VarArgHelper> VAHelper; + const TargetLibraryInfo *TLI; + BasicBlock *ActualFnStart; + + // The following flags disable parts of MSan instrumentation based on + // blacklist contents and command-line options. + bool InsertChecks; + bool PropagateShadow; + bool PoisonStack; + bool PoisonUndef; + bool CheckReturnValue; + + struct ShadowOriginAndInsertPoint { + Value *Shadow; + Value *Origin; + Instruction *OrigIns; + + ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I) + : Shadow(S), Origin(O), OrigIns(I) {} + }; + SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList; + bool InstrumentLifetimeStart = ClHandleLifetimeIntrinsics; + SmallSet<AllocaInst *, 16> AllocaSet; + SmallVector<std::pair<IntrinsicInst *, AllocaInst *>, 16> LifetimeStartList; + SmallVector<StoreInst *, 16> StoreList; + + MemorySanitizerVisitor(Function &F, MemorySanitizer &MS, + const TargetLibraryInfo &TLI) + : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) { + bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory); + InsertChecks = SanitizeFunction; + PropagateShadow = SanitizeFunction; + PoisonStack = SanitizeFunction && ClPoisonStack; + PoisonUndef = SanitizeFunction && ClPoisonUndef; + // FIXME: Consider using SpecialCaseList to specify a list of functions that + // must always return fully initialized values. For now, we hardcode "main". + CheckReturnValue = SanitizeFunction && (F.getName() == "main"); + + MS.initializeCallbacks(*F.getParent()); + if (MS.CompileKernel) + ActualFnStart = insertKmsanPrologue(F); + else + ActualFnStart = &F.getEntryBlock(); + + LLVM_DEBUG(if (!InsertChecks) dbgs() + << "MemorySanitizer is not inserting checks into '" + << F.getName() << "'\n"); + } + + Value *updateOrigin(Value *V, IRBuilder<> &IRB) { + if (MS.TrackOrigins <= 1) return V; + return IRB.CreateCall(MS.MsanChainOriginFn, V); + } + + Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) { + const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); + if (IntptrSize == kOriginSize) return Origin; + assert(IntptrSize == kOriginSize * 2); + Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false); + return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8)); + } + + /// Fill memory range with the given origin value. 
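+  ///
+  /// Illustrative example: on x86_64, where IntptrSize == 8 and an origin
+  /// occupies kOriginSize == 4 bytes, painting a 16-byte range through an
+  /// 8-byte-aligned origin pointer emits two i64 stores, each holding the
+  /// 32-bit origin id replicated twice by originToIntptr().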
+ void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr, + unsigned Size, unsigned Alignment) { + const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned IntptrAlignment = DL.getABITypeAlignment(MS.IntptrTy); + unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); + assert(IntptrAlignment >= kMinOriginAlignment); + assert(IntptrSize >= kOriginSize); + + unsigned Ofs = 0; + unsigned CurrentAlignment = Alignment; + if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { + Value *IntptrOrigin = originToIntptr(IRB, Origin); + Value *IntptrOriginPtr = + IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0)); + for (unsigned i = 0; i < Size / IntptrSize; ++i) { + Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i) + : IntptrOriginPtr; + IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment); + Ofs += IntptrSize / kOriginSize; + CurrentAlignment = IntptrAlignment; + } + } + + for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) { + Value *GEP = + i ? IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr; + IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment); + CurrentAlignment = kMinOriginAlignment; + } + } + + void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, + Value *OriginPtr, unsigned Alignment, bool AsCall) { + const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); + if (Shadow->getType()->isAggregateType()) { + paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, + OriginAlignment); + } else { + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); + if (ConstantShadow) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) + paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, + OriginAlignment); + return; + } + + unsigned TypeSizeInBits = + DL.getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) { + FunctionCallee Fn = MS.MaybeStoreOriginFn[SizeIndex]; + Value *ConvertedShadow2 = IRB.CreateZExt( + ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall(Fn, {ConvertedShadow2, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + Origin}); + } else { + Value *Cmp = IRB.CreateICmpNE( + ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + IRBuilder<> IRBNew(CheckTerm); + paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), OriginPtr, StoreSize, + OriginAlignment); + } + } + } + + void materializeStores(bool InstrumentWithCalls) { + for (StoreInst *SI : StoreList) { + IRBuilder<> IRB(SI); + Value *Val = SI->getValueOperand(); + Value *Addr = SI->getPointerOperand(); + Value *Shadow = SI->isAtomic() ? 
getCleanShadow(Val) : getShadow(Val); + Value *ShadowPtr, *OriginPtr; + Type *ShadowTy = Shadow->getType(); + unsigned Alignment = SI->getAlignment(); + unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + std::tie(ShadowPtr, OriginPtr) = + getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true); + + StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment); + LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); + (void)NewSI; + + if (SI->isAtomic()) + SI->setOrdering(addReleaseOrdering(SI->getOrdering())); + + if (MS.TrackOrigins && !SI->isAtomic()) + storeOrigin(IRB, Addr, Shadow, getOrigin(Val), OriginPtr, + OriginAlignment, InstrumentWithCalls); + } + } + + /// Helper function to insert a warning at IRB's current insert point. + void insertWarningFn(IRBuilder<> &IRB, Value *Origin) { + if (!Origin) + Origin = (Value *)IRB.getInt32(0); + if (MS.CompileKernel) { + IRB.CreateCall(MS.WarningFn, Origin); + } else { + if (MS.TrackOrigins) { + IRB.CreateStore(Origin, MS.OriginTLS); + } + IRB.CreateCall(MS.WarningFn, {}); + } + IRB.CreateCall(MS.EmptyAsm, {}); + // FIXME: Insert UnreachableInst if !MS.Recover? + // This may invalidate some of the following checks and needs to be done + // at the very end. + } + + void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin, + bool AsCall) { + IRBuilder<> IRB(OrigIns); + LLVM_DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); + + Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); + if (ConstantShadow) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) { + insertWarningFn(IRB, Origin); + } + return; + } + + const DataLayout &DL = OrigIns->getModule()->getDataLayout(); + + unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) { + FunctionCallee Fn = MS.MaybeWarningFn[SizeIndex]; + Value *ConvertedShadow2 = + IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall(Fn, {ConvertedShadow2, MS.TrackOrigins && Origin + ? 
Origin + : (Value *)IRB.getInt32(0)}); + } else { + Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, + getCleanShadow(ConvertedShadow), "_mscmp"); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, OrigIns, + /* Unreachable */ !MS.Recover, MS.ColdCallWeights); + + IRB.SetInsertPoint(CheckTerm); + insertWarningFn(IRB, Origin); + LLVM_DEBUG(dbgs() << " CHECK: " << *Cmp << "\n"); + } + } + + void materializeChecks(bool InstrumentWithCalls) { + for (const auto &ShadowData : InstrumentationList) { + Instruction *OrigIns = ShadowData.OrigIns; + Value *Shadow = ShadowData.Shadow; + Value *Origin = ShadowData.Origin; + materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls); + } + LLVM_DEBUG(dbgs() << "DONE:\n" << F); + } + + BasicBlock *insertKmsanPrologue(Function &F) { + BasicBlock *ret = + SplitBlock(&F.getEntryBlock(), F.getEntryBlock().getFirstNonPHI()); + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + Value *ContextState = IRB.CreateCall(MS.MsanGetContextStateFn, {}); + Constant *Zero = IRB.getInt32(0); + MS.ParamTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState, + {Zero, IRB.getInt32(0)}, "param_shadow"); + MS.RetvalTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState, + {Zero, IRB.getInt32(1)}, "retval_shadow"); + MS.VAArgTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState, + {Zero, IRB.getInt32(2)}, "va_arg_shadow"); + MS.VAArgOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState, + {Zero, IRB.getInt32(3)}, "va_arg_origin"); + MS.VAArgOverflowSizeTLS = + IRB.CreateGEP(MS.MsanContextStateTy, ContextState, + {Zero, IRB.getInt32(4)}, "va_arg_overflow_size"); + MS.ParamOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState, + {Zero, IRB.getInt32(5)}, "param_origin"); + MS.RetvalOriginTLS = + IRB.CreateGEP(MS.MsanContextStateTy, ContextState, + {Zero, IRB.getInt32(6)}, "retval_origin"); + return ret; + } + + /// Add MemorySanitizer instrumentation to a function. + bool runOnFunction() { + // In the presence of unreachable blocks, we may see Phi nodes with + // incoming nodes from such blocks. Since InstVisitor skips unreachable + // blocks, such nodes will not have any shadow value associated with them. + // It's easier to remove unreachable blocks than deal with missing shadow. + removeUnreachableBlocks(F); + + // Iterate all BBs in depth-first order and create shadow instructions + // for all instructions (where applicable). + // For PHI nodes we create dummy shadow PHIs which will be finalized later. + for (BasicBlock *BB : depth_first(ActualFnStart)) + visit(*BB); + + // Finalize PHI nodes. + for (PHINode *PN : ShadowPHINodes) { + PHINode *PNS = cast<PHINode>(getShadow(PN)); + PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr; + size_t NumValues = PN->getNumIncomingValues(); + for (size_t v = 0; v < NumValues; v++) { + PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v)); + if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v)); + } + } + + VAHelper->finalizeInstrumentation(); + + // Poison llvm.lifetime.start intrinsics, if we haven't fallen back to + // instrumenting only allocas. + if (InstrumentLifetimeStart) { + for (auto Item : LifetimeStartList) { + instrumentAlloca(*Item.second, Item.first); + AllocaSet.erase(Item.second); + } + } + // Poison the allocas for which we didn't instrument the corresponding + // lifetime intrinsics. 
+    for (AllocaInst *AI : AllocaSet)
+      instrumentAlloca(*AI);
+
+    bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 &&
+                               InstrumentationList.size() + StoreList.size() >
+                                   (unsigned)ClInstrumentationWithCallThreshold;
+
+    // Insert shadow value checks.
+    materializeChecks(InstrumentWithCalls);
+
+    // Delayed instrumentation of StoreInst.
+    // This may not add new address checks.
+    materializeStores(InstrumentWithCalls);
+
+    return true;
+  }
+
+  /// Compute the shadow type that corresponds to a given Value.
+  Type *getShadowTy(Value *V) {
+    return getShadowTy(V->getType());
+  }
+
+  /// Compute the shadow type that corresponds to a given Type.
+  Type *getShadowTy(Type *OrigTy) {
+    if (!OrigTy->isSized()) {
+      return nullptr;
+    }
+    // For integer type, shadow is the same as the original type.
+    // This may return weird-sized types like i1.
+    if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
+      return IT;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
+      uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
+      return VectorType::get(IntegerType::get(*MS.C, EltSize),
+                             VT->getNumElements());
+    }
+    if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
+      return ArrayType::get(getShadowTy(AT->getElementType()),
+                            AT->getNumElements());
+    }
+    if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
+      SmallVector<Type*, 4> Elements;
+      for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+        Elements.push_back(getShadowTy(ST->getElementType(i)));
+      StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
+      LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+      return Res;
+    }
+    uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
+    return IntegerType::get(*MS.C, TypeSize);
+  }
+
+  /// Flatten a vector type.
+  Type *getShadowTyNoVec(Type *ty) {
+    if (VectorType *vt = dyn_cast<VectorType>(ty))
+      return IntegerType::get(*MS.C, vt->getBitWidth());
+    return ty;
+  }
+
+  /// Convert a shadow value to its flattened variant.
+  Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) {
+    Type *Ty = V->getType();
+    Type *NoVecTy = getShadowTyNoVec(Ty);
+    if (Ty == NoVecTy) return V;
+    return IRB.CreateBitCast(V, NoVecTy);
+  }
+
+  /// Compute the integer shadow offset that corresponds to a given
+  /// application address.
+  ///
+  /// Offset = (Addr & ~AndMask) ^ XorMask
+  Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) {
+    Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy);
+
+    uint64_t AndMask = MS.MapParams->AndMask;
+    if (AndMask)
+      OffsetLong =
+          IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask));
+
+    uint64_t XorMask = MS.MapParams->XorMask;
+    if (XorMask)
+      OffsetLong =
+          IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask));
+    return OffsetLong;
+  }
+
+  /// Compute the shadow and origin addresses corresponding to a given
+  /// application address.
+ /// + /// Shadow = ShadowBase + Offset + /// Origin = (OriginBase + Offset) & ~3ULL + std::pair<Value *, Value *> getShadowOriginPtrUserspace(Value *Addr, + IRBuilder<> &IRB, + Type *ShadowTy, + unsigned Alignment) { + Value *ShadowOffset = getShadowPtrOffset(Addr, IRB); + Value *ShadowLong = ShadowOffset; + uint64_t ShadowBase = MS.MapParams->ShadowBase; + if (ShadowBase != 0) { + ShadowLong = + IRB.CreateAdd(ShadowLong, + ConstantInt::get(MS.IntptrTy, ShadowBase)); + } + Value *ShadowPtr = + IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0)); + Value *OriginPtr = nullptr; + if (MS.TrackOrigins) { + Value *OriginLong = ShadowOffset; + uint64_t OriginBase = MS.MapParams->OriginBase; + if (OriginBase != 0) + OriginLong = IRB.CreateAdd(OriginLong, + ConstantInt::get(MS.IntptrTy, OriginBase)); + if (Alignment < kMinOriginAlignment) { + uint64_t Mask = kMinOriginAlignment - 1; + OriginLong = + IRB.CreateAnd(OriginLong, ConstantInt::get(MS.IntptrTy, ~Mask)); + } + OriginPtr = + IRB.CreateIntToPtr(OriginLong, PointerType::get(MS.OriginTy, 0)); + } + return std::make_pair(ShadowPtr, OriginPtr); + } + + std::pair<Value *, Value *> + getShadowOriginPtrKernel(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, + unsigned Alignment, bool isStore) { + Value *ShadowOriginPtrs; + const DataLayout &DL = F.getParent()->getDataLayout(); + int Size = DL.getTypeStoreSize(ShadowTy); + + FunctionCallee Getter = MS.getKmsanShadowOriginAccessFn(isStore, Size); + Value *AddrCast = + IRB.CreatePointerCast(Addr, PointerType::get(IRB.getInt8Ty(), 0)); + if (Getter) { + ShadowOriginPtrs = IRB.CreateCall(Getter, AddrCast); + } else { + Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size); + ShadowOriginPtrs = IRB.CreateCall(isStore ? MS.MsanMetadataPtrForStoreN + : MS.MsanMetadataPtrForLoadN, + {AddrCast, SizeVal}); + } + Value *ShadowPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 0); + ShadowPtr = IRB.CreatePointerCast(ShadowPtr, PointerType::get(ShadowTy, 0)); + Value *OriginPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 1); + + return std::make_pair(ShadowPtr, OriginPtr); + } + + std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB, + Type *ShadowTy, + unsigned Alignment, + bool isStore) { + std::pair<Value *, Value *> ret; + if (MS.CompileKernel) + ret = getShadowOriginPtrKernel(Addr, IRB, ShadowTy, Alignment, isStore); + else + ret = getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment); + return ret; + } + + /// Compute the shadow address for a given function argument. + /// + /// Shadow = ParamTLS+ArgOffset. + Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB, + int ArgOffset) { + Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy); + if (ArgOffset) + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0), + "_msarg"); + } + + /// Compute the origin address for a given function argument. + Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB, + int ArgOffset) { + if (!MS.TrackOrigins) + return nullptr; + Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy); + if (ArgOffset) + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0), + "_msarg_o"); + } + + /// Compute the shadow address for a retval. 
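+  ///
+  /// Roughly the following IR (illustrative, for an i32 return value and the
+  /// default kRetvalTLSSize):
+  ///   %_msret = bitcast [100 x i64]* @__msan_retval_tls to i32*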
+  Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
+    return IRB.CreatePointerCast(MS.RetvalTLS,
+                                 PointerType::get(getShadowTy(A), 0),
+                                 "_msret");
+  }
+
+  /// Compute the origin address for a retval.
+  Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
+    // We keep a single origin for the entire retval. Might be too optimistic.
+    return MS.RetvalOriginTLS;
+  }
+
+  /// Set SV to be the shadow value for V.
+  void setShadow(Value *V, Value *SV) {
+    assert(!ShadowMap.count(V) && "Values may only have one shadow");
+    ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
+  }
+
+  /// Set Origin to be the origin value for V.
+  void setOrigin(Value *V, Value *Origin) {
+    if (!MS.TrackOrigins) return;
+    assert(!OriginMap.count(V) && "Values may only have one origin");
+    LLVM_DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
+    OriginMap[V] = Origin;
+  }
+
+  Constant *getCleanShadow(Type *OrigTy) {
+    Type *ShadowTy = getShadowTy(OrigTy);
+    if (!ShadowTy)
+      return nullptr;
+    return Constant::getNullValue(ShadowTy);
+  }
+
+  /// Create a clean shadow value for a given value.
+  ///
+  /// Clean shadow (all zeroes) means all bits of the value are defined
+  /// (initialized).
+  Constant *getCleanShadow(Value *V) {
+    return getCleanShadow(V->getType());
+  }
+
+  /// Create a dirty shadow of a given shadow type.
+  Constant *getPoisonedShadow(Type *ShadowTy) {
+    assert(ShadowTy);
+    if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
+      return Constant::getAllOnesValue(ShadowTy);
+    if (ArrayType *AT = dyn_cast<ArrayType>(ShadowTy)) {
+      SmallVector<Constant *, 4> Vals(AT->getNumElements(),
+                                      getPoisonedShadow(AT->getElementType()));
+      return ConstantArray::get(AT, Vals);
+    }
+    if (StructType *ST = dyn_cast<StructType>(ShadowTy)) {
+      SmallVector<Constant *, 4> Vals;
+      for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+        Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+      return ConstantStruct::get(ST, Vals);
+    }
+    llvm_unreachable("Unexpected shadow type");
+  }
+
+  /// Create a dirty shadow for a given value.
+  Constant *getPoisonedShadow(Value *V) {
+    Type *ShadowTy = getShadowTy(V);
+    if (!ShadowTy)
+      return nullptr;
+    return getPoisonedShadow(ShadowTy);
+  }
+
+  /// Create a clean (zero) origin.
+  Value *getCleanOrigin() {
+    return Constant::getNullValue(MS.OriginTy);
+  }
+
+  /// Get the shadow value for a given Value.
+  ///
+  /// This function either returns the value set earlier with setShadow,
+  /// or extracts it from ParamTLS (for function arguments).
+  Value *getShadow(Value *V) {
+    if (!PropagateShadow) return getCleanShadow(V);
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
+      if (I->getMetadata("nosanitize"))
+        return getCleanShadow(V);
+      // For instructions the shadow is already stored in the map.
+      Value *Shadow = ShadowMap[V];
+      if (!Shadow) {
+        LLVM_DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+        (void)I;
+        assert(Shadow && "No shadow for a value");
+      }
+      return Shadow;
+    }
+    if (UndefValue *U = dyn_cast<UndefValue>(V)) {
+      Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+      LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+      (void)U;
+      return AllOnes;
+    }
+    if (Argument *A = dyn_cast<Argument>(V)) {
+      // For arguments we compute the shadow on demand and store it in the map.
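+      // E.g. (illustrative) for f(i32 %a, i64 %b): the shadow of %a is
+      // loaded from offset 0 of __msan_param_tls and the shadow of %b from
+      // offset 8, matching the alignTo(Size, kShadowTLSAlignment) stride
+      // applied at the bottom of this loop.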
+ Value **ShadowPtr = &ShadowMap[V]; + if (*ShadowPtr) + return *ShadowPtr; + Function *F = A->getParent(); + IRBuilder<> EntryIRB(ActualFnStart->getFirstNonPHI()); + unsigned ArgOffset = 0; + const DataLayout &DL = F->getParent()->getDataLayout(); + for (auto &FArg : F->args()) { + if (!FArg.getType()->isSized()) { + LLVM_DEBUG(dbgs() << "Arg is not sized\n"); + continue; + } + unsigned Size = + FArg.hasByValAttr() + ? DL.getTypeAllocSize(FArg.getType()->getPointerElementType()) + : DL.getTypeAllocSize(FArg.getType()); + if (A == &FArg) { + bool Overflow = ArgOffset + Size > kParamTLSSize; + Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); + if (FArg.hasByValAttr()) { + // ByVal pointer itself has clean shadow. We copy the actual + // argument shadow to the underlying memory. + // Figure out maximal valid memcpy alignment. + unsigned ArgAlign = FArg.getParamAlignment(); + if (ArgAlign == 0) { + Type *EltType = A->getType()->getPointerElementType(); + ArgAlign = DL.getABITypeAlignment(EltType); + } + Value *CpShadowPtr = + getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign, + /*isStore*/ true) + .first; + // TODO(glider): need to copy origins. + if (Overflow) { + // ParamTLS overflow. + EntryIRB.CreateMemSet( + CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()), + Size, ArgAlign); + } else { + unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); + Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base, + CopyAlign, Size); + LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); + (void)Cpy; + } + *ShadowPtr = getCleanShadow(V); + } else { + if (Overflow) { + // ParamTLS overflow. + *ShadowPtr = getCleanShadow(V); + } else { + *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, + kShadowTLSAlignment); + } + } + LLVM_DEBUG(dbgs() + << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n"); + if (MS.TrackOrigins && !Overflow) { + Value *OriginPtr = + getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); + setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr)); + } else { + setOrigin(A, getCleanOrigin()); + } + } + ArgOffset += alignTo(Size, kShadowTLSAlignment); + } + assert(*ShadowPtr && "Could not find shadow for an argument"); + return *ShadowPtr; + } + // For everything else the shadow is zero. + return getCleanShadow(V); + } + + /// Get the shadow for i-th argument of the instruction I. + Value *getShadow(Instruction *I, int i) { + return getShadow(I->getOperand(i)); + } + + /// Get the origin for a value. + Value *getOrigin(Value *V) { + if (!MS.TrackOrigins) return nullptr; + if (!PropagateShadow) return getCleanOrigin(); + if (isa<Constant>(V)) return getCleanOrigin(); + assert((isa<Instruction>(V) || isa<Argument>(V)) && + "Unexpected value type in getOrigin()"); + if (Instruction *I = dyn_cast<Instruction>(V)) { + if (I->getMetadata("nosanitize")) + return getCleanOrigin(); + } + Value *Origin = OriginMap[V]; + assert(Origin && "Missing origin"); + return Origin; + } + + /// Get the origin for i-th argument of the instruction I. + Value *getOrigin(Instruction *I, int i) { + return getOrigin(I->getOperand(i)); + } + + /// Remember the place where a shadow check should be inserted. + /// + /// This location will be later instrumented with a check that will print a + /// UMR warning in runtime if the shadow value is not 0. 
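+  ///
+  /// The materialized check is roughly (illustrative IR):
+  ///   %_mscmp = icmp ne i32 %shadow, 0
+  ///   br i1 %_mscmp, label %warn, label %cont
+  /// where the %warn block calls __msan_warning_noreturn() (or
+  /// __msan_warning() when recovery is enabled).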
+ void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) { + assert(Shadow); + if (!InsertChecks) return; +#ifndef NDEBUG + Type *ShadowTy = Shadow->getType(); + assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) && + "Can only insert checks for integer and vector shadow types"); +#endif + InstrumentationList.push_back( + ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); + } + + /// Remember the place where a shadow check should be inserted. + /// + /// This location will be later instrumented with a check that will print a + /// UMR warning in runtime if the value is not fully defined. + void insertShadowCheck(Value *Val, Instruction *OrigIns) { + assert(Val); + Value *Shadow, *Origin; + if (ClCheckConstantShadow) { + Shadow = getShadow(Val); + if (!Shadow) return; + Origin = getOrigin(Val); + } else { + Shadow = dyn_cast_or_null<Instruction>(getShadow(Val)); + if (!Shadow) return; + Origin = dyn_cast_or_null<Instruction>(getOrigin(Val)); + } + insertShadowCheck(Shadow, Origin, OrigIns); + } + + AtomicOrdering addReleaseOrdering(AtomicOrdering a) { + switch (a) { + case AtomicOrdering::NotAtomic: + return AtomicOrdering::NotAtomic; + case AtomicOrdering::Unordered: + case AtomicOrdering::Monotonic: + case AtomicOrdering::Release: + return AtomicOrdering::Release; + case AtomicOrdering::Acquire: + case AtomicOrdering::AcquireRelease: + return AtomicOrdering::AcquireRelease; + case AtomicOrdering::SequentiallyConsistent: + return AtomicOrdering::SequentiallyConsistent; + } + llvm_unreachable("Unknown ordering"); + } + + AtomicOrdering addAcquireOrdering(AtomicOrdering a) { + switch (a) { + case AtomicOrdering::NotAtomic: + return AtomicOrdering::NotAtomic; + case AtomicOrdering::Unordered: + case AtomicOrdering::Monotonic: + case AtomicOrdering::Acquire: + return AtomicOrdering::Acquire; + case AtomicOrdering::Release: + case AtomicOrdering::AcquireRelease: + return AtomicOrdering::AcquireRelease; + case AtomicOrdering::SequentiallyConsistent: + return AtomicOrdering::SequentiallyConsistent; + } + llvm_unreachable("Unknown ordering"); + } + + // ------------------- Visitors. + using InstVisitor<MemorySanitizerVisitor>::visit; + void visit(Instruction &I) { + if (!I.getMetadata("nosanitize")) + InstVisitor<MemorySanitizerVisitor>::visit(I); + } + + /// Instrument LoadInst + /// + /// Loads the corresponding shadow and (optionally) origin. + /// Optionally, checks that the load address is fully defined. 
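+  ///
+  /// For %x = load i32, i32* %p on x86_64 Linux this emits roughly the
+  /// following IR (illustrative, default mapping):
+  ///   %0 = ptrtoint i32* %p to i64
+  ///   %1 = xor i64 %0, 87960930222080    ; 0x500000000000
+  ///   %2 = inttoptr i64 %1 to i32*
+  ///   %_msld = load i32, i32* %2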
+ void visitLoadInst(LoadInst &I) { + assert(I.getType()->isSized() && "Load type must have size"); + assert(!I.getMetadata("nosanitize")); + IRBuilder<> IRB(I.getNextNode()); + Type *ShadowTy = getShadowTy(&I); + Value *Addr = I.getPointerOperand(); + Value *ShadowPtr, *OriginPtr; + unsigned Alignment = I.getAlignment(); + if (PropagateShadow) { + std::tie(ShadowPtr, OriginPtr) = + getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); + setShadow(&I, + IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld")); + } else { + setShadow(&I, getCleanShadow(&I)); + } + + if (ClCheckAccessAddress) + insertShadowCheck(I.getPointerOperand(), &I); + + if (I.isAtomic()) + I.setOrdering(addAcquireOrdering(I.getOrdering())); + + if (MS.TrackOrigins) { + if (PropagateShadow) { + unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + setOrigin( + &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment)); + } else { + setOrigin(&I, getCleanOrigin()); + } + } + } + + /// Instrument StoreInst + /// + /// Stores the corresponding shadow and (optionally) origin. + /// Optionally, checks that the store address is fully defined. + void visitStoreInst(StoreInst &I) { + StoreList.push_back(&I); + if (ClCheckAccessAddress) + insertShadowCheck(I.getPointerOperand(), &I); + } + + void handleCASOrRMW(Instruction &I) { + assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I)); + + IRBuilder<> IRB(&I); + Value *Addr = I.getOperand(0); + Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), + /*Alignment*/ 1, /*isStore*/ true) + .first; + + if (ClCheckAccessAddress) + insertShadowCheck(Addr, &I); + + // Only test the conditional argument of cmpxchg instruction. + // The other argument can potentially be uninitialized, but we can not + // detect this situation reliably without possible false positives. + if (isa<AtomicCmpXchgInst>(I)) + insertShadowCheck(I.getOperand(1), &I); + + IRB.CreateStore(getCleanShadow(&I), ShadowPtr); + + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + + void visitAtomicRMWInst(AtomicRMWInst &I) { + handleCASOrRMW(I); + I.setOrdering(addReleaseOrdering(I.getOrdering())); + } + + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + handleCASOrRMW(I); + I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering())); + } + + // Vector manipulation. + void visitExtractElementInst(ExtractElementInst &I) { + insertShadowCheck(I.getOperand(1), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1), + "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitInsertElementInst(InsertElementInst &I) { + insertShadowCheck(I.getOperand(2), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1), + I.getOperand(2), "_msprop")); + setOriginForNaryOp(I); + } + + void visitShuffleVectorInst(ShuffleVectorInst &I) { + insertShadowCheck(I.getOperand(2), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1), + I.getOperand(2), "_msprop")); + setOriginForNaryOp(I); + } + + // Casts. 
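+  // Shadow is propagated through a cast by applying the same cast to the
+  // shadow value, e.g. (illustrative) for sext i8 %v to i32 an i8 shadow of
+  // 0xFF sign-extends to 0xFFFFFFFF, so every copy of the sign bit is
+  // likewise marked uninitialized.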
+  void visitSExtInst(SExtInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitZExtInst(ZExtInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitTruncInst(TruncInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitBitCastInst(BitCastInst &I) {
+    // Special case: if this is the bitcast (there is exactly 1 allowed)
+    // between a musttail call and a ret, don't instrument. New instructions
+    // are not allowed after a musttail call.
+    if (auto *CI = dyn_cast<CallInst>(I.getOperand(0)))
+      if (CI->isMustTailCall())
+        return;
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitPtrToIntInst(PtrToIntInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+                                    "_msprop_ptrtoint"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitIntToPtrInst(IntToPtrInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+                                    "_msprop_inttoptr"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
+  void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
+  void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
+
+  /// Propagate shadow for bitwise AND.
+  ///
+  /// This code is exact, i.e. if, for example, a bit in the left argument
+  /// is defined and 0, then neither the value nor the definedness of the
+  /// corresponding bit in B affects the resulting shadow.
+  void visitAnd(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    // "And" of 0 and a poisoned value results in unpoisoned value.
+    // 1&1 => 1;  0&1 => 0;  p&1 => p;
+    // 1&0 => 0;  0&0 => 0;  p&0 => 0;
+    // 1&p => p;  0&p => 0;  p&p => p;
+    // S = (S1 & S2) | (V1 & S2) | (S1 & V2)
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *V1 = I.getOperand(0);
+    Value *V2 = I.getOperand(1);
+    if (V1->getType() != S1->getType()) {
+      V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+      V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+    }
+    Value *S1S2 = IRB.CreateAnd(S1, S2);
+    Value *V1S2 = IRB.CreateAnd(V1, S2);
+    Value *S1V2 = IRB.CreateAnd(S1, V2);
+    setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
+    setOriginForNaryOp(I);
+  }
+
+  void visitOr(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    // "Or" of 1 and a poisoned value results in unpoisoned value.
+    // 1|1 => 1;  0|1 => 1;  p|1 => 1;
+    // 1|0 => 1;  0|0 => 0;  p|0 => p;
+    // 1|p => 1;  0|p => p;  p|p => p;
+    // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *V1 = IRB.CreateNot(I.getOperand(0));
+    Value *V2 = IRB.CreateNot(I.getOperand(1));
+    if (V1->getType() != S1->getType()) {
+      V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+      V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+    }
+    Value *S1S2 = IRB.CreateAnd(S1, S2);
+    Value *V1S2 = IRB.CreateAnd(V1, S2);
+    Value *S1V2 = IRB.CreateAnd(S1, V2);
+    setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
+    setOriginForNaryOp(I);
+  }
+
+  /// Default propagation of shadow and/or origin.
+  ///
+  /// This class implements the general case of shadow propagation, used in all
+  /// cases where we don't know and/or don't care about what the operation
+  /// actually does. It converts all input shadow values to a common type
+  /// (extending or truncating as necessary), and bitwise OR's them.
+  ///
+  /// This is much cheaper than inserting checks (i.e. requiring inputs to be
+  /// fully initialized), and less prone to false positives.
+  ///
+  /// This class also implements the general case of origin propagation. For a
+  /// Nary operation, result origin is set to the origin of an argument that is
+  /// not entirely initialized. If there is more than one such argument, the
+  /// rightmost of them is picked. It does not matter which one is picked if
+  /// all arguments are initialized.
+  template <bool CombineShadow>
+  class Combiner {
+    Value *Shadow = nullptr;
+    Value *Origin = nullptr;
+    IRBuilder<> &IRB;
+    MemorySanitizerVisitor *MSV;
+
+  public:
+    Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB)
+        : IRB(IRB), MSV(MSV) {}
+
+    /// Add a pair of shadow and origin values to the mix.
+    Combiner &Add(Value *OpShadow, Value *OpOrigin) {
+      if (CombineShadow) {
+        assert(OpShadow);
+        if (!Shadow)
+          Shadow = OpShadow;
+        else {
+          OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
+          Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
+        }
+      }
+
+      if (MSV->MS.TrackOrigins) {
+        assert(OpOrigin);
+        if (!Origin) {
+          Origin = OpOrigin;
+        } else {
+          Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin);
+          // No point in adding something that might result in 0 origin value.
+          if (!ConstOrigin || !ConstOrigin->isNullValue()) {
+            Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB);
+            Value *Cond =
+                IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow));
+            Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
+          }
+        }
+      }
+      return *this;
+    }
+
+    /// Add an application value to the mix.
+    Combiner &Add(Value *V) {
+      Value *OpShadow = MSV->getShadow(V);
+      Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr;
+      return Add(OpShadow, OpOrigin);
+    }
+
+    /// Set the current combined values as the given instruction's shadow
+    /// and origin.
+    void Done(Instruction *I) {
+      if (CombineShadow) {
+        assert(Shadow);
+        Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I));
+        MSV->setShadow(I, Shadow);
+      }
+      if (MSV->MS.TrackOrigins) {
+        assert(Origin);
+        MSV->setOrigin(I, Origin);
+      }
+    }
+  };
+
+  using ShadowAndOriginCombiner = Combiner<true>;
+  using OriginCombiner = Combiner<false>;
+
+  /// Propagate origin for arbitrary operation.
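+  ///
+  /// E.g. (illustrative) for %c = add i32 %a, %b where only %b has non-zero
+  /// shadow, the combiner above selects %b's origin; when both operands are
+  /// poisoned, the rightmost one (%b) wins, as described above.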
+  void setOriginForNaryOp(Instruction &I) {
+    if (!MS.TrackOrigins) return;
+    IRBuilder<> IRB(&I);
+    OriginCombiner OC(this, IRB);
+    for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+      OC.Add(OI->get());
+    OC.Done(&I);
+  }
+
+  size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
+    assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
+           "Vector of pointers is not a valid shadow type");
+    return Ty->isVectorTy() ?
+      Ty->getVectorNumElements() * Ty->getScalarSizeInBits() :
+      Ty->getPrimitiveSizeInBits();
+  }
+
+  /// Cast between two shadow types, extending or truncating as
+  /// necessary.
+  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+                          bool Signed = false) {
+    Type *srcTy = V->getType();
+    size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+    size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+    if (srcSizeInBits > 1 && dstSizeInBits == 1)
+      return IRB.CreateICmpNE(V, getCleanShadow(V));
+
+    if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
+      return IRB.CreateIntCast(V, dstTy, Signed);
+    if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
+        dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
+      return IRB.CreateIntCast(V, dstTy, Signed);
+    Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
+    Value *V2 =
+        IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
+    return IRB.CreateBitCast(V2, dstTy);
+    // TODO: handle struct types.
+  }
+
+  /// Cast an application value to the type of its own shadow.
+  Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) {
+    Type *ShadowTy = getShadowTy(V);
+    if (V->getType() == ShadowTy)
+      return V;
+    if (V->getType()->isPtrOrPtrVectorTy())
+      return IRB.CreatePtrToInt(V, ShadowTy);
+    else
+      return IRB.CreateBitCast(V, ShadowTy);
+  }
+
+  /// Propagate shadow for arbitrary operation.
+  void handleShadowOr(Instruction &I) {
+    IRBuilder<> IRB(&I);
+    ShadowAndOriginCombiner SC(this, IRB);
+    for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+      SC.Add(OI->get());
+    SC.Done(&I);
+  }
+
+  void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }
+
+  // Handle multiplication by constant.
+  //
+  // Handle a special case of multiplication by constant that may have one or
+  // more zeros in the lower bits. This makes the corresponding number of lower
+  // bits of the result zero as well. We model it by shifting the other operand
+  // shadow left by the required number of bits. Effectively, we transform
+  // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B).
+  // We use multiplication by 2**N instead of shift to cover the case of
+  // multiplication by 0, which may occur in some elements of a vector operand.
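+  //
+  // Illustrative example (added commentary): for X * 24, 24 == 3 * 2**3 has
+  // three trailing zero bits, so ShadowMul becomes 2**3 == 8 and the result
+  // shadow is Sx * 8 == Sx << 3. This is sound because the low 3 bits of
+  // X * 24 are always zero, hence always initialized, whatever Sx is.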
+  void handleMulByConstant(BinaryOperator &I, Constant *ConstArg,
+                           Value *OtherArg) {
+    Constant *ShadowMul;
+    Type *Ty = ConstArg->getType();
+    if (Ty->isVectorTy()) {
+      unsigned NumElements = Ty->getVectorNumElements();
+      Type *EltTy = Ty->getSequentialElementType();
+      SmallVector<Constant *, 16> Elements;
+      for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+        if (ConstantInt *Elt =
+                dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) {
+          const APInt &V = Elt->getValue();
+          APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+          Elements.push_back(ConstantInt::get(EltTy, V2));
+        } else {
+          Elements.push_back(ConstantInt::get(EltTy, 1));
+        }
+      }
+      ShadowMul = ConstantVector::get(Elements);
+    } else {
+      if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) {
+        const APInt &V = Elt->getValue();
+        APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+        ShadowMul = ConstantInt::get(Ty, V2);
+      } else {
+        ShadowMul = ConstantInt::get(Ty, 1);
+      }
+    }
+
+    IRBuilder<> IRB(&I);
+    setShadow(&I,
+              IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst"));
+    setOrigin(&I, getOrigin(OtherArg));
+  }
+
+  void visitMul(BinaryOperator &I) {
+    Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+    Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+    if (constOp0 && !constOp1)
+      handleMulByConstant(I, constOp0, I.getOperand(1));
+    else if (constOp1 && !constOp0)
+      handleMulByConstant(I, constOp1, I.getOperand(0));
+    else
+      handleShadowOr(I);
+  }
+
+  void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
+  void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
+  void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
+  void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
+  void visitSub(BinaryOperator &I) { handleShadowOr(I); }
+  void visitXor(BinaryOperator &I) { handleShadowOr(I); }
+
+  void handleIntegerDiv(Instruction &I) {
+    IRBuilder<> IRB(&I);
+    // Strict on the second argument.
+    insertShadowCheck(I.getOperand(1), &I);
+    setShadow(&I, getShadow(&I, 0));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitUDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+  void visitSDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+  void visitURem(BinaryOperator &I) { handleIntegerDiv(I); }
+  void visitSRem(BinaryOperator &I) { handleIntegerDiv(I); }
+
+  // Floating point division is side-effect free. We cannot require that the
+  // divisor is fully initialized, so we must propagate shadow. See PR37523.
+  void visitFDiv(BinaryOperator &I) { handleShadowOr(I); }
+  void visitFRem(BinaryOperator &I) { handleShadowOr(I); }
+
+  /// Instrument == and != comparisons.
+  ///
+  /// Sometimes the comparison result is known even if some of the bits of the
+  /// arguments are not.
+  void handleEqualityComparison(ICmpInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *A = I.getOperand(0);
+    Value *B = I.getOperand(1);
+    Value *Sa = getShadow(A);
+    Value *Sb = getShadow(B);
+
+    // Get rid of pointers and vectors of pointers.
+    // For ints (and vectors of ints), types of A and Sa match,
+    // and this is a no-op.
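+    // Illustrative example (added commentary): let A = 0b10?? ('?' bits
+    // poisoned, so Sa = 0b0011) and B = 0b0100 (fully initialized). Then
+    // C = A ^ B has a defined 1 in bit 3, so the formula below proves the
+    // result of A == B is fully defined despite A's poisoned bits.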
+    A = IRB.CreatePointerCast(A, Sa->getType());
+    B = IRB.CreatePointerCast(B, Sb->getType());
+
+    // A == B  <==>  (C = A^B) == 0
+    // A != B  <==>  (C = A^B) != 0
+    // Sc = Sa | Sb
+    Value *C = IRB.CreateXor(A, B);
+    Value *Sc = IRB.CreateOr(Sa, Sb);
+    // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
+    // Result is defined if one of the following is true
+    // * there is a defined 1 bit in C
+    // * C is fully defined
+    // Si = !(C & ~Sc) && Sc
+    Value *Zero = Constant::getNullValue(Sc->getType());
+    Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+    Value *Si =
+        IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
+                      IRB.CreateICmpEQ(
+                          IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
+    Si->setName("_msprop_icmp");
+    setShadow(&I, Si);
+    setOriginForNaryOp(I);
+  }
+
+  /// Build the lowest possible value of V, taking into account V's
+  /// uninitialized bits.
+  Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+                                bool isSigned) {
+    if (isSigned) {
+      // Split shadow into sign bit and other bits.
+      Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+      Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+      // Maximize the undefined shadow bit, minimize other undefined bits.
+      return
+          IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
+    } else {
+      // Minimize undefined bits.
+      return IRB.CreateAnd(A, IRB.CreateNot(Sa));
+    }
+  }
+
+  /// Build the highest possible value of V, taking into account V's
+  /// uninitialized bits.
+  Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+                                 bool isSigned) {
+    if (isSigned) {
+      // Split shadow into sign bit and other bits.
+      Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+      Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+      // Minimize the undefined shadow bit, maximize other undefined bits.
+      return
+          IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
+    } else {
+      // Maximize undefined bits.
+      return IRB.CreateOr(A, Sa);
+    }
+  }
+
+  /// Instrument relational comparisons.
+  ///
+  /// This function does exact shadow propagation for all relational
+  /// comparisons of integers, pointers and vectors of those.
+  /// FIXME: output seems suboptimal when one of the operands is a constant
+  void handleRelationalComparisonExact(ICmpInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *A = I.getOperand(0);
+    Value *B = I.getOperand(1);
+    Value *Sa = getShadow(A);
+    Value *Sb = getShadow(B);
+
+    // Get rid of pointers and vectors of pointers.
+    // For ints (and vectors of ints), types of A and Sa match,
+    // and this is a no-op.
+    A = IRB.CreatePointerCast(A, Sa->getType());
+    B = IRB.CreatePointerCast(B, Sb->getType());
+
+    // Let [a0, a1] be the interval of possible values of A, taking into account
+    // its undefined bits. Let [b0, b1] be the interval of possible values of B.
+    // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
+    bool IsSigned = I.isSigned();
+    Value *S1 = IRB.CreateICmp(I.getPredicate(),
+                               getLowestPossibleValue(IRB, A, Sa, IsSigned),
+                               getHighestPossibleValue(IRB, B, Sb, IsSigned));
+    Value *S2 = IRB.CreateICmp(I.getPredicate(),
+                               getHighestPossibleValue(IRB, A, Sa, IsSigned),
+                               getLowestPossibleValue(IRB, B, Sb, IsSigned));
+    Value *Si = IRB.CreateXor(S1, S2);
+    setShadow(&I, Si);
+    setOriginForNaryOp(I);
+  }
+
+  /// Instrument signed relational comparisons.
+  ///
+  /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
+  /// bit of the shadow. Everything else is delegated to handleShadowOr().
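+  ///
+  /// Illustrative example (added commentary): for x < 0 the result equals the
+  /// sign bit of x, so its shadow is exactly the sign bit of Sx; the code
+  /// below computes it as (Sx <s 0), i.e. an icmp slt on the shadow itself.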
+  void handleSignedRelationalComparison(ICmpInst &I) {
+    Constant *constOp;
+    Value *op = nullptr;
+    CmpInst::Predicate pre;
+    if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) {
+      op = I.getOperand(0);
+      pre = I.getPredicate();
+    } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) {
+      op = I.getOperand(1);
+      pre = I.getSwappedPredicate();
+    } else {
+      handleShadowOr(I);
+      return;
+    }
+
+    if ((constOp->isNullValue() &&
+         (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) ||
+        (constOp->isAllOnesValue() &&
+         (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) {
+      IRBuilder<> IRB(&I);
+      Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op),
+                                        "_msprop_icmp_s");
+      setShadow(&I, Shadow);
+      setOrigin(&I, getOrigin(op));
+    } else {
+      handleShadowOr(I);
+    }
+  }
+
+  void visitICmpInst(ICmpInst &I) {
+    if (!ClHandleICmp) {
+      handleShadowOr(I);
+      return;
+    }
+    if (I.isEquality()) {
+      handleEqualityComparison(I);
+      return;
+    }
+
+    assert(I.isRelational());
+    if (ClHandleICmpExact) {
+      handleRelationalComparisonExact(I);
+      return;
+    }
+    if (I.isSigned()) {
+      handleSignedRelationalComparison(I);
+      return;
+    }
+
+    assert(I.isUnsigned());
+    if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
+      handleRelationalComparisonExact(I);
+      return;
+    }
+
+    handleShadowOr(I);
+  }
+
+  void visitFCmpInst(FCmpInst &I) {
+    handleShadowOr(I);
+  }
+
+  void handleShift(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    // If any of the S2 bits are poisoned, the whole thing is poisoned.
+    // Otherwise perform the same shift on S1.
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
+                                   S2->getType());
+    Value *V2 = I.getOperand(1);
+    Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
+    setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+    setOriginForNaryOp(I);
+  }
+
+  void visitShl(BinaryOperator &I) { handleShift(I); }
+  void visitAShr(BinaryOperator &I) { handleShift(I); }
+  void visitLShr(BinaryOperator &I) { handleShift(I); }
+
+  /// Instrument llvm.memmove
+  ///
+  /// At this point we don't know if llvm.memmove will be inlined or not.
+  /// If we don't instrument it and it gets inlined,
+  /// our interceptor will not kick in and we will lose the memmove.
+  /// If we instrument the call here, but it does not get inlined,
+  /// we will memmove the shadow twice, which is bad in the case
+  /// of overlapping regions. So, we simply lower the intrinsic to a call.
+  ///
+  /// A similar situation exists for memcpy and memset.
+  void visitMemMoveInst(MemMoveInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall(
+        MS.MemmoveFn,
+        {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+    I.eraseFromParent();
+  }
+
+  // Similar to memmove: avoid copying shadow twice.
+  // This is somewhat unfortunate as it may slow down small constant memcpys.
+  // FIXME: consider doing manual inline for small constant sizes and proper
+  // alignment.
+  void visitMemCpyInst(MemCpyInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall(
+        MS.MemcpyFn,
+        {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+    I.eraseFromParent();
+  }
+
+  // Same as memcpy.
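+  //
+  // Illustrative lowering (added commentary): assuming the usual runtime
+  // wrapper name, a memset such as
+  //   call void @llvm.memset.p0i8.i64(i8* %p, i8 %v, i64 %n, i1 false)
+  // is rewritten below into
+  //   call i8* @__msan_memset(i8* %p, i32 %v, i64 %n)
+  // which the MSan runtime implements with matching shadow updates.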
+  void visitMemSetInst(MemSetInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall(
+        MS.MemsetFn,
+        {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
+         IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+    I.eraseFromParent();
+  }
+
+  void visitVAStartInst(VAStartInst &I) {
+    VAHelper->visitVAStartInst(I);
+  }
+
+  void visitVACopyInst(VACopyInst &I) {
+    VAHelper->visitVACopyInst(I);
+  }
+
+  /// Handle vector store-like intrinsics.
+  ///
+  /// Instrument intrinsics that look like a simple SIMD store: writes memory,
+  /// has 1 pointer argument and 1 vector argument, returns void.
+  bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value* Addr = I.getArgOperand(0);
+    Value *Shadow = getShadow(&I, 1);
+    Value *ShadowPtr, *OriginPtr;
+
+    // We don't know the pointer alignment (could be unaligned SSE store!).
+    // Have to assume the worst case.
+    std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+        Addr, IRB, Shadow->getType(), /*Alignment*/ 1, /*isStore*/ true);
+    IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
+
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+
+    // FIXME: factor out common code from materializeStores
+    if (MS.TrackOrigins) IRB.CreateStore(getOrigin(&I, 1), OriginPtr);
+    return true;
+  }
+
+  /// Handle vector load-like intrinsics.
+  ///
+  /// Instrument intrinsics that look like a simple SIMD load: reads memory,
+  /// has 1 pointer argument, returns a vector.
+  bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Addr = I.getArgOperand(0);
+
+    Type *ShadowTy = getShadowTy(&I);
+    Value *ShadowPtr, *OriginPtr;
+    if (PropagateShadow) {
+      // We don't know the pointer alignment (could be unaligned SSE load!).
+      // Have to assume the worst case.
+      unsigned Alignment = 1;
+      std::tie(ShadowPtr, OriginPtr) =
+          getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
+      setShadow(&I,
+                IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+    }
+
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+
+    if (MS.TrackOrigins) {
+      if (PropagateShadow)
+        setOrigin(&I, IRB.CreateLoad(MS.OriginTy, OriginPtr));
+      else
+        setOrigin(&I, getCleanOrigin());
+    }
+    return true;
+  }
+
+  /// Handle (SIMD arithmetic)-like intrinsics.
+  ///
+  /// Instrument intrinsics with any number of arguments of the same type,
+  /// equal to the return type. The type should be simple (no aggregates or
+  /// pointers; vectors are fine).
+  /// Caller guarantees that this intrinsic does not access memory.
+  bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
+    Type *RetTy = I.getType();
+    if (!(RetTy->isIntOrIntVectorTy() ||
+          RetTy->isFPOrFPVectorTy() ||
+          RetTy->isX86_MMXTy()))
+      return false;
+
+    unsigned NumArgOperands = I.getNumArgOperands();
+
+    for (unsigned i = 0; i < NumArgOperands; ++i) {
+      Type *Ty = I.getArgOperand(i)->getType();
+      if (Ty != RetTy)
+        return false;
+    }
+
+    IRBuilder<> IRB(&I);
+    ShadowAndOriginCombiner SC(this, IRB);
+    for (unsigned i = 0; i < NumArgOperands; ++i)
+      SC.Add(I.getArgOperand(i));
+    SC.Done(&I);
+
+    return true;
+  }
+
+  /// Heuristically instrument unknown intrinsics.
+  ///
+  /// The main purpose of this code is to do something reasonable with all
+  /// random intrinsics we might encounter, most importantly - SIMD intrinsics.
+  /// We recognize several classes of intrinsics by their argument types and
+  /// ModRefBehaviour and apply special instrumentation when we are reasonably
+  /// sure that we know what the intrinsic does.
+  ///
+  /// We special-case intrinsics where this approach fails. See llvm.bswap
+  /// handling as an example of that.
+  bool handleUnknownIntrinsic(IntrinsicInst &I) {
+    unsigned NumArgOperands = I.getNumArgOperands();
+    if (NumArgOperands == 0)
+      return false;
+
+    if (NumArgOperands == 2 &&
+        I.getArgOperand(0)->getType()->isPointerTy() &&
+        I.getArgOperand(1)->getType()->isVectorTy() &&
+        I.getType()->isVoidTy() &&
+        !I.onlyReadsMemory()) {
+      // This looks like a vector store.
+      return handleVectorStoreIntrinsic(I);
+    }
+
+    if (NumArgOperands == 1 &&
+        I.getArgOperand(0)->getType()->isPointerTy() &&
+        I.getType()->isVectorTy() &&
+        I.onlyReadsMemory()) {
+      // This looks like a vector load.
+      return handleVectorLoadIntrinsic(I);
+    }
+
+    if (I.doesNotAccessMemory())
+      if (maybeHandleSimpleNomemIntrinsic(I))
+        return true;
+
+    // FIXME: detect and handle SSE maskstore/maskload
+    return false;
+  }
+
+  void handleInvariantGroup(IntrinsicInst &I) {
+    setShadow(&I, getShadow(&I, 0));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void handleLifetimeStart(IntrinsicInst &I) {
+    if (!PoisonStack)
+      return;
+    DenseMap<Value *, AllocaInst *> AllocaForValue;
+    AllocaInst *AI =
+        llvm::findAllocaForValue(I.getArgOperand(1), AllocaForValue);
+    if (!AI)
+      InstrumentLifetimeStart = false;
+    LifetimeStartList.push_back(std::make_pair(&I, AI));
+  }
+
+  void handleBswap(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Op = I.getArgOperand(0);
+    Type *OpType = Op->getType();
+    Function *BswapFunc = Intrinsic::getDeclaration(
+        F.getParent(), Intrinsic::bswap, makeArrayRef(&OpType, 1));
+    setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+    setOrigin(&I, getOrigin(Op));
+  }
+
+  // Instrument vector convert intrinsic.
+  //
+  // This function instruments intrinsics like cvtsi2ss:
+  // %Out = int_xxx_cvtyyy(%ConvertOp)
+  // or
+  // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+  // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
+  // number of \p Out elements, and (if it has 2 arguments) copies the rest of
+  // the elements from \p CopyOp.
+  // In most cases conversion involves a floating-point value which may trigger
+  // a hardware exception when not fully initialized. For this reason we
+  // require \p ConvertOp[0:NumUsedElements] to be fully initialized and trap
+  // otherwise.
+  // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+  // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+  // return a fully initialized value.
+  void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements) {
+    IRBuilder<> IRB(&I);
+    Value *CopyOp, *ConvertOp;
+
+    switch (I.getNumArgOperands()) {
+    case 3:
+      assert(isa<ConstantInt>(I.getArgOperand(2)) && "Invalid rounding mode");
+      LLVM_FALLTHROUGH;
+    case 2:
+      CopyOp = I.getArgOperand(0);
+      ConvertOp = I.getArgOperand(1);
+      break;
+    case 1:
+      ConvertOp = I.getArgOperand(0);
+      CopyOp = nullptr;
+      break;
+    default:
+      llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+    }
+
+    // The first *NumUsedElements* elements of ConvertOp are converted to the
+    // same number of output elements. The rest of the output is copied from
+    // CopyOp, or (if not available) filled with zeroes.
+    // Combine shadow for elements of ConvertOp that are used in this operation,
+    // and insert a check.
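+    // Illustrative example (added commentary): for
+    //   %r = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v)
+    // NumUsedElements == 1, so only element 0 of %v's shadow is checked, and
+    // since there is no CopyOp the i32 result shadow is set fully clean.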
+    // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+    // int->any conversion.
+    Value *ConvertShadow = getShadow(ConvertOp);
+    Value *AggShadow = nullptr;
+    if (ConvertOp->getType()->isVectorTy()) {
+      AggShadow = IRB.CreateExtractElement(
+          ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+      for (int i = 1; i < NumUsedElements; ++i) {
+        Value *MoreShadow = IRB.CreateExtractElement(
+            ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+        AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+      }
+    } else {
+      AggShadow = ConvertShadow;
+    }
+    assert(AggShadow->getType()->isIntegerTy());
+    insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+    // Build result shadow by zero-filling parts of CopyOp shadow that come from
+    // ConvertOp.
+    if (CopyOp) {
+      assert(CopyOp->getType() == I.getType());
+      assert(CopyOp->getType()->isVectorTy());
+      Value *ResultShadow = getShadow(CopyOp);
+      Type *EltTy = ResultShadow->getType()->getVectorElementType();
+      for (int i = 0; i < NumUsedElements; ++i) {
+        ResultShadow = IRB.CreateInsertElement(
+            ResultShadow, ConstantInt::getNullValue(EltTy),
+            ConstantInt::get(IRB.getInt32Ty(), i));
+      }
+      setShadow(&I, ResultShadow);
+      setOrigin(&I, getOrigin(CopyOp));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+      setOrigin(&I, getCleanOrigin());
+    }
+  }
+
+  // Given a scalar or vector, extract the lower 64 bits (or less), and return
+  // all zeroes if it is zero, and all ones otherwise.
+  Value *Lower64ShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+    if (S->getType()->isVectorTy())
+      S = CreateShadowCast(IRB, S, IRB.getInt64Ty(), /* Signed */ true);
+    assert(S->getType()->getPrimitiveSizeInBits() <= 64);
+    Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+    return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+  }
+
+  // Given a vector, extract its first element, and return all
+  // zeroes if it is zero, and all ones otherwise.
+  Value *LowerElementShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+    Value *S1 = IRB.CreateExtractElement(S, (uint64_t)0);
+    Value *S2 = IRB.CreateICmpNE(S1, getCleanShadow(S1));
+    return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+  }
+
+  Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) {
+    Type *T = S->getType();
+    assert(T->isVectorTy());
+    Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+    return IRB.CreateSExt(S2, T);
+  }
+
+  // Instrument vector shift intrinsic.
+  //
+  // This function instruments intrinsics like int_x86_avx2_psll_w.
+  // Intrinsic shifts %In by %ShiftSize bits.
+  // %ShiftSize may be a vector. In that case the lower 64 bits determine shift
+  // size, and the rest is ignored. Behavior is defined even if shift size is
+  // greater than register (or field) width.
+  void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
+    assert(I.getNumArgOperands() == 2);
+    IRBuilder<> IRB(&I);
+    // If any of the S2 bits are poisoned, the whole thing is poisoned.
+    // Otherwise perform the same shift on S1.
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *S2Conv = Variable ? VariableShadowExtend(IRB, S2)
+                             : Lower64ShadowExtend(IRB, S2, getShadowTy(&I));
+    Value *V1 = I.getOperand(0);
+    Value *V2 = I.getOperand(1);
+    Value *Shift = IRB.CreateCall(I.getFunctionType(), I.getCalledValue(),
+                                  {IRB.CreateBitCast(S1, V1->getType()), V2});
+    Shift = IRB.CreateBitCast(Shift, getShadowTy(&I));
+    setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+    setOriginForNaryOp(I);
+  }
+
+  // Get an X86_MMX-sized vector type.
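+  // For example (added commentary), getMMXVectorTy(16) yields <4 x i16> and
+  // getMMXVectorTy(32) yields <2 x i32>: the 64 MMX bits are split into
+  // EltSizeInBits-wide lanes.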
+  Type *getMMXVectorTy(unsigned EltSizeInBits) {
+    const unsigned X86_MMXSizeInBits = 64;
+    assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
+           "Illegal MMX vector element size");
+    return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
+                           X86_MMXSizeInBits / EltSizeInBits);
+  }
+
+  // Returns a signed counterpart for an (un)signed-saturate-and-pack
+  // intrinsic.
+  Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
+    switch (id) {
+    case Intrinsic::x86_sse2_packsswb_128:
+    case Intrinsic::x86_sse2_packuswb_128:
+      return Intrinsic::x86_sse2_packsswb_128;
+
+    case Intrinsic::x86_sse2_packssdw_128:
+    case Intrinsic::x86_sse41_packusdw:
+      return Intrinsic::x86_sse2_packssdw_128;
+
+    case Intrinsic::x86_avx2_packsswb:
+    case Intrinsic::x86_avx2_packuswb:
+      return Intrinsic::x86_avx2_packsswb;
+
+    case Intrinsic::x86_avx2_packssdw:
+    case Intrinsic::x86_avx2_packusdw:
+      return Intrinsic::x86_avx2_packssdw;
+
+    case Intrinsic::x86_mmx_packsswb:
+    case Intrinsic::x86_mmx_packuswb:
+      return Intrinsic::x86_mmx_packsswb;
+
+    case Intrinsic::x86_mmx_packssdw:
+      return Intrinsic::x86_mmx_packssdw;
+    default:
+      llvm_unreachable("unexpected intrinsic id");
+    }
+  }
+
+  // Instrument vector pack intrinsic.
+  //
+  // This function instruments intrinsics like x86_mmx_packsswb, which pack
+  // elements of 2 input vectors into half as many bits with saturation.
+  // Shadow is propagated with the signed variant of the same intrinsic applied
+  // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
+  // EltSizeInBits is used only for x86mmx arguments.
+  void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+    assert(I.getNumArgOperands() == 2);
+    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+    IRBuilder<> IRB(&I);
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    assert(isX86_MMX || S1->getType()->isVectorTy());
+
+    // SExt and ICmpNE below must apply to individual elements of input vectors.
+    // In case of x86mmx arguments, cast them to appropriate vector types and
+    // back.
+    Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
+    if (isX86_MMX) {
+      S1 = IRB.CreateBitCast(S1, T);
+      S2 = IRB.CreateBitCast(S2, T);
+    }
+    Value *S1_ext = IRB.CreateSExt(
+        IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T);
+    Value *S2_ext = IRB.CreateSExt(
+        IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T);
+    if (isX86_MMX) {
+      Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
+      S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
+      S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+    }
+
+    Function *ShadowFn = Intrinsic::getDeclaration(
+        F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
+
+    Value *S =
+        IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
+    if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I));
+    setShadow(&I, S);
+    setOriginForNaryOp(I);
+  }
+
+  // Instrument sum-of-absolute-differences intrinsic.
+  void handleVectorSadIntrinsic(IntrinsicInst &I) {
+    const unsigned SignificantBitsPerResultElement = 16;
+    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+    Type *ResTy = isX86_MMX ?
IntegerType::get(*MS.C, 64) : I.getType(); + unsigned ZeroBitsPerResultElement = + ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement; + + IRBuilder<> IRB(&I); + Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + S = IRB.CreateBitCast(S, ResTy); + S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), + ResTy); + S = IRB.CreateLShr(S, ZeroBitsPerResultElement); + S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + // Instrument multiply-add intrinsic. + void handleVectorPmaddIntrinsic(IntrinsicInst &I, + unsigned EltSizeInBits = 0) { + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType(); + IRBuilder<> IRB(&I); + Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + S = IRB.CreateBitCast(S, ResTy); + S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), + ResTy); + S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + // Instrument compare-packed intrinsic. + // Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or + // all-ones shadow. + void handleVectorComparePackedIntrinsic(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Type *ResTy = getShadowTy(&I); + Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + Value *S = IRB.CreateSExt( + IRB.CreateICmpNE(S0, Constant::getNullValue(ResTy)), ResTy); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + // Instrument compare-scalar intrinsic. + // This handles both cmp* intrinsics which return the result in the first + // element of a vector, and comi* which return the result as i32. + void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + Value *S = LowerElementShadowExtend(IRB, S0, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + void handleStmxcsr(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value* Addr = I.getArgOperand(0); + Type *Ty = IRB.getInt32Ty(); + Value *ShadowPtr = + getShadowOriginPtr(Addr, IRB, Ty, /*Alignment*/ 1, /*isStore*/ true) + .first; + + IRB.CreateStore(getCleanShadow(Ty), + IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo())); + + if (ClCheckAccessAddress) + insertShadowCheck(Addr, &I); + } + + void handleLdmxcsr(IntrinsicInst &I) { + if (!InsertChecks) return; + + IRBuilder<> IRB(&I); + Value *Addr = I.getArgOperand(0); + Type *Ty = IRB.getInt32Ty(); + unsigned Alignment = 1; + Value *ShadowPtr, *OriginPtr; + std::tie(ShadowPtr, OriginPtr) = + getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false); + + if (ClCheckAccessAddress) + insertShadowCheck(Addr, &I); + + Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr"); + Value *Origin = MS.TrackOrigins ? 
IRB.CreateLoad(MS.OriginTy, OriginPtr) + : getCleanOrigin(); + insertShadowCheck(Shadow, Origin, &I); + } + + void handleMaskedStore(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value *V = I.getArgOperand(0); + Value *Addr = I.getArgOperand(1); + unsigned Align = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); + Value *Mask = I.getArgOperand(3); + Value *Shadow = getShadow(V); + + Value *ShadowPtr; + Value *OriginPtr; + std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr( + Addr, IRB, Shadow->getType(), Align, /*isStore*/ true); + + if (ClCheckAccessAddress) { + insertShadowCheck(Addr, &I); + // Uninitialized mask is kind of like uninitialized address, but not as + // scary. + insertShadowCheck(Mask, &I); + } + + IRB.CreateMaskedStore(Shadow, ShadowPtr, Align, Mask); + + if (MS.TrackOrigins) { + auto &DL = F.getParent()->getDataLayout(); + paintOrigin(IRB, getOrigin(V), OriginPtr, + DL.getTypeStoreSize(Shadow->getType()), + std::max(Align, kMinOriginAlignment)); + } + } + + bool handleMaskedLoad(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value *Addr = I.getArgOperand(0); + unsigned Align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + Value *Mask = I.getArgOperand(2); + Value *PassThru = I.getArgOperand(3); + + Type *ShadowTy = getShadowTy(&I); + Value *ShadowPtr, *OriginPtr; + if (PropagateShadow) { + std::tie(ShadowPtr, OriginPtr) = + getShadowOriginPtr(Addr, IRB, ShadowTy, Align, /*isStore*/ false); + setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Align, Mask, + getShadow(PassThru), "_msmaskedld")); + } else { + setShadow(&I, getCleanShadow(&I)); + } + + if (ClCheckAccessAddress) { + insertShadowCheck(Addr, &I); + insertShadowCheck(Mask, &I); + } + + if (MS.TrackOrigins) { + if (PropagateShadow) { + // Choose between PassThru's and the loaded value's origins. + Value *MaskedPassThruShadow = IRB.CreateAnd( + getShadow(PassThru), IRB.CreateSExt(IRB.CreateNeg(Mask), ShadowTy)); + + Value *Acc = IRB.CreateExtractElement( + MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + for (int i = 1, N = PassThru->getType()->getVectorNumElements(); i < N; + ++i) { + Value *More = IRB.CreateExtractElement( + MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), i)); + Acc = IRB.CreateOr(Acc, More); + } + + Value *Origin = IRB.CreateSelect( + IRB.CreateICmpNE(Acc, Constant::getNullValue(Acc->getType())), + getOrigin(PassThru), IRB.CreateLoad(MS.OriginTy, OriginPtr)); + + setOrigin(&I, Origin); + } else { + setOrigin(&I, getCleanOrigin()); + } + } + return true; + } + + // Instrument BMI / BMI2 intrinsics. + // All of these intrinsics are Z = I(X, Y) + // where the types of all operands and the result match, and are either i32 or i64. + // The following instrumentation happens to work for all of them: + // Sz = I(Sx, Y) | (sext (Sy != 0)) + void handleBmiIntrinsic(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Type *ShadowTy = getShadowTy(&I); + + // If any bit of the mask operand is poisoned, then the whole thing is. + Value *SMask = getShadow(&I, 1); + SMask = IRB.CreateSExt(IRB.CreateICmpNE(SMask, getCleanShadow(ShadowTy)), + ShadowTy); + // Apply the same intrinsic to the shadow of the first operand. 
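+    // Illustrative example (added commentary): for Z = bzhi(X, Y) this emits
+    //   Sz = bzhi(Sx, Y) | sext(Sy != 0)
+    // so X's shadow undergoes the same bit selection, while any poison in Y
+    // poisons the entire result.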
+ Value *S = IRB.CreateCall(I.getCalledFunction(), + {getShadow(&I, 0), I.getOperand(1)}); + S = IRB.CreateOr(SMask, S); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + void visitIntrinsicInst(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::lifetime_start: + handleLifetimeStart(I); + break; + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + handleInvariantGroup(I); + break; + case Intrinsic::bswap: + handleBswap(I); + break; + case Intrinsic::masked_store: + handleMaskedStore(I); + break; + case Intrinsic::masked_load: + handleMaskedLoad(I); + break; + case Intrinsic::x86_sse_stmxcsr: + handleStmxcsr(I); + break; + case Intrinsic::x86_sse_ldmxcsr: + handleLdmxcsr(I); + break; + case Intrinsic::x86_avx512_vcvtsd2usi64: + case Intrinsic::x86_avx512_vcvtsd2usi32: + case Intrinsic::x86_avx512_vcvtss2usi64: + case Intrinsic::x86_avx512_vcvtss2usi32: + case Intrinsic::x86_avx512_cvttss2usi64: + case Intrinsic::x86_avx512_cvttss2usi: + case Intrinsic::x86_avx512_cvttsd2usi64: + case Intrinsic::x86_avx512_cvttsd2usi: + case Intrinsic::x86_avx512_cvtusi2ss: + case Intrinsic::x86_avx512_cvtusi642sd: + case Intrinsic::x86_avx512_cvtusi642ss: + case Intrinsic::x86_sse2_cvtsd2si64: + case Intrinsic::x86_sse2_cvtsd2si: + case Intrinsic::x86_sse2_cvtsd2ss: + case Intrinsic::x86_sse2_cvttsd2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse_cvtss2si64: + case Intrinsic::x86_sse_cvtss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse_cvttss2si: + handleVectorConvertIntrinsic(I, 1); + break; + case Intrinsic::x86_sse_cvtps2pi: + case Intrinsic::x86_sse_cvttps2pi: + handleVectorConvertIntrinsic(I, 2); + break; + + case Intrinsic::x86_avx512_psll_w_512: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psra_w_512: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psra_w: 
+ case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_mmx_psll_w: + case Intrinsic::x86_mmx_psll_d: + case Intrinsic::x86_mmx_psll_q: + case Intrinsic::x86_mmx_pslli_w: + case Intrinsic::x86_mmx_pslli_d: + case Intrinsic::x86_mmx_pslli_q: + case Intrinsic::x86_mmx_psrl_w: + case Intrinsic::x86_mmx_psrl_d: + case Intrinsic::x86_mmx_psrl_q: + case Intrinsic::x86_mmx_psra_w: + case Intrinsic::x86_mmx_psra_d: + case Intrinsic::x86_mmx_psrli_w: + case Intrinsic::x86_mmx_psrli_d: + case Intrinsic::x86_mmx_psrli_q: + case Intrinsic::x86_mmx_psrai_w: + case Intrinsic::x86_mmx_psrai_d: + handleVectorShiftIntrinsic(I, /* Variable */ false); + break; + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_q_512: + handleVectorShiftIntrinsic(I, /* Variable */ true); + break; + + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx2_packusdw: + handleVectorPackIntrinsic(I); + break; + + case Intrinsic::x86_mmx_packsswb: + case Intrinsic::x86_mmx_packuswb: + handleVectorPackIntrinsic(I, 16); + break; + + case Intrinsic::x86_mmx_packssdw: + handleVectorPackIntrinsic(I, 32); + break; + + case Intrinsic::x86_mmx_psad_bw: + case Intrinsic::x86_sse2_psad_bw: + case Intrinsic::x86_avx2_psad_bw: + handleVectorSadIntrinsic(I); + break; + + case Intrinsic::x86_sse2_pmadd_wd: + case Intrinsic::x86_avx2_pmadd_wd: + case Intrinsic::x86_ssse3_pmadd_ub_sw_128: + case Intrinsic::x86_avx2_pmadd_ub_sw: + handleVectorPmaddIntrinsic(I); + break; + + case Intrinsic::x86_ssse3_pmadd_ub_sw: + handleVectorPmaddIntrinsic(I, 8); + break; + + case Intrinsic::x86_mmx_pmadd_wd: + handleVectorPmaddIntrinsic(I, 16); + break; + + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse2_cmp_sd: + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case 
Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomineq_sd: + handleVectorCompareScalarIntrinsic(I); + break; + + case Intrinsic::x86_sse_cmp_ps: + case Intrinsic::x86_sse2_cmp_pd: + // FIXME: For x86_avx_cmp_pd_256 and x86_avx_cmp_ps_256 this function + // generates reasonably looking IR that fails in the backend with "Do not + // know how to split the result of this operator!". + handleVectorComparePackedIntrinsic(I); + break; + + case Intrinsic::x86_bmi_bextr_32: + case Intrinsic::x86_bmi_bextr_64: + case Intrinsic::x86_bmi_bzhi_32: + case Intrinsic::x86_bmi_bzhi_64: + case Intrinsic::x86_bmi_pdep_32: + case Intrinsic::x86_bmi_pdep_64: + case Intrinsic::x86_bmi_pext_32: + case Intrinsic::x86_bmi_pext_64: + handleBmiIntrinsic(I); + break; + + case Intrinsic::is_constant: + // The result of llvm.is.constant() is always defined. + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + break; + + default: + if (!handleUnknownIntrinsic(I)) + visitInstruction(I); + break; + } + } + + void visitCallSite(CallSite CS) { + Instruction &I = *CS.getInstruction(); + assert(!I.getMetadata("nosanitize")); + assert((CS.isCall() || CS.isInvoke() || CS.isCallBr()) && + "Unknown type of CallSite"); + if (CS.isCallBr() || (CS.isCall() && cast<CallInst>(&I)->isInlineAsm())) { + // For inline asm (either a call to asm function, or callbr instruction), + // do the usual thing: check argument shadow and mark all outputs as + // clean. Note that any side effects of the inline asm that are not + // immediately visible in its constraints are not handled. + if (ClHandleAsmConservative && MS.CompileKernel) + visitAsmInstruction(I); + else + visitInstruction(I); + return; + } + if (CS.isCall()) { + CallInst *Call = cast<CallInst>(&I); + assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere"); + + // We are going to insert code that relies on the fact that the callee + // will become a non-readonly function after it is instrumented by us. To + // prevent this code from being optimized out, mark that function + // non-readonly in advance. + if (Function *Func = Call->getCalledFunction()) { + // Clear out readonly/readnone attributes. + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::ReadNone); + Func->removeAttributes(AttributeList::FunctionIndex, B); + } + + maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI); + } + IRBuilder<> IRB(&I); + + unsigned ArgOffset = 0; + LLVM_DEBUG(dbgs() << " CallSite: " << I << "\n"); + for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + unsigned i = ArgIt - CS.arg_begin(); + if (!A->getType()->isSized()) { + LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n"); + continue; + } + unsigned Size = 0; + Value *Store = nullptr; + // Compute the Shadow for arg even if it is ByVal, because + // in that case getShadow() will copy the actual arg shadow to + // __msan_param_tls. 
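+      // Illustrative layout (added commentary): argument shadows are laid out
+      // back-to-back in __msan_param_tls at 8-byte granularity, e.g. for
+      // f(i32, double) the i32 shadow sits at offset 0 and the double shadow
+      // at offset 8, since ArgOffset advances by alignTo(Size, 8) below.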
+ Value *ArgShadow = getShadow(A); + Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset); + LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A + << " Shadow: " << *ArgShadow << "\n"); + bool ArgIsInitialized = false; + const DataLayout &DL = F.getParent()->getDataLayout(); + if (CS.paramHasAttr(i, Attribute::ByVal)) { + assert(A->getType()->isPointerTy() && + "ByVal argument is not a pointer!"); + Size = DL.getTypeAllocSize(A->getType()->getPointerElementType()); + if (ArgOffset + Size > kParamTLSSize) break; + unsigned ParamAlignment = CS.getParamAlignment(i); + unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment); + Value *AShadowPtr = + getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment, + /*isStore*/ false) + .first; + + Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr, + Alignment, Size); + // TODO(glider): need to copy origins. + } else { + Size = DL.getTypeAllocSize(A->getType()); + if (ArgOffset + Size > kParamTLSSize) break; + Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, + kShadowTLSAlignment); + Constant *Cst = dyn_cast<Constant>(ArgShadow); + if (Cst && Cst->isNullValue()) ArgIsInitialized = true; + } + if (MS.TrackOrigins && !ArgIsInitialized) + IRB.CreateStore(getOrigin(A), + getOriginPtrForArgument(A, IRB, ArgOffset)); + (void)Store; + assert(Size != 0 && Store != nullptr); + LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n"); + ArgOffset += alignTo(Size, 8); + } + LLVM_DEBUG(dbgs() << " done with call args\n"); + + FunctionType *FT = CS.getFunctionType(); + if (FT->isVarArg()) { + VAHelper->visitCallSite(CS, IRB); + } + + // Now, get the shadow for the RetVal. + if (!I.getType()->isSized()) return; + // Don't emit the epilogue for musttail call returns. + if (CS.isCall() && cast<CallInst>(&I)->isMustTailCall()) return; + IRBuilder<> IRBBefore(&I); + // Until we have full dynamic coverage, make sure the retval shadow is 0. + Value *Base = getShadowPtrForRetval(&I, IRBBefore); + IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); + BasicBlock::iterator NextInsn; + if (CS.isCall()) { + NextInsn = ++I.getIterator(); + assert(NextInsn != I.getParent()->end()); + } else { + BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest(); + if (!NormalDest->getSinglePredecessor()) { + // FIXME: this case is tricky, so we are just conservative here. + // Perhaps we need to split the edge between this BB and NormalDest, + // but a naive attempt to use SplitEdge leads to a crash. + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + return; + } + // FIXME: NextInsn is likely in a basic block that has not been visited yet. + // Anything inserted there will be instrumented by MSan later! 
+ NextInsn = NormalDest->getFirstInsertionPt(); + assert(NextInsn != NormalDest->end() && + "Could not find insertion point for retval shadow load"); + } + IRBuilder<> IRBAfter(&*NextInsn); + Value *RetvalShadow = IRBAfter.CreateAlignedLoad( + getShadowTy(&I), getShadowPtrForRetval(&I, IRBAfter), + kShadowTLSAlignment, "_msret"); + setShadow(&I, RetvalShadow); + if (MS.TrackOrigins) + setOrigin(&I, IRBAfter.CreateLoad(MS.OriginTy, + getOriginPtrForRetval(IRBAfter))); + } + + bool isAMustTailRetVal(Value *RetVal) { + if (auto *I = dyn_cast<BitCastInst>(RetVal)) { + RetVal = I->getOperand(0); + } + if (auto *I = dyn_cast<CallInst>(RetVal)) { + return I->isMustTailCall(); + } + return false; + } + + void visitReturnInst(ReturnInst &I) { + IRBuilder<> IRB(&I); + Value *RetVal = I.getReturnValue(); + if (!RetVal) return; + // Don't emit the epilogue for musttail call returns. + if (isAMustTailRetVal(RetVal)) return; + Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); + if (CheckReturnValue) { + insertShadowCheck(RetVal, &I); + Value *Shadow = getCleanShadow(RetVal); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + } else { + Value *Shadow = getShadow(RetVal); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + if (MS.TrackOrigins) + IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB)); + } + } + + void visitPHINode(PHINode &I) { + IRBuilder<> IRB(&I); + if (!PropagateShadow) { + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + return; + } + + ShadowPHINodes.push_back(&I); + setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(), + "_msphi_s")); + if (MS.TrackOrigins) + setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(), + "_msphi_o")); + } + + Value *getLocalVarDescription(AllocaInst &I) { + SmallString<2048> StackDescriptionStorage; + raw_svector_ostream StackDescription(StackDescriptionStorage); + // We create a string with a description of the stack allocation and + // pass it into __msan_set_alloca_origin. + // It will be printed by the run-time if stack-originated UMR is found. + // The first 4 bytes of the string are set to '----' and will be replaced + // by __msan_va_arg_overflow_size_tls at the first call. + StackDescription << "----" << I.getName() << "@" << F.getName(); + return createPrivateNonConstGlobalForString(*F.getParent(), + StackDescription.str()); + } + + void poisonAllocaUserspace(AllocaInst &I, IRBuilder<> &IRB, Value *Len) { + if (PoisonStack && ClPoisonStackWithCall) { + IRB.CreateCall(MS.MsanPoisonStackFn, + {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len}); + } else { + Value *ShadowBase, *OriginBase; + std::tie(ShadowBase, OriginBase) = + getShadowOriginPtr(&I, IRB, IRB.getInt8Ty(), 1, /*isStore*/ true); + + Value *PoisonValue = IRB.getInt8(PoisonStack ? 
ClPoisonStackPattern : 0); + IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment()); + } + + if (PoisonStack && MS.TrackOrigins) { + Value *Descr = getLocalVarDescription(I); + IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn, + {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len, + IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()), + IRB.CreatePointerCast(&F, MS.IntptrTy)}); + } + } + + void poisonAllocaKmsan(AllocaInst &I, IRBuilder<> &IRB, Value *Len) { + Value *Descr = getLocalVarDescription(I); + if (PoisonStack) { + IRB.CreateCall(MS.MsanPoisonAllocaFn, + {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len, + IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy())}); + } else { + IRB.CreateCall(MS.MsanUnpoisonAllocaFn, + {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len}); + } + } + + void instrumentAlloca(AllocaInst &I, Instruction *InsPoint = nullptr) { + if (!InsPoint) + InsPoint = &I; + IRBuilder<> IRB(InsPoint->getNextNode()); + const DataLayout &DL = F.getParent()->getDataLayout(); + uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType()); + Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize); + if (I.isArrayAllocation()) + Len = IRB.CreateMul(Len, I.getArraySize()); + + if (MS.CompileKernel) + poisonAllocaKmsan(I, IRB, Len); + else + poisonAllocaUserspace(I, IRB, Len); + } + + void visitAllocaInst(AllocaInst &I) { + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + // We'll get to this alloca later unless it's poisoned at the corresponding + // llvm.lifetime.start. + AllocaSet.insert(&I); + } + + void visitSelectInst(SelectInst& I) { + IRBuilder<> IRB(&I); + // a = select b, c, d + Value *B = I.getCondition(); + Value *C = I.getTrueValue(); + Value *D = I.getFalseValue(); + Value *Sb = getShadow(B); + Value *Sc = getShadow(C); + Value *Sd = getShadow(D); + + // Result shadow if condition shadow is 0. + Value *Sa0 = IRB.CreateSelect(B, Sc, Sd); + Value *Sa1; + if (I.getType()->isAggregateType()) { + // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do + // an extra "select". This results in much more compact IR. + // Sa = select Sb, poisoned, (select b, Sc, Sd) + Sa1 = getPoisonedShadow(getShadowTy(I.getType())); + } else { + // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ] + // If Sb (condition is poisoned), look for bits in c and d that are equal + // and both unpoisoned. + // If !Sb (condition is unpoisoned), simply pick one of Sc and Sd. + + // Cast arguments to shadow-compatible type. + C = CreateAppToShadowCast(IRB, C); + D = CreateAppToShadowCast(IRB, D); + + // Result shadow if condition shadow is 1. + Sa1 = IRB.CreateOr({IRB.CreateXor(C, D), Sc, Sd}); + } + Value *Sa = IRB.CreateSelect(Sb, Sa1, Sa0, "_msprop_select"); + setShadow(&I, Sa); + if (MS.TrackOrigins) { + // Origins are always i32, so any vector conditions must be flattened. + // FIXME: consider tracking vector origins for app vectors? + if (B->getType()->isVectorTy()) { + Type *FlatTy = getShadowTyNoVec(B->getType()); + B = IRB.CreateICmpNE(IRB.CreateBitCast(B, FlatTy), + ConstantInt::getNullValue(FlatTy)); + Sb = IRB.CreateICmpNE(IRB.CreateBitCast(Sb, FlatTy), + ConstantInt::getNullValue(FlatTy)); + } + // a = select b, c, d + // Oa = Sb ? Ob : (b ? Oc : Od) + setOrigin( + &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()), + IRB.CreateSelect(B, getOrigin(I.getTrueValue()), + getOrigin(I.getFalseValue())))); + } + } + + void visitLandingPadInst(LandingPadInst &I) { + // Do nothing. 
+    // See https://github.com/google/sanitizers/issues/504
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void visitCatchSwitchInst(CatchSwitchInst &I) {
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void visitFuncletPadInst(FuncletPadInst &I) {
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void visitGetElementPtrInst(GetElementPtrInst &I) {
+    handleShadowOr(I);
+  }
+
+  void visitExtractValueInst(ExtractValueInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Agg = I.getAggregateOperand();
+    LLVM_DEBUG(dbgs() << "ExtractValue: " << I << "\n");
+    Value *AggShadow = getShadow(Agg);
+    LLVM_DEBUG(dbgs() << "   AggShadow: " << *AggShadow << "\n");
+    Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
+    LLVM_DEBUG(dbgs() << "   ResShadow: " << *ResShadow << "\n");
+    setShadow(&I, ResShadow);
+    setOriginForNaryOp(I);
+  }
+
+  void visitInsertValueInst(InsertValueInst &I) {
+    IRBuilder<> IRB(&I);
+    LLVM_DEBUG(dbgs() << "InsertValue: " << I << "\n");
+    Value *AggShadow = getShadow(I.getAggregateOperand());
+    Value *InsShadow = getShadow(I.getInsertedValueOperand());
+    LLVM_DEBUG(dbgs() << "   AggShadow: " << *AggShadow << "\n");
+    LLVM_DEBUG(dbgs() << "   InsShadow: " << *InsShadow << "\n");
+    Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
+    LLVM_DEBUG(dbgs() << "   Res: " << *Res << "\n");
+    setShadow(&I, Res);
+    setOriginForNaryOp(I);
+  }
+
+  void dumpInst(Instruction &I) {
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
+    } else {
+      errs() << "ZZZ " << I.getOpcodeName() << "\n";
+    }
+    errs() << "QQQ " << I << "\n";
+  }
+
+  void visitResumeInst(ResumeInst &I) {
+    LLVM_DEBUG(dbgs() << "Resume: " << I << "\n");
+    // Nothing to do here.
+  }
+
+  void visitCleanupReturnInst(CleanupReturnInst &CRI) {
+    LLVM_DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
+    // Nothing to do here.
+  }
+
+  void visitCatchReturnInst(CatchReturnInst &CRI) {
+    LLVM_DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
+    // Nothing to do here.
+  }
+
+  void instrumentAsmArgument(Value *Operand, Instruction &I, IRBuilder<> &IRB,
+                             const DataLayout &DL, bool isOutput) {
+    // For each assembly argument, we check its value for being initialized.
+    // If the argument is a pointer, we assume it points to a single element
+    // of the corresponding type (or to an 8-byte word, if the type is unsized).
+    // Each such pointer is instrumented with a call to the runtime library.
+    Type *OpType = Operand->getType();
+    // Check the operand value itself.
+    insertShadowCheck(Operand, &I);
+    if (!OpType->isPointerTy() || !isOutput) {
+      assert(!isOutput);
+      return;
+    }
+    Type *ElType = OpType->getPointerElementType();
+    if (!ElType->isSized())
+      return;
+    int Size = DL.getTypeStoreSize(ElType);
+    Value *Ptr = IRB.CreatePointerCast(Operand, IRB.getInt8PtrTy());
+    Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
+    IRB.CreateCall(MS.MsanInstrumentAsmStoreFn, {Ptr, SizeVal});
+  }
+
+  /// Get the number of output arguments returned by pointers.
+  int getNumOutputArgs(InlineAsm *IA, CallBase *CB) {
+    int NumRetOutputs = 0;
+    int NumOutputs = 0;
+    Type *RetTy = cast<Value>(CB)->getType();
+    if (!RetTy->isVoidTy()) {
+      // Register outputs are returned via the CallInst return value.
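+      // Illustrative example (added commentary): asm with constraints
+      // "=r,=r,r" returns its two register outputs packed in a two-element
+      // struct, giving NumRetOutputs == 2, while "=*m" outputs show up as
+      // leading pointer operands of the call instead.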
+      auto *ST = dyn_cast<StructType>(RetTy);
+      if (ST)
+        NumRetOutputs = ST->getNumElements();
+      else
+        NumRetOutputs = 1;
+    }
+    InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+    for (size_t i = 0, n = Constraints.size(); i < n; i++) {
+      InlineAsm::ConstraintInfo Info = Constraints[i];
+      switch (Info.Type) {
+      case InlineAsm::isOutput:
+        NumOutputs++;
+        break;
+      default:
+        break;
+      }
+    }
+    return NumOutputs - NumRetOutputs;
+  }
+
+  void visitAsmInstruction(Instruction &I) {
+    // Conservative inline assembly handling: check for poisoned shadow of
+    // asm() arguments, then unpoison the result and all the memory locations
+    // pointed to by those arguments.
+    // An inline asm() statement in C++ contains lists of input and output
+    // arguments used by the assembly code. These are mapped to operands of the
+    // CallInst as follows:
+    //  - nR register outputs ("=r") are returned by value in a single structure
+    //    (SSA value of the CallInst);
+    //  - nO other outputs ("=m" and others) are returned by pointer as the
+    //    first nO operands of the CallInst;
+    //  - nI inputs ("r", "m" and others) are passed to CallInst as the
+    //    remaining nI operands.
+    // The total number of asm() arguments in the source is nR+nO+nI, and the
+    // corresponding CallInst has nO+nI+1 operands (the last operand is the
+    // function to be called).
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    CallBase *CB = cast<CallBase>(&I);
+    IRBuilder<> IRB(&I);
+    InlineAsm *IA = cast<InlineAsm>(CB->getCalledValue());
+    int OutputArgs = getNumOutputArgs(IA, CB);
+    // The last operand of a CallInst is the function itself.
+    int NumOperands = CB->getNumOperands() - 1;
+
+    // Check input arguments. We do so before unpoisoning output arguments, so
+    // that we won't overwrite uninit values before checking them.
+    for (int i = OutputArgs; i < NumOperands; i++) {
+      Value *Operand = CB->getOperand(i);
+      instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ false);
+    }
+    // Unpoison output arguments. This must happen before the actual InlineAsm
+    // call, so that the shadow for memory published in the asm() statement
+    // remains valid.
+    for (int i = 0; i < OutputArgs; i++) {
+      Value *Operand = CB->getOperand(i);
+      instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ true);
+    }
+
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void visitInstruction(Instruction &I) {
+    // Everything else: stop propagating and check for poisoned shadow.
+    if (ClDumpStrictInstructions)
+      dumpInst(I);
+    LLVM_DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+    for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
+      Value *Operand = I.getOperand(i);
+      if (Operand->getType()->isSized())
+        insertShadowCheck(Operand, &I);
+    }
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+};
+
+/// AMD64-specific implementation of VarArgHelper.
+struct VarArgAMD64Helper : public VarArgHelper {
+  // An unfortunate workaround for asymmetric lowering of va_arg stuff.
+  // See a comment in visitCallSite for more details.
+  static const unsigned AMD64GpEndOffset = 48;  // AMD64 ABI Draft 0.99.6 p3.5.7
+  static const unsigned AMD64FpEndOffsetSSE = 176;
+  // If SSE is disabled, fp_offset in va_list is zero.
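+  // Worked numbers (added commentary): 48 == 6 general-purpose argument
+  // registers (rdi, rsi, rdx, rcx, r8, r9) * 8 bytes, and 176 == 48 + 8 SSE
+  // registers (xmm0..xmm7) * 16 bytes, matching the reg_save_area layout of
+  // the System V AMD64 ABI.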
+ static const unsigned AMD64FpEndOffsetNoSSE = AMD64GpEndOffset; + + unsigned AMD64FpEndOffset; + Function &F; + MemorySanitizer &MS; + MemorySanitizerVisitor &MSV; + Value *VAArgTLSCopy = nullptr; + Value *VAArgTLSOriginCopy = nullptr; + Value *VAArgOverflowSize = nullptr; + + SmallVector<CallInst*, 16> VAStartInstrumentationList; + + enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory }; + + VarArgAMD64Helper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) + : F(F), MS(MS), MSV(MSV) { + AMD64FpEndOffset = AMD64FpEndOffsetSSE; + for (const auto &Attr : F.getAttributes().getFnAttributes()) { + if (Attr.isStringAttribute() && + (Attr.getKindAsString() == "target-features")) { + if (Attr.getValueAsString().contains("-sse")) + AMD64FpEndOffset = AMD64FpEndOffsetNoSSE; + break; + } + } + } + + ArgKind classifyArgument(Value* arg) { + // A very rough approximation of X86_64 argument classification rules. + Type *T = arg->getType(); + if (T->isFPOrFPVectorTy() || T->isX86_MMXTy()) + return AK_FloatingPoint; + if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) + return AK_GeneralPurpose; + if (T->isPointerTy()) + return AK_GeneralPurpose; + return AK_Memory; + } + + // For VarArg functions, store the argument shadow in an ABI-specific format + // that corresponds to va_list layout. + // We do this because Clang lowers va_arg in the frontend, and this pass + // only sees the low level code that deals with va_list internals. + // A much easier alternative (provided that Clang emits va_arg instructions) + // would have been to associate each live instance of va_list with a copy of + // MSanParamTLS, and extract shadow on va_arg() call in the argument list + // order. + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override { + unsigned GpOffset = 0; + unsigned FpOffset = AMD64GpEndOffset; + unsigned OverflowOffset = AMD64FpEndOffset; + const DataLayout &DL = F.getParent()->getDataLayout(); + for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + unsigned ArgNo = CS.getArgumentNo(ArgIt); + bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams(); + bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal); + if (IsByVal) { + // ByVal arguments always go to the overflow area. + // Fixed arguments passed through the overflow area will be stepped + // over by va_start, so don't count them towards the offset. 
+ if (IsFixed) + continue; + assert(A->getType()->isPointerTy()); + Type *RealTy = A->getType()->getPointerElementType(); + uint64_t ArgSize = DL.getTypeAllocSize(RealTy); + Value *ShadowBase = getShadowPtrForVAArgument( + RealTy, IRB, OverflowOffset, alignTo(ArgSize, 8)); + Value *OriginBase = nullptr; + if (MS.TrackOrigins) + OriginBase = getOriginPtrForVAArgument(RealTy, IRB, OverflowOffset); + OverflowOffset += alignTo(ArgSize, 8); + if (!ShadowBase) + continue; + Value *ShadowPtr, *OriginPtr; + std::tie(ShadowPtr, OriginPtr) = + MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment, + /*isStore*/ false); + + IRB.CreateMemCpy(ShadowBase, kShadowTLSAlignment, ShadowPtr, + kShadowTLSAlignment, ArgSize); + if (MS.TrackOrigins) + IRB.CreateMemCpy(OriginBase, kShadowTLSAlignment, OriginPtr, + kShadowTLSAlignment, ArgSize); + } else { + ArgKind AK = classifyArgument(A); + if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset) + AK = AK_Memory; + if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset) + AK = AK_Memory; + Value *ShadowBase, *OriginBase = nullptr; + switch (AK) { + case AK_GeneralPurpose: + ShadowBase = + getShadowPtrForVAArgument(A->getType(), IRB, GpOffset, 8); + if (MS.TrackOrigins) + OriginBase = + getOriginPtrForVAArgument(A->getType(), IRB, GpOffset); + GpOffset += 8; + break; + case AK_FloatingPoint: + ShadowBase = + getShadowPtrForVAArgument(A->getType(), IRB, FpOffset, 16); + if (MS.TrackOrigins) + OriginBase = + getOriginPtrForVAArgument(A->getType(), IRB, FpOffset); + FpOffset += 16; + break; + case AK_Memory: + if (IsFixed) + continue; + uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); + ShadowBase = + getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset, 8); + if (MS.TrackOrigins) + OriginBase = + getOriginPtrForVAArgument(A->getType(), IRB, OverflowOffset); + OverflowOffset += alignTo(ArgSize, 8); + } + // Take fixed arguments into account for GpOffset and FpOffset, + // but don't actually store shadows for them. + // TODO(glider): don't call get*PtrForVAArgument() for them. + if (IsFixed) + continue; + if (!ShadowBase) + continue; + Value *Shadow = MSV.getShadow(A); + IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment); + if (MS.TrackOrigins) { + Value *Origin = MSV.getOrigin(A); + unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); + MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize, + std::max(kShadowTLSAlignment, kMinOriginAlignment)); + } + } + } + Constant *OverflowSize = + ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset); + IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS); + } + + /// Compute the shadow address for a given va_arg. + Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, + unsigned ArgOffset, unsigned ArgSize) { + // Make sure we don't overflow __msan_va_arg_tls. + if (ArgOffset + ArgSize > kParamTLSSize) + return nullptr; + Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), + "_msarg_va_s"); + } + + /// Compute the origin address for a given va_arg. + Value *getOriginPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, int ArgOffset) { + Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy); + // getOriginPtrForVAArgument() is always called after + // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never + // overflow. 
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0), + "_msarg_va_o"); + } + + void unpoisonVAListTagForInst(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr, *OriginPtr; + unsigned Alignment = 8; + std::tie(ShadowPtr, OriginPtr) = + MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment, + /*isStore*/ true); + + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants. + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */ 24, Alignment, false); + // We shouldn't need to zero out the origins, as they're only checked for + // nonzero shadow. + } + + void visitVAStartInst(VAStartInst &I) override { + if (F.getCallingConv() == CallingConv::Win64) + return; + VAStartInstrumentationList.push_back(&I); + unpoisonVAListTagForInst(I); + } + + void visitVACopyInst(VACopyInst &I) override { + if (F.getCallingConv() == CallingConv::Win64) return; + unpoisonVAListTagForInst(I); + } + + void finalizeInstrumentation() override { + assert(!VAArgOverflowSize && !VAArgTLSCopy && + "finalizeInstrumentation called twice"); + if (!VAStartInstrumentationList.empty()) { + // If there is a va_start in this function, make a backup copy of + // va_arg_tls somewhere in the function entry block. + IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI()); + VAArgOverflowSize = + IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS); + Value *CopySize = + IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset), + VAArgOverflowSize); + VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + if (MS.TrackOrigins) { + VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); + IRB.CreateMemCpy(VAArgTLSOriginCopy, 8, MS.VAArgOriginTLS, 8, CopySize); + } + } + + // Instrument va_start. + // Copy va_list shadow from the backup copy of the TLS contents. 
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+      Value *VAListTag = OrigInst->getArgOperand(0);
+
+      Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+      Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
+          IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                        ConstantInt::get(MS.IntptrTy, 16)),
+          PointerType::get(RegSaveAreaPtrTy, 0));
+      Value *RegSaveAreaPtr =
+          IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+      Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+      unsigned Alignment = 16;
+      std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+          MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+                                 Alignment, /*isStore*/ true);
+      IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy,
+                       Alignment, AMD64FpEndOffset);
+      if (MS.TrackOrigins)
+        IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
+                         Alignment, AMD64FpEndOffset);
+      Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+      Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
+          IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                        ConstantInt::get(MS.IntptrTy, 8)),
+          PointerType::get(OverflowArgAreaPtrTy, 0));
+      Value *OverflowArgAreaPtr =
+          IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
+      Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
+      std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
+          MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
+                                 Alignment, /*isStore*/ true);
+      Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
+                                             AMD64FpEndOffset);
+      IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
+                       VAArgOverflowSize);
+      if (MS.TrackOrigins) {
+        SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
+                                        AMD64FpEndOffset);
+        IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr,
+                         Alignment, VAArgOverflowSize);
+      }
+    }
+  }
+};
+
+/// MIPS64-specific implementation of VarArgHelper.
+struct VarArgMIPS64Helper : public VarArgHelper {
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy = nullptr;
+  Value *VAArgSize = nullptr;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
+                     MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+    unsigned VAArgOffset = 0;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin() +
+             CS.getFunctionType()->getNumParams(), End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Triple TargetTriple(F.getParent()->getTargetTriple());
+      Value *A = *ArgIt;
+      Value *Base;
+      uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+      if (TargetTriple.getArch() == Triple::mips64) {
+        // Adjust the shadow for arguments with size < 8 to match the
+        // placement of the bits in a big-endian system.
+        if (ArgSize < 8)
+          VAArgOffset += (8 - ArgSize);
+      }
+      Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset,
+                                       ArgSize);
+      VAArgOffset += ArgSize;
+      VAArgOffset = alignTo(VAArgOffset, 8);
+      if (!Base)
+        continue;
+      IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+    }
+
+    Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
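The big-endian adjustment above can be stated as a one-liner. This is a hypothetical restatement for illustration only (no such helper exists in the file): a value narrower than its 8-byte slot sits in the slot's high-order bytes, so its shadow starts 8 - size bytes in.

#include <cstdint>

// Returns how far into its 8-byte slot a small argument's shadow begins
// on a big-endian target (0 for full-width arguments).
static unsigned bigEndianSlotOffset(uint64_t ArgSize) {
  return ArgSize < 8 ? unsigned(8 - ArgSize) : 0;
}

+    // Here we use VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creating a
+    // new class member, i.e. it holds the total size of all VarArgs.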
+ IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS); + } + + /// Compute the shadow address for a given va_arg. + Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, + unsigned ArgOffset, unsigned ArgSize) { + // Make sure we don't overflow __msan_va_arg_tls. + if (ArgOffset + ArgSize > kParamTLSSize) + return nullptr; + Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), + "_msarg"); + } + + void visitVAStartInst(VAStartInst &I) override { + IRBuilder<> IRB(&I); + VAStartInstrumentationList.push_back(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr, *OriginPtr; + unsigned Alignment = 8; + std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( + VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */ 8, Alignment, false); + } + + void visitVACopyInst(VACopyInst &I) override { + IRBuilder<> IRB(&I); + VAStartInstrumentationList.push_back(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr, *OriginPtr; + unsigned Alignment = 8; + std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( + VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */ 8, Alignment, false); + } + + void finalizeInstrumentation() override { + assert(!VAArgSize && !VAArgTLSCopy && + "finalizeInstrumentation called twice"); + IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI()); + VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS); + Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0), + VAArgSize); + + if (!VAStartInstrumentationList.empty()) { + // If there is a va_start in this function, make a backup copy of + // va_arg_tls somewhere in the function entry block. + VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + } + + // Instrument va_start. + // Copy va_list shadow from the backup copy of the TLS contents. + for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) { + CallInst *OrigInst = VAStartInstrumentationList[i]; + IRBuilder<> IRB(OrigInst->getNextNode()); + Value *VAListTag = OrigInst->getArgOperand(0); + Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C); + Value *RegSaveAreaPtrPtr = + IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), + PointerType::get(RegSaveAreaPtrTy, 0)); + Value *RegSaveAreaPtr = + IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; + unsigned Alignment = 8; + std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = + MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), + Alignment, /*isStore*/ true); + IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment, + CopySize); + } + } +}; + +/// AArch64-specific implementation of VarArgHelper. +struct VarArgAArch64Helper : public VarArgHelper { + static const unsigned kAArch64GrArgSize = 64; + static const unsigned kAArch64VrArgSize = 128; + + static const unsigned AArch64GrBegOffset = 0; + static const unsigned AArch64GrEndOffset = kAArch64GrArgSize; + // Make VR space aligned to 16 bytes. 
+  static const unsigned AArch64VrBegOffset = AArch64GrEndOffset;
+  static const unsigned AArch64VrEndOffset = AArch64VrBegOffset
+                                             + kAArch64VrArgSize;
+  static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
+
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy = nullptr;
+  Value *VAArgOverflowSize = nullptr;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+  VarArgAArch64Helper(Function &F, MemorySanitizer &MS,
+                      MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+  ArgKind classifyArgument(Value* arg) {
+    Type *T = arg->getType();
+    if (T->isFPOrFPVectorTy())
+      return AK_FloatingPoint;
+    if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+        || (T->isPointerTy()))
+      return AK_GeneralPurpose;
+    return AK_Memory;
+  }
+
+  // The instrumentation stores the argument shadow in a non ABI-specific
+  // format because it does not know which argument is named (since Clang,
+  // as in the x86_64 case, lowers the va_args in the frontend and this pass
+  // only sees the low-level code that deals with va_list internals).
+  // The first eight GR registers are saved in the first 64 bytes of the
+  // va_arg TLS array, followed by the first eight FP/SIMD registers, and
+  // then the remaining arguments.
+  // Using a constant offset within the va_arg TLS array allows fast copying
+  // in finalizeInstrumentation().
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+    unsigned GrOffset = AArch64GrBegOffset;
+    unsigned VrOffset = AArch64VrBegOffset;
+    unsigned OverflowOffset = AArch64VAEndOffset;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      unsigned ArgNo = CS.getArgumentNo(ArgIt);
+      bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
+      ArgKind AK = classifyArgument(A);
+      if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
+        AK = AK_Memory;
+      if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset)
+        AK = AK_Memory;
+      Value *Base;
+      switch (AK) {
+      case AK_GeneralPurpose:
+        Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset, 8);
+        GrOffset += 8;
+        break;
+      case AK_FloatingPoint:
+        Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset, 8);
+        VrOffset += 16;
+        break;
+      case AK_Memory:
+        // Don't count fixed arguments in the overflow area - va_start will
+        // skip right over them.
+        if (IsFixed)
+          continue;
+        uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+        Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset,
+                                         alignTo(ArgSize, 8));
+        OverflowOffset += alignTo(ArgSize, 8);
+        break;
+      }
+      // Count Gp/Vr fixed arguments to their respective offsets, but don't
+      // bother to actually store a shadow.
+      if (IsFixed)
+        continue;
+      if (!Base)
+        continue;
+      IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+    }
+    Constant *OverflowSize =
+        ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset);
+    IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+  }
+
+  /// Compute the shadow address for a given va_arg.
+  Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+                                   unsigned ArgOffset, unsigned ArgSize) {
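The three cursors above carve the va_arg TLS buffer into fixed regions. A sketch of the layout those constants imply (the enum is illustrative only; it does not exist in the pass):

// Byte ranges inside __msan_va_arg_tls on AArch64:
enum : unsigned {
  GrShadowBeg = 0,    // [0, 64):    x0..x7, 8 bytes each
  VrShadowBeg = 64,   // [64, 192):  v0..v7, 16 bytes each
  OverflowBeg = 192   // [192, ...): stack-passed varargs, 8-byte aligned
};

+    // Make sure we don't overflow __msan_va_arg_tls.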
+    if (ArgOffset + ArgSize > kParamTLSSize)
+      return nullptr;
+    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+                              "_msarg");
+  }
+
+  void visitVAStartInst(VAStartInst &I) override {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr, *OriginPtr;
+    unsigned Alignment = 8;
+    std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+        VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */ 32, Alignment, false);
+  }
+
+  void visitVACopyInst(VACopyInst &I) override {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr, *OriginPtr;
+    unsigned Alignment = 8;
+    std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+        VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */ 32, Alignment, false);
+  }
+
+  // Retrieve a va_list field of 'void*' size.
+  Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+    Value *SaveAreaPtrPtr =
+      IRB.CreateIntToPtr(
+        IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                      ConstantInt::get(MS.IntptrTy, offset)),
+        Type::getInt64PtrTy(*MS.C));
+    return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr);
+  }
+
+  // Retrieve a va_list field of 'int' size.
+  Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+    Value *SaveAreaPtr =
+      IRB.CreateIntToPtr(
+        IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                      ConstantInt::get(MS.IntptrTy, offset)),
+        Type::getInt32PtrTy(*MS.C));
+    Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr);
+    return IRB.CreateSExt(SaveArea32, MS.IntptrTy);
+  }
+
+  void finalizeInstrumentation() override {
+    assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+           "finalizeInstrumentation called twice");
+    if (!VAStartInstrumentationList.empty()) {
+      // If there is a va_start in this function, make a backup copy of
+      // va_arg_tls somewhere in the function entry block.
+      IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
+      VAArgOverflowSize =
+          IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+      Value *CopySize =
+          IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
+                        VAArgOverflowSize);
+      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+      IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
+    }
+
+    Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
+    Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize);
+
+    // Instrument va_start, copy va_list shadow from the backup copy of
+    // the TLS contents.
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+
+      Value *VAListTag = OrigInst->getArgOperand(0);
+
+      // The variadic ABI for AArch64 creates two areas to save the incoming
+      // argument registers (one for the 64-bit general registers x0-x7 and
+      // another for the 128-bit FP/SIMD registers v0-v7).
+      // We then need to propagate the shadow arguments to both regions
+      // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
+      // The remaining arguments are saved in the shadow for 'va::stack'.
+      // One caveat is that only the variadic (non-named) arguments need to
+      // be propagated, but the call-site instrumentation saves the shadow
+      // for 'all' of the arguments. So when copying the shadow values from
+      // the va_arg TLS array we need to adjust the offsets for both the GR
+      // and VR fields based on the __{gr,vr}_offs values (which account for
+      // the incoming named arguments).
+
+      // Read the stack pointer from the va_list.
+      Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0);
+
+      // Read both the __gr_top and __gr_off and add them up.
+      Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8);
+      Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24);
+
+      Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea);
+
+      // Read both the __vr_top and __vr_off and add them up.
+      Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16);
+      Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28);
+
+      Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea);
+
+      // The instrumentation does not know how many named arguments are in
+      // use, and at the call site all of the arguments were saved. Since
+      // __gr_offs is defined as '0 - ((8 - named_gr) * 8)', the idea is to
+      // propagate only the variadic arguments' shadow by skipping the bytes
+      // of shadow that correspond to the named arguments.
+      Value *GrRegSaveAreaShadowPtrOff =
+          IRB.CreateAdd(GrArgSize, GrOffSaveArea);
+
+      Value *GrRegSaveAreaShadowPtr =
+          MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+                                 /*Alignment*/ 8, /*isStore*/ true)
+              .first;
+
+      Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+                                              GrRegSaveAreaShadowPtrOff);
+      Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
+
+      IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, 8, GrSrcPtr, 8, GrCopySize);
+
+      // Again, but for FP/SIMD values.
+      Value *VrRegSaveAreaShadowPtrOff =
+          IRB.CreateAdd(VrArgSize, VrOffSaveArea);
+
+      Value *VrRegSaveAreaShadowPtr =
+          MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+                                 /*Alignment*/ 8, /*isStore*/ true)
+              .first;
+
+      Value *VrSrcPtr = IRB.CreateInBoundsGEP(
+          IRB.getInt8Ty(),
+          IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+                                IRB.getInt32(AArch64VrBegOffset)),
+          VrRegSaveAreaShadowPtrOff);
+      Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
+
+      IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, 8, VrSrcPtr, 8, VrCopySize);
+
+      // And finally for the remaining arguments.
+      Value *StackSaveAreaShadowPtr =
+          MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(),
+                                 /*Alignment*/ 16, /*isStore*/ true)
+              .first;
+
+      Value *StackSrcPtr =
+          IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+                                IRB.getInt32(AArch64VAEndOffset));
+
+      IRB.CreateMemCpy(StackSaveAreaShadowPtr, 16, StackSrcPtr, 16,
+                       VAArgOverflowSize);
+    }
+  }
+};
+
+/// PowerPC64-specific implementation of VarArgHelper.
+struct VarArgPowerPC64Helper : public VarArgHelper {
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy = nullptr;
+  Value *VAArgSize = nullptr;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
+                        MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+    // For PowerPC, we need to deal with the alignment of stack arguments -
+    // they are mostly aligned to 8 bytes, but vectors and i128 arrays
+    // are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes,
+    // and QPX vectors are aligned to 32 bytes.
+    // For that reason, we compute the current offset from the stack pointer
+    // (which is always properly aligned) and the offset of the first vararg,
+    // then subtract them.
+    unsigned VAArgBase;
+    Triple TargetTriple(F.getParent()->getTargetTriple());
+    // The parameter save area starts at 48 bytes from the frame pointer for
+    // ABIv1, and at 32 bytes for ABIv2. This is usually determined by the
+    // target endianness, but in theory could be overridden by a function
+    // attribute. For simplicity, we ignore it here (it'd only matter for QPX
+    // vectors).
+    if (TargetTriple.getArch() == Triple::ppc64)
+      VAArgBase = 48;
+    else
+      VAArgBase = 32;
+    unsigned VAArgOffset = VAArgBase;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      unsigned ArgNo = CS.getArgumentNo(ArgIt);
+      bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
+      bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
+      if (IsByVal) {
+        assert(A->getType()->isPointerTy());
+        Type *RealTy = A->getType()->getPointerElementType();
+        uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
+        uint64_t ArgAlign = CS.getParamAlignment(ArgNo);
+        if (ArgAlign < 8)
+          ArgAlign = 8;
+        VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+        if (!IsFixed) {
+          Value *Base = getShadowPtrForVAArgument(
+              RealTy, IRB, VAArgOffset - VAArgBase, ArgSize);
+          if (Base) {
+            Value *AShadowPtr, *AOriginPtr;
+            std::tie(AShadowPtr, AOriginPtr) =
+                MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(),
+                                       kShadowTLSAlignment, /*isStore*/ false);
+
+            IRB.CreateMemCpy(Base, kShadowTLSAlignment, AShadowPtr,
+                             kShadowTLSAlignment, ArgSize);
+          }
+        }
+        VAArgOffset += alignTo(ArgSize, 8);
+      } else {
+        Value *Base;
+        uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+        uint64_t ArgAlign = 8;
+        if (A->getType()->isArrayTy()) {
+          // Arrays are aligned to element size, except for long double
+          // arrays, which are aligned to 8 bytes.
+          Type *ElementTy = A->getType()->getArrayElementType();
+          if (!ElementTy->isPPC_FP128Ty())
+            ArgAlign = DL.getTypeAllocSize(ElementTy);
+        } else if (A->getType()->isVectorTy()) {
+          // Vectors are naturally aligned.
+          ArgAlign = DL.getTypeAllocSize(A->getType());
+        }
+        if (ArgAlign < 8)
+          ArgAlign = 8;
+        VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+        if (DL.isBigEndian()) {
+          // Adjust the shadow for arguments with size < 8 to match the
+          // placement of the bits in a big-endian system.
+          if (ArgSize < 8)
+            VAArgOffset += (8 - ArgSize);
+        }
+        if (!IsFixed) {
+          Base = getShadowPtrForVAArgument(A->getType(), IRB,
+                                           VAArgOffset - VAArgBase, ArgSize);
+          if (Base)
+            IRB.CreateAlignedStore(MSV.getShadow(A), Base,
+                                   kShadowTLSAlignment);
+        }
+        VAArgOffset += ArgSize;
+        VAArgOffset = alignTo(VAArgOffset, 8);
+      }
+      if (IsFixed)
+        VAArgBase = VAArgOffset;
+    }
+
+    Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(),
+                                                VAArgOffset - VAArgBase);
+    // Here we use VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creating a
+    // new class member, i.e. it holds the total size of all VarArgs.
+    IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+  }
+
+  /// Compute the shadow address for a given va_arg.
+  Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+                                   unsigned ArgOffset, unsigned ArgSize) {
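The PPC64 alignment rule coded above can be summarized in a few lines. This helper is a hedged restatement for illustration (it does not exist in the pass; the flag parameters stand in for the Type queries): arrays align to their element size except for ppc_fp128 elements, vectors align to their own size, and the floor is always 8 bytes.

#include <cstdint>

static uint64_t ppc64VarArgAlign(bool IsVector, bool IsArray,
                                 bool ElemIsFP128, uint64_t TypeSize,
                                 uint64_t ElemSize) {
  uint64_t Align = 8;
  if (IsVector)
    Align = TypeSize;             // vectors are naturally aligned
  else if (IsArray && !ElemIsFP128)
    Align = ElemSize;             // arrays align to their element size
  return Align < 8 ? 8 : Align;   // never below 8 bytes
}

+    // Make sure we don't overflow __msan_va_arg_tls.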
+    if (ArgOffset + ArgSize > kParamTLSSize)
+      return nullptr;
+    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+                              "_msarg");
+  }
+
+  void visitVAStartInst(VAStartInst &I) override {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr, *OriginPtr;
+    unsigned Alignment = 8;
+    std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+        VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */ 8, Alignment, false);
+  }
+
+  void visitVACopyInst(VACopyInst &I) override {
+    IRBuilder<> IRB(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr, *OriginPtr;
+    unsigned Alignment = 8;
+    std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+        VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants.
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */ 8, Alignment, false);
+  }
+
+  void finalizeInstrumentation() override {
+    assert(!VAArgSize && !VAArgTLSCopy &&
+           "finalizeInstrumentation called twice");
+    IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
+    VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+    Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+                                    VAArgSize);
+
+    if (!VAStartInstrumentationList.empty()) {
+      // If there is a va_start in this function, make a backup copy of
+      // va_arg_tls somewhere in the function entry block.
+      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+      IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
+    }
+
+    // Instrument va_start.
+    // Copy va_list shadow from the backup copy of the TLS contents.
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+      Value *VAListTag = OrigInst->getArgOperand(0);
+      Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+      Value *RegSaveAreaPtrPtr =
+          IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                             PointerType::get(RegSaveAreaPtrTy, 0));
+      Value *RegSaveAreaPtr =
+          IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+      Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+      unsigned Alignment = 8;
+      std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+          MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+                                 Alignment, /*isStore*/ true);
+      IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy,
+                       Alignment, CopySize);
+    }
+  }
+};
+
+/// A no-op implementation of VarArgHelper.
+struct VarArgNoOpHelper : public VarArgHelper {
+  VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
+                   MemorySanitizerVisitor &MSV) {}
+
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {}
+
+  void visitVAStartInst(VAStartInst &I) override {}
+
+  void visitVACopyInst(VACopyInst &I) override {}
+
+  void finalizeInstrumentation() override {}
+};
+
+} // end anonymous namespace
+
+static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+                                        MemorySanitizerVisitor &Visitor) {
+  // VarArg handling is implemented only for AMD64, MIPS64, AArch64 and
+  // PowerPC64; on other platforms varargs are not tracked, so false
+  // positives are possible there.
+  Triple TargetTriple(Func.getParent()->getTargetTriple());
+  if (TargetTriple.getArch() == Triple::x86_64)
+    return new VarArgAMD64Helper(Func, Msan, Visitor);
+  else if (TargetTriple.isMIPS64())
+    return new VarArgMIPS64Helper(Func, Msan, Visitor);
+  else if (TargetTriple.getArch() == Triple::aarch64)
+    return new VarArgAArch64Helper(Func, Msan, Visitor);
+  else if (TargetTriple.getArch() == Triple::ppc64 ||
+           TargetTriple.getArch() == Triple::ppc64le)
+    return new VarArgPowerPC64Helper(Func, Msan, Visitor);
+  else
+    return new VarArgNoOpHelper(Func, Msan, Visitor);
+}
+
+bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
+  if (!CompileKernel && F.getName() == kMsanModuleCtorName)
+    return false;
+
+  MemorySanitizerVisitor Visitor(F, *this, TLI);
+
+  // Clear out readonly/readnone attributes.
+  AttrBuilder B;
+  B.addAttribute(Attribute::ReadOnly)
+      .addAttribute(Attribute::ReadNone);
+  F.removeAttributes(AttributeList::FunctionIndex, B);
+
+  return Visitor.runOnFunction();
+}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
new file mode 100644
index 000000000000..ca1bb62389e9
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -0,0 +1,1814 @@
+//===- PGOInstrumentation.cpp - MST-based PGO Instrumentation -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements PGO instrumentation using a minimum spanning tree based
+// on the following paper:
+//   [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement points
+//   for program frequency counts. BIT Numerical Mathematics 1973, Volume 13,
+//   Issue 3, pp 313-322
+// The idea of the algorithm is based on the fact that for each node (except
+// for the entry and exit), the sum of incoming edge counts equals the sum of
+// outgoing edge counts. The count of an edge on the spanning tree can be
+// derived from the counts of the edges not on the spanning tree. Knuth proves
+// this method instruments the minimum number of edges.
+//
+// The minimal spanning tree here is actually a maximum weight tree -- on-tree
+// edges have higher frequencies (more likely to execute). The idea is to
+// instrument those less frequently executed edges to reduce the runtime
+// overhead of instrumented binaries.
+//
+// This file contains two passes:
+// (1) Pass PGOInstrumentationGen which instruments the IR to generate the
+// edge count profile, and generates the instrumentation for indirect call
+// profiling.
+// (2) Pass PGOInstrumentationUse which reads the edge count profile and
+// annotates the branch weights. It also reads the indirect call value
+// profiling records and annotates the indirect call instructions.
+//
+// To get the precise counter information, these two passes need to be invoked
+// at the same compilation point (so they see the same IR). For pass
+// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For
+// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc and
+// the profile is opened at the module level and passed to each PGOUseFunc
+// instance. The shared code for PGOInstrumentationGen and
+// PGOInstrumentationUse is put in class FuncPGOInstrumentation.
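To make the flow-conservation idea above concrete, here is a toy worked example in plain C++ (not code from this pass, and the counter values are invented): for a diamond CFG A->B, A->C, B->D, C->D, place counters only on the two off-tree edges A->B and A->C; every other count follows from in-flow equaling out-flow at each node.

#include <cstdint>
#include <cstdio>

int main() {
  // Counters measured on the two instrumented (off-tree) edges:
  uint64_t CntAB = 10, CntAC = 32;
  // On-tree edge counts derived by flow conservation:
  uint64_t CntBD = CntAB;         // B has one in-edge and one out-edge
  uint64_t CntCD = CntAC;
  uint64_t CntD = CntBD + CntCD;  // 42; also the entry count of A
  std::printf("D executed %llu times\n", (unsigned long long)CntD);
}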
+//
+// Class PGOEdge represents a CFG edge and some auxiliary information. Class
+// BBInfo contains auxiliary information for each BB. These two classes are
+// used in pass PGOInstrumentationGen. Classes PGOUseEdge and UseBBInfo are
+// derived from PGOEdge and BBInfo, respectively; they contain the extra data
+// structures used in populating the profile counters.
+// The MST implementation is in Class CFGMST (CFGMST.h).
+//
+//===----------------------------------------------------------------------===//
+
+#include "CFGMST.h"
+#include "ValueProfileCollector.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CRC.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/MisExpect.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using ProfileCount = Function::ProfileCount;
+using VPCandidateInfo = ValueProfileCollector::CandidateInfo;
+
+#define DEBUG_TYPE "pgo-instrumentation"
+
+STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
+STATISTIC(NumOfPGOSelectInsts, "Number of select instructions instrumented.");
+STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
+STATISTIC(NumOfPGOEdge, "Number of edges.");
+STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
+STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
+STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
counts."); +STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile."); +STATISTIC(NumOfPGOMissing, "Number of functions without profile."); +STATISTIC(NumOfPGOICall, "Number of indirect call value instrumentations."); +STATISTIC(NumOfCSPGOInstrument, "Number of edges instrumented in CSPGO."); +STATISTIC(NumOfCSPGOSelectInsts, + "Number of select instruction instrumented in CSPGO."); +STATISTIC(NumOfCSPGOMemIntrinsics, + "Number of mem intrinsics instrumented in CSPGO."); +STATISTIC(NumOfCSPGOEdge, "Number of edges in CSPGO."); +STATISTIC(NumOfCSPGOBB, "Number of basic-blocks in CSPGO."); +STATISTIC(NumOfCSPGOSplit, "Number of critical edge splits in CSPGO."); +STATISTIC(NumOfCSPGOFunc, + "Number of functions having valid profile counts in CSPGO."); +STATISTIC(NumOfCSPGOMismatch, + "Number of functions having mismatch profile in CSPGO."); +STATISTIC(NumOfCSPGOMissing, "Number of functions without profile in CSPGO."); + +// Command line option to specify the file to read profile from. This is +// mainly used for testing. +static cl::opt<std::string> + PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of profile data file. This is" + "mainly for test purpose.")); +static cl::opt<std::string> PGOTestProfileRemappingFile( + "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of profile remapping file. This is mainly for " + "test purpose.")); + +// Command line option to disable value profiling. The default is false: +// i.e. value profiling is enabled by default. This is for debug purpose. +static cl::opt<bool> DisableValueProfiling("disable-vp", cl::init(false), + cl::Hidden, + cl::desc("Disable Value Profiling")); + +// Command line option to set the maximum number of VP annotations to write to +// the metadata for a single indirect call callsite. +static cl::opt<unsigned> MaxNumAnnotations( + "icp-max-annotations", cl::init(3), cl::Hidden, cl::ZeroOrMore, + cl::desc("Max number of annotations for a single indirect " + "call callsite")); + +// Command line option to set the maximum number of value annotations +// to write to the metadata for a single memop intrinsic. +static cl::opt<unsigned> MaxNumMemOPAnnotations( + "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore, + cl::desc("Max number of preicise value annotations for a single memop" + "intrinsic")); + +// Command line option to control appending FunctionHash to the name of a COMDAT +// function. This is to avoid the hash mismatch caused by the preinliner. +static cl::opt<bool> DoComdatRenaming( + "do-comdat-renaming", cl::init(false), cl::Hidden, + cl::desc("Append function hash to the name of COMDAT function to avoid " + "function hash mismatch due to the preinliner")); + +// Command line option to enable/disable the warning about missing profile +// information. +static cl::opt<bool> + PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden, + cl::desc("Use this option to turn on/off " + "warnings about missing profile data for " + "functions.")); + +// Command line option to enable/disable the warning about a hash mismatch in +// the profile data. 
+static cl::opt<bool>
+    NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
+                      cl::desc("Use this option to turn off/on "
+                               "warnings about profile cfg mismatch."));
+
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data for Comdat functions, which often turns out to be a false
+// positive due to the pre-instrumentation inline.
+static cl::opt<bool>
+    NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
+                            cl::Hidden,
+                            cl::desc("The option is used to turn on/off "
+                                     "warnings about hash mismatch for comdat "
+                                     "functions."));
+
+// Command line option to enable/disable select instruction instrumentation.
+static cl::opt<bool>
+    PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
+                   cl::desc("Use this option to turn on/off SELECT "
+                            "instruction instrumentation."));
+
+// Command line option to turn on CFG dot or text dump of raw profile counts.
+static cl::opt<PGOViewCountsType> PGOViewRawCounts(
+    "pgo-view-raw-counts", cl::Hidden,
+    cl::desc("A boolean option to show CFG dag or text "
+             "with raw profile counts from "
+             "profile data. See also option "
+             "-pgo-view-counts. To limit graph "
+             "display to only one function, use "
+             "filtering option -view-bfi-func-name."),
+    cl::values(clEnumValN(PGOVCT_None, "none", "do not show."),
+               clEnumValN(PGOVCT_Graph, "graph", "show a graph."),
+               clEnumValN(PGOVCT_Text, "text", "show in text.")));
+
+// Command line option to enable/disable memop intrinsic size profiling.
+static cl::opt<bool>
+    PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
+                  cl::desc("Use this option to turn on/off "
+                           "memory intrinsic size profiling."));
+
+// Emit branch probability as optimization remarks.
+static cl::opt<bool>
+    EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
+                          cl::desc("When this option is on, the annotated "
+                                   "branch probability will be emitted as "
+                                   "optimization remarks: -{Rpass|"
+                                   "pass-remarks}=pgo-instrumentation"));
+
+// Command line option to turn on CFG dot dump after profile annotation.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
+extern cl::opt<PGOViewCountsType> PGOViewCounts;
+
+// Command line option to specify the name of the function for CFG dump.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
+// Return a string describing the branch condition that can be
+// used in static branch probability heuristics:
+static std::string getBranchCondString(Instruction *TI) {
+  BranchInst *BI = dyn_cast<BranchInst>(TI);
+  if (!BI || !BI->isConditional())
+    return std::string();
+
+  Value *Cond = BI->getCondition();
+  ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+  if (!CI)
+    return std::string();
+
+  std::string result;
+  raw_string_ostream OS(result);
+  OS << CmpInst::getPredicateName(CI->getPredicate()) << "_";
+  CI->getOperand(0)->getType()->print(OS, true);
+
+  Value *RHS = CI->getOperand(1);
+  ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
+  if (CV) {
+    if (CV->isZero())
+      OS << "_Zero";
+    else if (CV->isOne())
+      OS << "_One";
+    else if (CV->isMinusOne())
+      OS << "_MinusOne";
+    else
+      OS << "_Const";
+  }
+  OS.flush();
+  return result;
+}
+
+static const char *ValueProfKindDescr[] = {
+#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+namespace {
+
+/// The select instruction visitor plays three roles specified by the mode.
+/// In \c VM_counting mode, it simply counts the number of select
+/// instructions. In \c VM_instrument mode, it inserts code to count the
+/// number of times the TrueValue of a select is taken. In \c VM_annotate
+/// mode, it reads the profile data and annotates the select instruction
+/// with metadata.
+enum VisitMode { VM_counting, VM_instrument, VM_annotate };
+class PGOUseFunc;
+
+/// Instruction Visitor class to visit select instructions.
+struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
+  Function &F;
+  unsigned NSIs = 0;             // Number of select instructions instrumented.
+  VisitMode Mode = VM_counting;  // Visiting mode.
+  unsigned *CurCtrIdx = nullptr; // Pointer to current counter index.
+  unsigned TotalNumCtrs = 0;     // Total number of counters.
+  GlobalVariable *FuncNameVar = nullptr;
+  uint64_t FuncHash = 0;
+  PGOUseFunc *UseFunc = nullptr;
+
+  SelectInstVisitor(Function &Func) : F(Func) {}
+
+  void countSelects(Function &Func) {
+    NSIs = 0;
+    Mode = VM_counting;
+    visit(Func);
+  }
+
+  // Visit the IR stream and instrument all select instructions. \p
+  // Ind is a pointer to the counter index variable; \p TotalNC
+  // is the total number of counters; \p FNV is the pointer to the
+  // PGO function name var; \p FHash is the function hash.
+  void instrumentSelects(Function &Func, unsigned *Ind, unsigned TotalNC,
+                         GlobalVariable *FNV, uint64_t FHash) {
+    Mode = VM_instrument;
+    CurCtrIdx = Ind;
+    TotalNumCtrs = TotalNC;
+    FuncHash = FHash;
+    FuncNameVar = FNV;
+    visit(Func);
+  }
+
+  // Visit the IR stream and annotate all select instructions.
+  void annotateSelects(Function &Func, PGOUseFunc *UF, unsigned *Ind) {
+    Mode = VM_annotate;
+    UseFunc = UF;
+    CurCtrIdx = Ind;
+    visit(Func);
+  }
+
+  void instrumentOneSelectInst(SelectInst &SI);
+  void annotateOneSelectInst(SelectInst &SI);
+
+  // Visit \p SI instruction and perform tasks according to visit mode.
+  void visitSelectInst(SelectInst &SI);
+
+  // Return the number of select instructions. This needs to be called after
+  // countSelects().
+  unsigned getNumOfSelectInsts() const { return NSIs; }
+};
+
+
+class PGOInstrumentationGenLegacyPass : public ModulePass {
+public:
+  static char ID;
+
+  PGOInstrumentationGenLegacyPass(bool IsCS = false)
+      : ModulePass(ID), IsCS(IsCS) {
+    initializePGOInstrumentationGenLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "PGOInstrumentationGenPass"; }
+
+private:
+  // Is this context-sensitive instrumentation?
+  bool IsCS;
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+  }
+};
+
+class PGOInstrumentationUseLegacyPass : public ModulePass {
+public:
+  static char ID;
+
+  // Provide the profile filename as the parameter.
+  PGOInstrumentationUseLegacyPass(std::string Filename = "", bool IsCS = false)
+      : ModulePass(ID), ProfileFileName(std::move(Filename)), IsCS(IsCS) {
+    if (!PGOTestProfileFile.empty())
+      ProfileFileName = PGOTestProfileFile;
+    initializePGOInstrumentationUseLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "PGOInstrumentationUsePass"; }
+
+private:
+  std::string ProfileFileName;
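As a rough picture of what the select instrumentation described above amounts to, here is a hand-written C++ analogue (a sketch under the assumption that the pass emits a conditional counter increment; the names are illustrative, not the pass's API):

#include <cstdint>

uint64_t SelectTrueCounter = 0;  // stand-in for the profile counter slot

int instrumentedSelect(bool Cond, int TrueVal, int FalseVal) {
  // The instrumentation adds Cond ? 1 : 0 to the counter, so the counter
  // records how often the select took its TrueValue.
  SelectTrueCounter += Cond ? 1 : 0;
  return Cond ? TrueVal : FalseVal;
}

+  // Is this context-sensitive instrumentation use?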
+  bool IsCS;
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+  }
+};
+
+class PGOInstrumentationGenCreateVarLegacyPass : public ModulePass {
+public:
+  static char ID;
+  StringRef getPassName() const override {
+    return "PGOInstrumentationGenCreateVarPass";
+  }
+  PGOInstrumentationGenCreateVarLegacyPass(std::string CSInstrName = "")
+      : ModulePass(ID), InstrProfileOutput(CSInstrName) {
+    initializePGOInstrumentationGenCreateVarLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+private:
+  bool runOnModule(Module &M) override {
+    createProfileFileNameVar(M, InstrProfileOutput);
+    createIRLevelProfileFlagVar(M, true);
+    return false;
+  }
+  std::string InstrProfileOutput;
+};
+
+} // end anonymous namespace
+
+char PGOInstrumentationGenLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
+                      "PGO instrumentation.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
+                    "PGO instrumentation.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationGenLegacyPass(bool IsCS) {
+  return new PGOInstrumentationGenLegacyPass(IsCS);
+}
+
+char PGOInstrumentationUseLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
+                      "Read PGO instrumentation profile.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
+                    "Read PGO instrumentation profile.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename,
+                                                        bool IsCS) {
+  return new PGOInstrumentationUseLegacyPass(Filename.str(), IsCS);
+}
+
+char PGOInstrumentationGenCreateVarLegacyPass::ID = 0;
+
+INITIALIZE_PASS(PGOInstrumentationGenCreateVarLegacyPass,
+                "pgo-instr-gen-create-var",
+                "Create PGO instrumentation version variable for CSPGO.", false,
+                false)
+
+ModulePass *
+llvm::createPGOInstrumentationGenCreateVarLegacyPass(StringRef CSInstrName) {
+  return new PGOInstrumentationGenCreateVarLegacyPass(CSInstrName);
+}
+
+namespace {
+
+/// An MST-based instrumentation for PGO
+///
+/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
+/// at the function level.
+struct PGOEdge {
+  // This class implements the CFG edges. Note the CFG can be a multi-graph,
+  // so there might be multiple edges with the same SrcBB and DestBB.
+  const BasicBlock *SrcBB;
+  const BasicBlock *DestBB;
+  uint64_t Weight;
+  bool InMST = false;
+  bool Removed = false;
+  bool IsCritical = false;
+
+  PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
+      : SrcBB(Src), DestBB(Dest), Weight(W) {}
+
+  // Return the information string of an edge.
+  const std::string infoString() const {
+    return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") +
+            (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str();
+  }
+};
+
+// This class stores the auxiliary information for each BB.
+struct BBInfo {
+  BBInfo *Group;
+  uint32_t Index;
+  uint32_t Rank = 0;
+
+  BBInfo(unsigned IX) : Group(this), Index(IX) {}
+
+  // Return the information string of this object.
+  const std::string infoString() const {
+    return (Twine("Index=") + Twine(Index)).str();
+  }
+
+  // Empty function -- only applicable to UseBBInfo.
+  void addOutEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
+
+  // Empty function -- only applicable to UseBBInfo.
+  void addInEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
+};
+
+// This class implements the shared, edge-based instrumentation code used by
+// both the PGOInstrumentationGen and PGOInstrumentationUse passes.
+template <class Edge, class BBInfo> class FuncPGOInstrumentation {
+private:
+  Function &F;
+
+  // Is this context-sensitive instrumentation?
+  bool IsCS;
+
+  // A map that stores the Comdat group in function F.
+  std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
+
+  ValueProfileCollector VPC;
+
+  void computeCFGHash();
+  void renameComdatFunction();
+
+public:
+  std::vector<std::vector<VPCandidateInfo>> ValueSites;
+  SelectInstVisitor SIVisitor;
+  std::string FuncName;
+  GlobalVariable *FuncNameVar;
+
+  // CFG hash value for this function.
+  uint64_t FunctionHash = 0;
+
+  // The Minimum Spanning Tree of the function CFG.
+  CFGMST<Edge, BBInfo> MST;
+
+  // Collect all the BBs that will be instrumented, and store them in
+  // InstrumentBBs.
+  void getInstrumentBBs(std::vector<BasicBlock *> &InstrumentBBs);
+
+  // Given an edge, find the BB that will be instrumented.
+  // Return nullptr if there is no BB to be instrumented.
+  BasicBlock *getInstrBB(Edge *E);
+
+  // Return the auxiliary BB information.
+  BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); }
+
+  // Return the auxiliary BB information if available.
+  BBInfo *findBBInfo(const BasicBlock *BB) const { return MST.findBBInfo(BB); }
+
+  // Dump edges and BB information.
+  void dumpInfo(std::string Str = "") const {
+    MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
+                              Twine(FunctionHash) + "\t" + Str);
+  }
+
+  FuncPGOInstrumentation(
+      Function &Func,
+      std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+      bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
+      BlockFrequencyInfo *BFI = nullptr, bool IsCS = false)
+      : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func),
+        ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) {
+    // This should be done before CFG hash computation.
+    SIVisitor.countSelects(Func);
+    ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize);
+    if (!IsCS) {
+      NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
+      NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
+      NumOfPGOBB += MST.BBInfos.size();
+      ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget);
+    } else {
+      NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
+      NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
+      NumOfCSPGOBB += MST.BBInfos.size();
+    }
+
+    FuncName = getPGOFuncName(F);
+    computeCFGHash();
+    if (!ComdatMembers.empty())
+      renameComdatFunction();
+    LLVM_DEBUG(dumpInfo("after CFGMST"));
+
+    for (auto &E : MST.AllEdges) {
+      if (E->Removed)
+        continue;
+      IsCS ? NumOfCSPGOEdge++ : NumOfPGOEdge++;
+      if (!E->InMST)
+        IsCS ? NumOfCSPGOInstrument++ : NumOfPGOInstrument++;
+    }
+
+    if (CreateGlobalVar)
+      FuncNameVar = createPGOFuncNameVar(F, FuncName);
+  }
+};
+
+} // end anonymous namespace
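A sketch of the bit layout that computeCFGHash() below assembles. This helper is a hypothetical restatement for illustration, not a function in this file:

#include <cstdint>

// [59..56] #selects | [55..48] #indirect-call sites | [47..32] #edges
// | [31..0] CRC32 of successor indexes; bits 60-63 are cleared so they
// can carry flags such as the context-sensitive (CS) bit.
uint64_t sketchCFGHash(uint64_t NumSelects, uint64_t NumICSites,
                       uint64_t NumEdges, uint32_t CRC) {
  uint64_t H = NumSelects << 56 | NumICSites << 48 | NumEdges << 32 | CRC;
  return H & 0x0FFFFFFFFFFFFFFF;
}

+// Compute the hash value for the CFG: the lower 32 bits are the CRC32 of the
+// index values of the successors of each BB in the CFG; the higher bits
+// record the numbers of edges, select instructions and indirect call sites.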
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
+  std::vector<uint8_t> Indexes;
+  JamCRC JC;
+  for (auto &BB : F) {
+    const Instruction *TI = BB.getTerminator();
+    for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
+      BasicBlock *Succ = TI->getSuccessor(I);
+      auto BI = findBBInfo(Succ);
+      if (BI == nullptr)
+        continue;
+      uint32_t Index = BI->Index;
+      for (int J = 0; J < 4; J++)
+        Indexes.push_back((uint8_t)(Index >> (J * 8)));
+    }
+  }
+  JC.update(Indexes);
+
+  // Hash format for the context-sensitive profile. Reserve 4 bits for other
+  // information.
+  FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
+                 (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 |
+                 //(uint64_t)ValueSites[IPVK_MemOPSize].size() << 40 |
+                 (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
+  // Reserve bits 60-63 for other purposes.
+  FunctionHash &= 0x0FFFFFFFFFFFFFFF;
+  if (IsCS)
+    NamedInstrProfRecord::setCSFlagInHash(FunctionHash);
+  LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
+                    << " CRC = " << JC.getCRC()
+                    << ", Selects = " << SIVisitor.getNumOfSelectInsts()
+                    << ", Edges = " << MST.AllEdges.size() << ", ICSites = "
+                    << ValueSites[IPVK_IndirectCallTarget].size()
+                    << ", Hash = " << FunctionHash << "\n";);
+}
+
+// Check if we can safely rename this Comdat function.
+static bool canRenameComdat(
+    Function &F,
+    std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+  if (!DoComdatRenaming || !canRenameComdatFunc(F, true))
+    return false;
+
+  // FIXME: Currently we only handle Comdat groups that contain a single
+  // function plus function aliases.
+  // (1) For a Comdat group containing multiple functions, we would need a
+  // unique postfix based on the hash of each function, which requires
+  // non-trivial refactoring to do efficiently.
+  // (2) Variables cannot be renamed, so we cannot rename a Comdat function
+  // in a group that includes global vars.
+  Comdat *C = F.getComdat();
+  for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
+    if (dyn_cast<GlobalAlias>(CM.second))
+      continue;
+    Function *FM = dyn_cast<Function>(CM.second);
+    if (FM != &F)
+      return false;
+  }
+  return true;
+}
+
+// Append the CFGHash to the Comdat function name.
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::renameComdatFunction() {
+  if (!canRenameComdat(F, ComdatMembers))
+    return;
+  std::string OrigName = F.getName().str();
+  std::string NewFuncName =
+      Twine(F.getName() + "." + Twine(FunctionHash)).str();
+  F.setName(Twine(NewFuncName));
+  GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigName, &F);
+  FuncName = Twine(FuncName + "." + Twine(FunctionHash)).str();
+  Comdat *NewComdat;
+  Module *M = F.getParent();
+  // For AvailableExternallyLinkage functions, change the linkage to
+  // LinkOnceODR and put them into a comdat. This is because after renaming
+  // there is no backup external copy available for the function.
+  if (!F.hasComdat()) {
+    assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage);
+    NewComdat = M->getOrInsertComdat(StringRef(NewFuncName));
+    F.setLinkage(GlobalValue::LinkOnceODRLinkage);
+    F.setComdat(NewComdat);
+    return;
+  }
+
+  // This function belongs to a single function Comdat group.
+  Comdat *OrigComdat = F.getComdat();
+  std::string NewComdatName =
+      Twine(OrigComdat->getName() + "." + Twine(FunctionHash)).str();
+ Twine(FunctionHash)).str(); + NewComdat = M->getOrInsertComdat(StringRef(NewComdatName)); + NewComdat->setSelectionKind(OrigComdat->getSelectionKind()); + + for (auto &&CM : make_range(ComdatMembers.equal_range(OrigComdat))) { + if (GlobalAlias *GA = dyn_cast<GlobalAlias>(CM.second)) { + // For aliases, change the name directly. + assert(dyn_cast<Function>(GA->getAliasee()->stripPointerCasts()) == &F); + std::string OrigGAName = GA->getName().str(); + GA->setName(Twine(GA->getName() + "." + Twine(FunctionHash))); + GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigGAName, GA); + continue; + } + // Must be a function. + Function *CF = dyn_cast<Function>(CM.second); + assert(CF); + CF->setComdat(NewComdat); + } +} + +// Collect all the BBs that will be instruments and return them in +// InstrumentBBs and setup InEdges/OutEdge for UseBBInfo. +template <class Edge, class BBInfo> +void FuncPGOInstrumentation<Edge, BBInfo>::getInstrumentBBs( + std::vector<BasicBlock *> &InstrumentBBs) { + // Use a worklist as we will update the vector during the iteration. + std::vector<Edge *> EdgeList; + EdgeList.reserve(MST.AllEdges.size()); + for (auto &E : MST.AllEdges) + EdgeList.push_back(E.get()); + + for (auto &E : EdgeList) { + BasicBlock *InstrBB = getInstrBB(E); + if (InstrBB) + InstrumentBBs.push_back(InstrBB); + } + + // Set up InEdges/OutEdges for all BBs. + for (auto &E : MST.AllEdges) { + if (E->Removed) + continue; + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + BBInfo &SrcInfo = getBBInfo(SrcBB); + BBInfo &DestInfo = getBBInfo(DestBB); + SrcInfo.addOutEdge(E.get()); + DestInfo.addInEdge(E.get()); + } +} + +// Given a CFG E to be instrumented, find which BB to place the instrumented +// code. The function will split the critical edge if necessary. +template <class Edge, class BBInfo> +BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) { + if (E->InMST || E->Removed) + return nullptr; + + BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB); + BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB); + // For a fake edge, instrument the real BB. + if (SrcBB == nullptr) + return DestBB; + if (DestBB == nullptr) + return SrcBB; + + auto canInstrument = [](BasicBlock *BB) -> BasicBlock * { + // There are basic blocks (such as catchswitch) cannot be instrumented. + // If the returned first insertion point is the end of BB, skip this BB. + if (BB->getFirstInsertionPt() == BB->end()) + return nullptr; + return BB; + }; + + // Instrument the SrcBB if it has a single successor, + // otherwise, the DestBB if this is not a critical edge. + Instruction *TI = SrcBB->getTerminator(); + if (TI->getNumSuccessors() <= 1) + return canInstrument(SrcBB); + if (!E->IsCritical) + return canInstrument(DestBB); + + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum); + if (!InstrBB) { + LLVM_DEBUG( + dbgs() << "Fail to split critical edge: not instrument this edge.\n"); + return nullptr; + } + // For a critical edge, we have to split. Instrument the newly + // created BB. + IsCS ? NumOfCSPGOSplit++ : NumOfPGOSplit++; + LLVM_DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index + << " --> " << getBBInfo(DestBB).Index << "\n"); + // Need to add two new edges. First one: Add new edge of SrcBB->InstrBB. + MST.addEdge(SrcBB, InstrBB, 0); + // Second one: Add new edge of InstrBB->DestBB. 
+  Edge &NewEdge1 = MST.addEdge(InstrBB, DestBB, 0);
+  NewEdge1.InMST = true;
+  E->Removed = true;
+
+  return canInstrument(InstrBB);
+}
+
+// Visit all edges: instrument the edges not in the MST, and do value
+// profiling. Critical edges will be split.
+static void instrumentOneFunc(
+    Function &F, Module *M, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFI,
+    std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+    bool IsCS) {
+  // Split indirectbr critical edges here before computing the MST rather than
+  // later in getInstrBB() to avoid invalidating it.
+  SplitIndirectBrCriticalEdges(F, BPI, BFI);
+
+  FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, ComdatMembers, true, BPI,
+                                                   BFI, IsCS);
+  std::vector<BasicBlock *> InstrumentBBs;
+  FuncInfo.getInstrumentBBs(InstrumentBBs);
+  unsigned NumCounters =
+      InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
+
+  uint32_t I = 0;
+  Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
+  for (auto *InstrBB : InstrumentBBs) {
+    IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt());
+    assert(Builder.GetInsertPoint() != InstrBB->end() &&
+           "Cannot get the Instrumentation point");
+    Builder.CreateCall(
+        Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment),
+        {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+         Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters),
+         Builder.getInt32(I++)});
+  }
+
+  // Now instrument select instructions:
+  FuncInfo.SIVisitor.instrumentSelects(F, &I, NumCounters, FuncInfo.FuncNameVar,
+                                       FuncInfo.FunctionHash);
+  assert(I == NumCounters);
+
+  if (DisableValueProfiling)
+    return;
+
+  NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size();
+
+  // For each VP kind, walk the VP candidates and instrument each one.
+  for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) {
+    unsigned SiteIndex = 0;
+    if (Kind == IPVK_MemOPSize && !PGOInstrMemOP)
+      continue;
+
+    for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) {
+      LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind]
+                        << " site: CallSite Index = " << SiteIndex << "\n");
+
+      IRBuilder<> Builder(Cand.InsertPt);
+      assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() &&
+             "Cannot get the Instrumentation point");
+
+      Value *ToProfile = nullptr;
+      if (Cand.V->getType()->isIntegerTy())
+        ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty());
+      else if (Cand.V->getType()->isPointerTy())
+        ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
+      assert(ToProfile && "value profiling Value is of unexpected type");
+
+      Builder.CreateCall(
+          Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+          {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+           Builder.getInt64(FuncInfo.FunctionHash), ToProfile,
+           Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)});
+    }
+  } // IPVK_First <= Kind <= IPVK_Last
+}
+
+namespace {
+
+// This class represents a CFG edge in profile use compilation.
+struct PGOUseEdge : public PGOEdge {
+  bool CountValid = false;
+  uint64_t CountValue = 0;
+
+  PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
+      : PGOEdge(Src, Dest, W) {}
+
+  // Set the edge count value.
+  void setEdgeCount(uint64_t Value) {
+    CountValue = Value;
+    CountValid = true;
+  }
+
+  // Return the information string for this object.
+  const std::string infoString() const {
+    if (!CountValid)
+      return PGOEdge::infoString();
+    return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue))
+        .str();
+  }
+};
+
+using DirectEdges = SmallVector<PGOUseEdge *, 2>;
+
+// This class stores the auxiliary information for each BB.
+struct UseBBInfo : public BBInfo {
+  uint64_t CountValue = 0;
+  bool CountValid;
+  int32_t UnknownCountInEdge = 0;
+  int32_t UnknownCountOutEdge = 0;
+  DirectEdges InEdges;
+  DirectEdges OutEdges;
+
+  UseBBInfo(unsigned IX) : BBInfo(IX), CountValid(false) {}
+
+  UseBBInfo(unsigned IX, uint64_t C)
+      : BBInfo(IX), CountValue(C), CountValid(true) {}
+
+  // Set the profile count value for this BB.
+  void setBBInfoCount(uint64_t Value) {
+    CountValue = Value;
+    CountValid = true;
+  }
+
+  // Return the information string of this object.
+  const std::string infoString() const {
+    if (!CountValid)
+      return BBInfo::infoString();
+    return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str();
+  }
+
+  // Add an OutEdge and update the edge count.
+  void addOutEdge(PGOUseEdge *E) {
+    OutEdges.push_back(E);
+    UnknownCountOutEdge++;
+  }
+
+  // Add an InEdge and update the edge count.
+  void addInEdge(PGOUseEdge *E) {
+    InEdges.push_back(E);
+    UnknownCountInEdge++;
+  }
+};
+
+} // end anonymous namespace
+
+// Sum up the count values for all the edges.
+static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) {
+  uint64_t Total = 0;
+  for (auto &E : Edges) {
+    if (E->Removed)
+      continue;
+    Total += E->CountValue;
+  }
+  return Total;
+}
+
+namespace {
+
+class PGOUseFunc {
+public:
+  PGOUseFunc(Function &Func, Module *Modu,
+             std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+             BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin,
+             ProfileSummaryInfo *PSI, bool IsCS)
+      : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
+        FuncInfo(Func, ComdatMembers, false, BPI, BFIin, IsCS),
+        FreqAttr(FFA_Normal), IsCS(IsCS) {}
+
+  // Read counts for the instrumented BBs from the profile.
+  bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros);
+
+  // Populate the counts for all BBs.
+  void populateCounters();
+
+  // Set the branch weights based on the count values.
+  void setBranchWeights();
+
+  // Annotate the value profile call sites for all value kinds.
+  void annotateValueSites();
+
+  // Annotate the value profile call sites for one value kind.
+  void annotateValueSites(uint32_t Kind);
+
+  // Annotate the irreducible loop header weights.
+  void annotateIrrLoopHeaderWeights();
+
+  // The hotness of the function from the profile count.
+  enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
+
+  // Return the function hotness from the profile.
+  FuncFreqAttr getFuncFreqAttr() const { return FreqAttr; }
+
+  // Return the function hash.
+  uint64_t getFuncHash() const { return FuncInfo.FunctionHash; }
+
+  // Return the profile record for this function.
+  InstrProfRecord &getProfileRecord() { return ProfileRecord; }
+
+  // Return the auxiliary BB information.
+  UseBBInfo &getBBInfo(const BasicBlock *BB) const {
+    return FuncInfo.getBBInfo(BB);
+  }
+
+  // Return the auxiliary BB information if available.
+  UseBBInfo *findBBInfo(const BasicBlock *BB) const {
+    return FuncInfo.findBBInfo(BB);
+  }
+
+  Function &getFunc() const { return F; }
+
+  void dumpInfo(std::string Str = "") const {
+    FuncInfo.dumpInfo(Str);
+  }
+
+  uint64_t getProgramMaxCount() const { return ProgramMaxCount; }
+
+private:
+  Function &F;
+  Module *M;
+  BlockFrequencyInfo *BFI;
+  ProfileSummaryInfo *PSI;
+
+  // This member stores the instrumentation information shared with the
+  // profile-generation path.
+  FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
+
+  // The maximum count value in the profile. This is only used in PGO use
+  // compilation.
+  uint64_t ProgramMaxCount;
+
+  // Position of the next counter to be read.
+  uint32_t CountPosition = 0;
+
+  // Total size of the profile count for this function.
+  uint32_t ProfileCountSize = 0;
+
+  // ProfileRecord for this function.
+  InstrProfRecord ProfileRecord;
+
+  // Function hotness info derived from the profile.
+  FuncFreqAttr FreqAttr;
+
+  // Whether to use the context-sensitive profile.
+  bool IsCS;
+
+  // Find the instrumented BBs and set their count values. Return false on
+  // error.
+  bool setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
+
+  // Set the edge counter value for the unknown edge -- there should be only
+  // one unknown edge.
+  void setEdgeCount(DirectEdges &Edges, uint64_t Value);
+
+  // Return the FuncName string.
+  const std::string getFuncName() const { return FuncInfo.FuncName; }
+
+  // Set the hot/cold inline hints based on the count values.
+  // FIXME: This function should be removed once the functionality in
+  // the inliner is implemented.
+  void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
+    if (PSI->isHotCount(EntryCount))
+      FreqAttr = FFA_Hot;
+    else if (PSI->isColdCount(MaxCount))
+      FreqAttr = FFA_Cold;
+  }
+};
+
+} // end anonymous namespace
+
+// Visit all the edges and assign the count values for the instrumented edges
+// and the BBs. Return false on error.
+bool PGOUseFunc::setInstrumentedCounts(
+    const std::vector<uint64_t> &CountFromProfile) {
+
+  std::vector<BasicBlock *> InstrumentBBs;
+  FuncInfo.getInstrumentBBs(InstrumentBBs);
+  unsigned NumCounters =
+      InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
+  // The number of counters here should match the number of counters in the
+  // profile. Return false if they mismatch.
+  if (NumCounters != CountFromProfile.size()) {
+    return false;
+  }
+  // Set the profile count for the instrumented BBs.
+  uint32_t I = 0;
+  for (BasicBlock *InstrBB : InstrumentBBs) {
+    uint64_t CountValue = CountFromProfile[I++];
+    UseBBInfo &Info = getBBInfo(InstrBB);
+    Info.setBBInfoCount(CountValue);
+  }
+  ProfileCountSize = CountFromProfile.size();
+  CountPosition = I;
+
+  // Set the edge count and update the count of unknown edges for BBs.
+  auto setEdgeCount = [this](PGOUseEdge *E, uint64_t Value) -> void {
+    E->setEdgeCount(Value);
+    this->getBBInfo(E->SrcBB).UnknownCountOutEdge--;
+    this->getBBInfo(E->DestBB).UnknownCountInEdge--;
+  };
+
+  // Set the profile count for the instrumented edges. There are edges that
+  // are not in the MST but were nevertheless not instrumented (e.g. the BB
+  // could not be instrumented). Their count values need to be set so that we
+  // can populate the profile counts later.
+  for (auto &E : FuncInfo.MST.AllEdges) {
+    if (E->Removed || E->InMST)
+      continue;
+    const BasicBlock *SrcBB = E->SrcBB;
+    UseBBInfo &SrcInfo = getBBInfo(SrcBB);
+
+    // If there is only one out-edge, the edge profile count should be the
+    // same as the BB profile count.
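+    // (For example, a BB that ends in an unconditional branch has a single
+    // out-edge, which must carry the entire BB count.)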
+    if (SrcInfo.CountValid && SrcInfo.OutEdges.size() == 1)
+      setEdgeCount(E.get(), SrcInfo.CountValue);
+    else {
+      const BasicBlock *DestBB = E->DestBB;
+      UseBBInfo &DestInfo = getBBInfo(DestBB);
+      // If there is only one in-edge, the edge profile count should be the
+      // same as the BB profile count.
+      if (DestInfo.CountValid && DestInfo.InEdges.size() == 1)
+        setEdgeCount(E.get(), DestInfo.CountValue);
+    }
+    if (E->CountValid)
+      continue;
+    // E's count should have been set from the profile. If not, this means E
+    // skipped instrumentation. We set the count to 0.
+    setEdgeCount(E.get(), 0);
+  }
+  return true;
+}
+
+// Set the count value for the unknown edge. There should be one and only one
+// unknown edge in the Edges vector.
+void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
+  for (auto &E : Edges) {
+    if (E->CountValid)
+      continue;
+    E->setEdgeCount(Value);
+
+    getBBInfo(E->SrcBB).UnknownCountOutEdge--;
+    getBBInfo(E->DestBB).UnknownCountInEdge--;
+    return;
+  }
+  llvm_unreachable("Cannot find the unknown count edge");
+}
+
+// Read the profile from ProfileFileName and assign the values to the
+// instrumented BBs and edges. This function also updates ProgramMaxCount.
+// Return true if the profile is successfully read, and false on errors.
+bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader,
+                              bool &AllZeros) {
+  auto &Ctx = M->getContext();
+  Expected<InstrProfRecord> Result =
+      PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
+  if (Error E = Result.takeError()) {
+    handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+      auto Err = IPE.get();
+      bool SkipWarning = false;
+      LLVM_DEBUG(dbgs() << "Error in reading profile for Func "
+                        << FuncInfo.FuncName << ": ");
+      if (Err == instrprof_error::unknown_function) {
+        IsCS ? NumOfCSPGOMissing++ : NumOfPGOMissing++;
+        SkipWarning = !PGOWarnMissing;
+        LLVM_DEBUG(dbgs() << "unknown function");
+      } else if (Err == instrprof_error::hash_mismatch ||
+                 Err == instrprof_error::malformed) {
+        IsCS ? NumOfCSPGOMismatch++ : NumOfPGOMismatch++;
+        SkipWarning =
+            NoPGOWarnMismatch ||
+            (NoPGOWarnMismatchComdat &&
+             (F.hasComdat() ||
+              F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
+        LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
+      }
+
+      LLVM_DEBUG(dbgs() << " IsCS=" << IsCS << "\n");
+      if (SkipWarning)
+        return;
+
+      std::string Msg = IPE.message() + std::string(" ") + F.getName().str() +
+                        std::string(" Hash = ") +
+                        std::to_string(FuncInfo.FunctionHash);
+
+      Ctx.diagnose(
+          DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+    });
+    return false;
+  }
+  ProfileRecord = std::move(Result.get());
+  std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
+
+  IsCS ? NumOfCSPGOFunc++ : NumOfPGOFunc++;
+  LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
+  uint64_t ValueSum = 0;
+  for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
+    LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
+    ValueSum += CountFromProfile[I];
+  }
+  AllZeros = (ValueSum == 0);
+
+  LLVM_DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
+
+  getBBInfo(nullptr).UnknownCountOutEdge = 2;
+  getBBInfo(nullptr).UnknownCountInEdge = 2;
+
+  if (!setInstrumentedCounts(CountFromProfile)) {
+    LLVM_DEBUG(
+        dbgs() << "Inconsistent number of counts, skipping this function");
+    Ctx.diagnose(DiagnosticInfoPGOProfile(
+        M->getName().data(),
+        Twine("Inconsistent number of counts in ") + F.getName().str() +
+            Twine(": the profile may be stale or there is a function name "
+                  "collision."),
+        DS_Warning));
+    return false;
+  }
+  ProgramMaxCount = PGOReader->getMaximumFunctionCount(IsCS);
+  return true;
+}
+
+// Populate the counters from the instrumented BBs to all BBs.
+// At the end of this operation, all BBs should have a valid count value.
+void PGOUseFunc::populateCounters() {
+  bool Changes = true;
+  unsigned NumPasses = 0;
+  while (Changes) {
+    NumPasses++;
+    Changes = false;
+
+    // For efficient traversal, it's better to start from the end, as most
+    // of the instrumented edges are at the end.
+    for (auto &BB : reverse(F)) {
+      UseBBInfo *Count = findBBInfo(&BB);
+      if (Count == nullptr)
+        continue;
+      if (!Count->CountValid) {
+        if (Count->UnknownCountOutEdge == 0) {
+          Count->CountValue = sumEdgeCount(Count->OutEdges);
+          Count->CountValid = true;
+          Changes = true;
+        } else if (Count->UnknownCountInEdge == 0) {
+          Count->CountValue = sumEdgeCount(Count->InEdges);
+          Count->CountValid = true;
+          Changes = true;
+        }
+      }
+      if (Count->CountValid) {
+        if (Count->UnknownCountOutEdge == 1) {
+          uint64_t Total = 0;
+          uint64_t OutSum = sumEdgeCount(Count->OutEdges);
+          // If one of the successor blocks can terminate early (no-return),
+          // we can end up in a situation where the out-edge sum is larger
+          // than the BB count, as the source BB's count is collected by a
+          // post-dominated block.
+          if (Count->CountValue > OutSum)
+            Total = Count->CountValue - OutSum;
+          setEdgeCount(Count->OutEdges, Total);
+          Changes = true;
+        }
+        if (Count->UnknownCountInEdge == 1) {
+          uint64_t Total = 0;
+          uint64_t InSum = sumEdgeCount(Count->InEdges);
+          if (Count->CountValue > InSum)
+            Total = Count->CountValue - InSum;
+          setEdgeCount(Count->InEdges, Total);
+          Changes = true;
+        }
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
+#ifndef NDEBUG
+  // Assert every BB has a valid counter.
+  for (auto &BB : F) {
+    auto BI = findBBInfo(&BB);
+    if (BI == nullptr)
+      continue;
+    assert(BI->CountValid && "BB count is not valid");
+  }
+#endif
+  uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
+  F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real));
+  uint64_t FuncMaxCount = FuncEntryCount;
+  for (auto &BB : F) {
+    auto BI = findBBInfo(&BB);
+    if (BI == nullptr)
+      continue;
+    FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
+  }
+  markFunctionAttributes(FuncEntryCount, FuncMaxCount);
+
+  // Now annotate select instructions.
+  FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
+  assert(CountPosition == ProfileCountSize);
+
+  LLVM_DEBUG(FuncInfo.dumpInfo("after reading profile."));
+}
+
+// Assign the scaled count values to the BBs with multiple out edges.
+void PGOUseFunc::setBranchWeights() {
+  // Generate MD_prof metadata for every branch instruction.
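+  // A sketch of the result (the weights are the scaled edge counts; the
+  // names and numbers here are illustrative, not from the source):
+  //   br i1 %cmp, label %if.then, label %if.else, !prof !1
+  //   !1 = !{!"branch_weights", i32 90, i32 10}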
+ LLVM_DEBUG(dbgs() << "\nSetting branch weights for func " << F.getName() + << " IsCS=" << IsCS << "\n"); + for (auto &BB : F) { + Instruction *TI = BB.getTerminator(); + if (TI->getNumSuccessors() < 2) + continue; + if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) || + isa<IndirectBrInst>(TI))) + continue; + + if (getBBInfo(&BB).CountValue == 0) + continue; + + // We have a non-zero Branch BB. + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + unsigned Size = BBCountInfo.OutEdges.size(); + SmallVector<uint64_t, 2> EdgeCounts(Size, 0); + uint64_t MaxCount = 0; + for (unsigned s = 0; s < Size; s++) { + const PGOUseEdge *E = BBCountInfo.OutEdges[s]; + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + if (DestBB == nullptr) + continue; + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + uint64_t EdgeCount = E->CountValue; + if (EdgeCount > MaxCount) + MaxCount = EdgeCount; + EdgeCounts[SuccNum] = EdgeCount; + } + setProfMetadata(M, TI, EdgeCounts, MaxCount); + } +} + +static bool isIndirectBrTarget(BasicBlock *BB) { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + if (isa<IndirectBrInst>((*PI)->getTerminator())) + return true; + } + return false; +} + +void PGOUseFunc::annotateIrrLoopHeaderWeights() { + LLVM_DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n"); + // Find irr loop headers + for (auto &BB : F) { + // As a heuristic also annotate indrectbr targets as they have a high chance + // to become an irreducible loop header after the indirectbr tail + // duplication. + if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) { + Instruction *TI = BB.getTerminator(); + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue); + } + } +} + +void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { + Module *M = F.getParent(); + IRBuilder<> Builder(&SI); + Type *Int64Ty = Builder.getInt64Ty(); + Type *I8PtrTy = Builder.getInt8PtrTy(); + auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step), + {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), + Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs), + Builder.getInt32(*CurCtrIdx), Step}); + ++(*CurCtrIdx); +} + +void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) { + std::vector<uint64_t> &CountFromProfile = UseFunc->getProfileRecord().Counts; + assert(*CurCtrIdx < CountFromProfile.size() && + "Out of bound access of counters"); + uint64_t SCounts[2]; + SCounts[0] = CountFromProfile[*CurCtrIdx]; // True count + ++(*CurCtrIdx); + uint64_t TotalCount = 0; + auto BI = UseFunc->findBBInfo(SI.getParent()); + if (BI != nullptr) + TotalCount = BI->CountValue; + // False Count + SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0); + uint64_t MaxCount = std::max(SCounts[0], SCounts[1]); + if (MaxCount) + setProfMetadata(F.getParent(), &SI, SCounts, MaxCount); +} + +void SelectInstVisitor::visitSelectInst(SelectInst &SI) { + if (!PGOInstrSelect) + return; + // FIXME: do not handle this yet. + if (SI.getCondition()->getType()->isVectorTy()) + return; + + switch (Mode) { + case VM_counting: + NSIs++; + return; + case VM_instrument: + instrumentOneSelectInst(SI); + return; + case VM_annotate: + annotateOneSelectInst(SI); + return; + } + + llvm_unreachable("Unknown visiting mode"); +} + +// Traverse all valuesites and annotate the instructions for all value kind. 
+void PGOUseFunc::annotateValueSites() {
+  if (DisableValueProfiling)
+    return;
+
+  // Create the PGOFuncName metadata.
+  createPGOFuncNameMetadata(F, FuncInfo.FuncName);
+
+  for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+    annotateValueSites(Kind);
+}
+
+// Annotate the instructions for a specific value kind.
+void PGOUseFunc::annotateValueSites(uint32_t Kind) {
+  assert(Kind <= IPVK_Last);
+  unsigned ValueSiteIndex = 0;
+  auto &ValueSites = FuncInfo.ValueSites[Kind];
+  unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
+  if (NumValueSites != ValueSites.size()) {
+    auto &Ctx = M->getContext();
+    Ctx.diagnose(DiagnosticInfoPGOProfile(
+        M->getName().data(),
+        Twine("Inconsistent number of value sites for ") +
+            Twine(ValueProfKindDescr[Kind]) +
+            Twine(" profiling in \"") + F.getName().str() +
+            Twine("\", possibly due to the use of a stale profile."),
+        DS_Warning));
+    return;
+  }
+
+  for (VPCandidateInfo &I : ValueSites) {
+    LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+                      << "): Index = " << ValueSiteIndex << " out of "
+                      << NumValueSites << "\n");
+    annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord,
+                      static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
+                      Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
+                                             : MaxNumAnnotations);
+    ValueSiteIndex++;
+  }
+}
+
+// Collect the set of members for each Comdat in module M and store
+// them in ComdatMembers.
+static void collectComdatMembers(
+    Module &M,
+    std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+  if (!DoComdatRenaming)
+    return;
+  for (Function &F : M)
+    if (Comdat *C = F.getComdat())
+      ComdatMembers.insert(std::make_pair(C, &F));
+  for (GlobalVariable &GV : M.globals())
+    if (Comdat *C = GV.getComdat())
+      ComdatMembers.insert(std::make_pair(C, &GV));
+  for (GlobalAlias &GA : M.aliases())
+    if (Comdat *C = GA.getComdat())
+      ComdatMembers.insert(std::make_pair(C, &GA));
+}
+
+static bool InstrumentAllFunctions(
+    Module &M, function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+    function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) {
+  // For the context-sensitive instrumentation, we should have a separate
+  // pass (before LTO/ThinLTO linking) to create these variables.
+  if (!IsCS)
+    createIRLevelProfileFlagVar(M, /* IsCS */ false);
+  std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+  collectComdatMembers(M, ComdatMembers);
+
+  for (auto &F : M) {
+    if (F.isDeclaration())
+      continue;
+    auto *BPI = LookupBPI(F);
+    auto *BFI = LookupBFI(F);
+    instrumentOneFunc(F, &M, BPI, BFI, ComdatMembers, IsCS);
+  }
+  return true;
+}
+
+PreservedAnalyses
+PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) {
+  createProfileFileNameVar(M, CSInstrName);
+  createIRLevelProfileFlagVar(M, /* IsCS */ true);
+  return PreservedAnalyses::all();
+}
+
+bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
+  if (skipModule(M))
+    return false;
+
+  auto LookupBPI = [this](Function &F) {
+    return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+  };
+  auto LookupBFI = [this](Function &F) {
+    return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+  };
+  return InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS);
+}
+
+PreservedAnalyses PGOInstrumentationGen::run(Module &M,
+                                             ModuleAnalysisManager &AM) {
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto LookupBPI = [&FAM](Function &F) {
+    return &FAM.getResult<BranchProbabilityAnalysis>(F);
+  };
+
+  auto LookupBFI = [&FAM](Function &F) {
+    return &FAM.getResult<BlockFrequencyAnalysis>(F);
+  };
+
+  if (!InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS))
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
+
+static bool annotateAllFunctions(
+    Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
+    function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+    function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
+    ProfileSummaryInfo *PSI, bool IsCS) {
+  LLVM_DEBUG(dbgs() << "Read in profile counters: ");
+  auto &Ctx = M.getContext();
+  // Read the counter array from file.
+  auto ReaderOrErr =
+      IndexedInstrProfReader::create(ProfileFileName, ProfileRemappingFileName);
+  if (Error E = ReaderOrErr.takeError()) {
+    handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
+      Ctx.diagnose(
+          DiagnosticInfoPGOProfile(ProfileFileName.data(), EI.message()));
+    });
+    return false;
+  }
+
+  std::unique_ptr<IndexedInstrProfReader> PGOReader =
+      std::move(ReaderOrErr.get());
+  if (!PGOReader) {
+    Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(),
+                                          StringRef("Cannot get PGOReader")));
+    return false;
+  }
+  if (!PGOReader->hasCSIRLevelProfile() && IsCS)
+    return false;
+
+  // TODO: might need to change the warning once the clang option is
+  // finalized.
+  if (!PGOReader->isIRLevelProfile()) {
+    Ctx.diagnose(DiagnosticInfoPGOProfile(
+        ProfileFileName.data(), "Not an IR level instrumentation profile"));
+    return false;
+  }
+
+  // Add the profile summary (read from the header of the indexed summary)
+  // here so that we can use it below when reading counters (which checks if
+  // the function should be marked with a cold or inlinehint attribute).
+  M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()),
+                      IsCS ? ProfileSummary::PSK_CSInstr
+                           : ProfileSummary::PSK_Instr);
+
+  std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+  collectComdatMembers(M, ComdatMembers);
+  std::vector<Function *> HotFunctions;
+  std::vector<Function *> ColdFunctions;
+  for (auto &F : M) {
+    if (F.isDeclaration())
+      continue;
+    auto *BPI = LookupBPI(F);
+    auto *BFI = LookupBFI(F);
+    // Split indirectbr critical edges here before computing the MST rather
+    // than later in getInstrBB() to avoid invalidating it.
+    SplitIndirectBrCriticalEdges(F, BPI, BFI);
+    PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, PSI, IsCS);
+    bool AllZeros = false;
+    if (!Func.readCounters(PGOReader.get(), AllZeros))
+      continue;
+    if (AllZeros) {
+      F.setEntryCount(ProfileCount(0, Function::PCT_Real));
+      if (Func.getProgramMaxCount() != 0)
+        ColdFunctions.push_back(&F);
+      continue;
+    }
+    Func.populateCounters();
+    Func.setBranchWeights();
+    Func.annotateValueSites();
+    Func.annotateIrrLoopHeaderWeights();
+    PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
+    if (FreqAttr == PGOUseFunc::FFA_Cold)
+      ColdFunctions.push_back(&F);
+    else if (FreqAttr == PGOUseFunc::FFA_Hot)
+      HotFunctions.push_back(&F);
+    if (PGOViewCounts != PGOVCT_None &&
+        (ViewBlockFreqFuncName.empty() ||
+         F.getName().equals(ViewBlockFreqFuncName))) {
+      LoopInfo LI{DominatorTree(F)};
+      std::unique_ptr<BranchProbabilityInfo> NewBPI =
+          std::make_unique<BranchProbabilityInfo>(F, LI);
+      std::unique_ptr<BlockFrequencyInfo> NewBFI =
+          std::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI);
+      if (PGOViewCounts == PGOVCT_Graph)
+        NewBFI->view();
+      else if (PGOViewCounts == PGOVCT_Text) {
+        dbgs() << "pgo-view-counts: " << Func.getFunc().getName() << "\n";
+        NewBFI->print(dbgs());
+      }
+    }
+    if (PGOViewRawCounts != PGOVCT_None &&
+        (ViewBlockFreqFuncName.empty() ||
+         F.getName().equals(ViewBlockFreqFuncName))) {
+      if (PGOViewRawCounts == PGOVCT_Graph) {
+        if (ViewBlockFreqFuncName.empty())
+          WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+        else
+          ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+      } else if (PGOViewRawCounts == PGOVCT_Text) {
+        dbgs() << "pgo-view-raw-counts: " << Func.getFunc().getName() << "\n";
+        Func.dumpInfo();
+      }
+    }
+  }
+
+  // Set function hotness attribute from the profile.
+  // We have to apply these attributes at the end because their presence
+  // can affect the BranchProbabilityInfo of any callers, resulting in an
+  // inconsistent MST between prof-gen and prof-use.
+  for (auto &F : HotFunctions) {
+    F->addFnAttr(Attribute::InlineHint);
+    LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
+                      << "\n");
+  }
+  for (auto &F : ColdFunctions) {
+    F->addFnAttr(Attribute::Cold);
+    LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName()
+                      << "\n");
+  }
+  return true;
+}
+
+PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename,
+                                             std::string RemappingFilename,
+                                             bool IsCS)
+    : ProfileFileName(std::move(Filename)),
+      ProfileRemappingFileName(std::move(RemappingFilename)), IsCS(IsCS) {
+  if (!PGOTestProfileFile.empty())
+    ProfileFileName = PGOTestProfileFile;
+  if (!PGOTestProfileRemappingFile.empty())
+    ProfileRemappingFileName = PGOTestProfileRemappingFile;
+}
+
+PreservedAnalyses PGOInstrumentationUse::run(Module &M,
+                                             ModuleAnalysisManager &AM) {
+
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto LookupBPI = [&FAM](Function &F) {
+    return &FAM.getResult<BranchProbabilityAnalysis>(F);
+  };
+
+  auto LookupBFI = [&FAM](Function &F) {
+    return &FAM.getResult<BlockFrequencyAnalysis>(F);
+  };
+
+  auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+  if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
+                            LookupBPI, LookupBFI, PSI, IsCS))
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
+
+bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
+  if (skipModule(M))
+    return false;
+
+  auto LookupBPI = [this](Function &F) {
+    return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+  };
+  auto LookupBFI = [this](Function &F) {
+    return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+  };
+
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI,
+                              PSI, IsCS);
+}
+
+static std::string getSimpleNodeName(const BasicBlock *Node) {
+  if (!Node->getName().empty())
+    return Node->getName();
+
+  std::string SimpleNodeName;
+  raw_string_ostream OS(SimpleNodeName);
+  Node->printAsOperand(OS, false);
+  return OS.str();
+}
+
+void llvm::setProfMetadata(Module *M, Instruction *TI,
+                           ArrayRef<uint64_t> EdgeCounts,
+                           uint64_t MaxCount) {
+  MDBuilder MDB(M->getContext());
+  assert(MaxCount > 0 && "Bad max count");
+  uint64_t Scale = calculateCountScale(MaxCount);
+  SmallVector<unsigned, 4> Weights;
+  for (const auto &ECI : EdgeCounts)
+    Weights.push_back(scaleBranchCount(ECI, Scale));
+
+  LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W : Weights) {
+    dbgs() << W << " ";
+  } dbgs() << "\n";);
+
+  misexpect::verifyMisExpect(TI, Weights, TI->getContext());
+
+  TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+  if (EmitBranchProbability) {
+    std::string BrCondStr = getBranchCondString(TI);
+    if (BrCondStr.empty())
+      return;
+
+    uint64_t WSum =
+        std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0,
+                        [](uint64_t w1, uint64_t w2) { return w1 + w2; });
+    uint64_t TotalCount =
+        std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0,
+                        [](uint64_t c1, uint64_t c2) { return c1 + c2; });
+    Scale = calculateCountScale(WSum);
+    BranchProbability BP(scaleBranchCount(Weights[0], Scale),
+                         scaleBranchCount(WSum, Scale));
+    std::string BranchProbStr;
+    raw_string_ostream OS(BranchProbStr);
+    OS << BP;
+    OS << " (total count : " << TotalCount << ")";
+    OS.flush();
+    Function *F = TI->getParent()->getParent();
+    OptimizationRemarkEmitter ORE(F);
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "pgo-instrumentation", TI)
+             << BrCondStr << " is true with probability : " << BranchProbStr;
+    });
+  }
+}
+
+namespace llvm {
+
+void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
+  MDBuilder MDB(M->getContext());
+  TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
+                  MDB.createIrrLoopHeaderWeight(Count));
+}
+
+template <> struct GraphTraits<PGOUseFunc *> {
+  using NodeRef = const BasicBlock *;
+  using ChildIteratorType = succ_const_iterator;
+  using nodes_iterator = pointer_iterator<Function::const_iterator>;
+
+  static NodeRef getEntryNode(const PGOUseFunc *G) {
+    return &G->getFunc().front();
+  }
+
+  static ChildIteratorType child_begin(const NodeRef N) {
+    return succ_begin(N);
+  }
+
+  static ChildIteratorType child_end(const NodeRef N) { return succ_end(N); }
+
+  static nodes_iterator nodes_begin(const PGOUseFunc *G) {
+    return nodes_iterator(G->getFunc().begin());
+  }
+
+  static nodes_iterator nodes_end(const PGOUseFunc *G) {
+    return nodes_iterator(G->getFunc().end());
+  }
+};
+
+template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
+  explicit DOTGraphTraits(bool isSimple = false)
+      : DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(const PGOUseFunc *G) {
+    return G->getFunc().getName();
+  }
+
+  std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
+    std::string Result;
+    raw_string_ostream OS(Result);
+
+    OS << getSimpleNodeName(Node) << ":\\l";
+    UseBBInfo *BI = Graph->findBBInfo(Node);
+    OS << "Count : ";
+    if (BI && BI->CountValid)
+      OS << BI->CountValue << "\\l";
+    else
+      OS << "Unknown\\l";
+
+    if (!PGOInstrSelect)
+      return Result;
+
+    for (auto BI = Node->begin(); BI != Node->end(); ++BI) {
+      auto *I = &*BI;
+      if (!isa<SelectInst>(I))
+        continue;
+      // Display scaled counts for SELECT instructions:
+      OS << "SELECT : { T = ";
+      uint64_t TC, FC;
+      bool HasProf = I->extractProfMetadata(TC, FC);
+      if (!HasProf)
+        OS << "Unknown, F = Unknown }\\l";
+      else
+        OS << TC << ", F = " << FC << " }\\l";
+    }
+    return Result;
+  }
+};
+
+} // end namespace llvm
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
new file mode 100644
index 000000000000..9f81bb16d0a7
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -0,0 +1,452 @@
+//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that optimizes memory intrinsics
+// such as memcpy using the size value profile. When memory intrinsic size
+// value profile metadata is available, a single memory intrinsic is expanded
+// to a sequence of guarded specialized versions that are called with the
+// hottest size(s), for later expansion into more optimal inline sequences.
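+// For example (the sizes and ratios here are illustrative, not from the
+// source): if the size profile of a memcpy call says the size is 8 for 90%
+// of executions, the call can be rewritten as
+//   if (size == 8) memcpy(dst, src, 8);    // constant size; inlined later
+//   else           memcpy(dst, src, size); // original call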
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-memop-opt"
+
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden,
+                        cl::ZeroOrMore, cl::init(1000),
+                        cl::desc("The minimum count to optimize memory "
+                                 "intrinsic calls"));
+
+// Command line option to disable the memory intrinsic optimization. The
+// default is false. This is for debugging purposes.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+                                     cl::Hidden,
+                                     cl::desc("Disable the memory intrinsic "
+                                              "optimization"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+                          cl::Hidden, cl::ZeroOrMore,
+                          cl::desc("The percentage threshold for the "
+                                   "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+                    cl::ZeroOrMore,
+                    cl::desc("The max version for the optimized memory "
+                             "intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+                    cl::desc("Scale the memop size counts using the basic "
+                             "block count value"));
+
+// This option sets the range of precise profile memop sizes.
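+// (The string is parsed by getMemOPSizeRangeFromOption, presumably as a
+// colon-separated "Start:Last" pair such as "0:8"; sizes in that inclusive
+// range keep their exact profile counts. See that helper's definition for
+// the authoritative format.)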
+extern cl::opt<std::string> MemOPSizeRange;
+
+// This option sets the value that groups large memop sizes.
+extern cl::opt<unsigned> MemOPSizeLarge;
+
+namespace {
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+    initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+};
+} // end anonymous namespace
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+                      "Optimize memory intrinsic using its size value profile",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+                    "Optimize memory intrinsic using its size value profile",
+                    false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+  return new PGOMemOPSizeOptLegacyPass();
+}
+
+namespace {
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
+               OptimizationRemarkEmitter &ORE, DominatorTree *DT)
+      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
+    ValueDataArray =
+        std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+    // Get the MemOPSize range information from the option MemOPSizeRange.
+    getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
+                                PreciseRangeLast);
+  }
+  bool isChanged() const { return Changed; }
+  void perform() {
+    WorkList.clear();
+    visit(Func);
+
+    for (auto &MI : WorkList) {
+      ++NumOfPGOMemOPAnnotate;
+      if (perform(MI)) {
+        Changed = true;
+        ++NumOfPGOMemOPOpt;
+        LLVM_DEBUG(dbgs() << "MemOP call: "
+                          << MI->getCalledFunction()->getName()
+                          << " is transformed.\n");
+      }
+    }
+  }
+
+  void visitMemIntrinsic(MemIntrinsic &MI) {
+    Value *Length = MI.getLength();
+    // Don't perform the optimization on constant-length calls.
+    if (isa<ConstantInt>(Length))
+      return;
+    WorkList.push_back(&MI);
+  }
+
+private:
+  Function &Func;
+  BlockFrequencyInfo &BFI;
+  OptimizationRemarkEmitter &ORE;
+  DominatorTree *DT;
+  bool Changed;
+  std::vector<MemIntrinsic *> WorkList;
+  // Start of the precise range.
+  int64_t PreciseRangeStart;
+  // Last value of the precise range.
+  int64_t PreciseRangeLast;
+  // The space to read the profile annotation.
+  std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+  bool perform(MemIntrinsic *MI);
+
+  // This kind shows which group the value falls in. For PreciseValue, we have
+  // the profile count for that value. LargeGroup groups the values that are
+  // in the range [LargeValue, +inf). NonLargeGroup groups the rest of the
+  // values.
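+  // For example, with an illustrative precise range of [0, 8] and
+  // MemOPSizeLarge == 4096: a profiled size of 5 maps to PreciseValue, the
+  // sentinel 9 (PreciseRangeLast + 1, which buckets the remaining non-large
+  // sizes) maps to NonLargeGroup, and 4096 maps to LargeGroup.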
+  enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
+
+  MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
+    if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
+      return LargeGroup;
+    if (Value == PreciseRangeLast + 1)
+      return NonLargeGroup;
+    return PreciseValue;
+  }
+};
+
+static const char *getMIName(const MemIntrinsic *MI) {
+  switch (MI->getIntrinsicID()) {
+  case Intrinsic::memcpy:
+    return "memcpy";
+  case Intrinsic::memmove:
+    return "memmove";
+  case Intrinsic::memset:
+    return "memset";
+  default:
+    return "unknown";
+  }
+}
+
+static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
+  assert(Count <= TotalCount);
+  if (Count < MemOPCountThreshold)
+    return false;
+  if (Count < TotalCount * MemOPPercentThreshold / 100)
+    return false;
+  return true;
+}
+
+static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
+                                      uint64_t Denom) {
+  if (!MemOPScaleCount)
+    return Count;
+  bool Overflowed;
+  uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
+  return ScaleCount / Denom;
+}
+
+bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
+  assert(MI);
+  if (MI->getIntrinsicID() == Intrinsic::memmove)
+    return false;
+
+  uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
+  uint64_t TotalCount;
+  if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
+                                ValueDataArray.get(), NumVals, TotalCount))
+    return false;
+
+  uint64_t ActualCount = TotalCount;
+  uint64_t SavedTotalCount = TotalCount;
+  if (MemOPScaleCount) {
+    auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
+    if (!BBEdgeCount)
+      return false;
+    ActualCount = *BBEdgeCount;
+  }
+
+  ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
+  LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
+                    << ActualCount << "\n");
+  LLVM_DEBUG(for (auto &VD : VDs) {
+    dbgs() << " (" << VD.Value << "," << VD.Count << ")\n";
+  });
+
+  if (ActualCount < MemOPCountThreshold)
+    return false;
+  // Skip if the total value profiled count is 0, in which case we can't
+  // scale up the counts properly (and there is no profitable transformation).
+  if (TotalCount == 0)
+    return false;
+
+  TotalCount = ActualCount;
+  if (MemOPScaleCount)
+    LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+                      << " denominator = " << SavedTotalCount << "\n");
+
+  // Keep track of the count of the default case:
+  uint64_t RemainCount = TotalCount;
+  uint64_t SavedRemainCount = SavedTotalCount;
+  SmallVector<uint64_t, 16> SizeIds;
+  SmallVector<uint64_t, 16> CaseCounts;
+  uint64_t MaxCount = 0;
+  unsigned Version = 0;
+  // The default case is in the front -- save the slot here.
+  CaseCounts.push_back(0);
+  for (auto &VD : VDs) {
+    int64_t V = VD.Value;
+    uint64_t C = VD.Count;
+    if (MemOPScaleCount)
+      C = getScaledCount(C, ActualCount, SavedTotalCount);
+
+    // Only precise values are of interest here.
+    if (getMemOPSizeKind(V) != PreciseValue)
+      continue;
+
+    // ValueCounts are sorted by count. Break at the first unprofitable
+    // value.
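+    // (With the default thresholds declared above, a size must cover at
+    // least 1000 executions and at least 40% of the remaining total count to
+    // earn its own version: 50k out of 100k qualifies, 10k does not.)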
+    if (!isProfitable(C, RemainCount))
+      break;
+
+    SizeIds.push_back(V);
+    CaseCounts.push_back(C);
+    if (C > MaxCount)
+      MaxCount = C;
+
+    assert(RemainCount >= C);
+    RemainCount -= C;
+    assert(SavedRemainCount >= VD.Count);
+    SavedRemainCount -= VD.Count;
+
+    if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
+      break;
+  }
+
+  if (Version == 0)
+    return false;
+
+  CaseCounts[0] = RemainCount;
+  if (RemainCount > MaxCount)
+    MaxCount = RemainCount;
+
+  uint64_t SumForOpt = TotalCount - RemainCount;
+
+  LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+                    << " Versions (covering " << SumForOpt << " out of "
+                    << TotalCount << ")\n");
+
+  // mem_op(..., size)
+  // ==>
+  // switch (size) {
+  //   case s1:
+  //      mem_op(..., s1);
+  //      goto merge_bb;
+  //   case s2:
+  //      mem_op(..., s2);
+  //      goto merge_bb;
+  //   ...
+  //   default:
+  //      mem_op(..., size);
+  //      goto merge_bb;
+  // }
+  // merge_bb:
+
+  BasicBlock *BB = MI->getParent();
+  LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+  LLVM_DEBUG(dbgs() << *BB << "\n");
+  auto OrigBBFreq = BFI.getBlockFreq(BB);
+
+  BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
+  BasicBlock::iterator It(*MI);
+  ++It;
+  assert(It != DefaultBB->end());
+  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
+  MergeBB->setName("MemOP.Merge");
+  BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
+  DefaultBB->setName("MemOP.Default");
+
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  auto &Ctx = Func.getContext();
+  IRBuilder<> IRB(BB);
+  BB->getTerminator()->eraseFromParent();
+  Value *SizeVar = MI->getLength();
+  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+
+  // Clear the value profile data.
+  MI->setMetadata(LLVMContext::MD_prof, nullptr);
+  // If all sizes were promoted, we don't need the MD_prof metadata;
+  // otherwise update it with the un-promoted records.
+  if (SavedRemainCount > 0 || Version != NumVals)
+    annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
+                      SavedRemainCount, IPVK_MemOPSize, NumVals);
+
+  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After ==\n");
+
+  std::vector<DominatorTree::UpdateType> Updates;
+  if (DT)
+    Updates.reserve(2 * SizeIds.size());
+
+  for (uint64_t SizeId : SizeIds) {
+    BasicBlock *CaseBB = BasicBlock::Create(
+        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
+    Instruction *NewInst = MI->clone();
+    // Fix the size argument.
+    auto *MemI = cast<MemIntrinsic>(NewInst);
+    auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
+    assert(SizeType && "Expected integer type size argument.");
+    ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
+    MemI->setLength(CaseSizeId);
+    CaseBB->getInstList().push_back(NewInst);
+    IRBuilder<> IRBCase(CaseBB);
+    IRBCase.CreateBr(MergeBB);
+    SI->addCase(CaseSizeId, CaseBB);
+    if (DT) {
+      Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
+      Updates.push_back({DominatorTree::Insert, BB, CaseBB});
+    }
+    LLVM_DEBUG(dbgs() << *CaseBB << "\n");
+  }
+  DTU.applyUpdates(Updates);
+  Updates.clear();
+
+  setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+
+  LLVM_DEBUG(dbgs() << *BB << "\n");
+  LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
+  LLVM_DEBUG(dbgs() << *MergeBB << "\n");
+
+  ORE.emit([&]() {
+    using namespace ore;
+    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
+           << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
+           << " with count " << NV("Count", SumForOpt) << " out of "
+           << NV("Total", TotalCount) << " for " << NV("Versions", Version)
+           << " versions";
+  });
+
+  return true;
+}
+} // namespace
+
+static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
+                                OptimizationRemarkEmitter &ORE,
+                                DominatorTree *DT) {
+  if (DisableMemOPOPT)
+    return false;
+
+  if (F.hasFnAttribute(Attribute::OptimizeForSize))
+    return false;
+  MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
+  MemOPSizeOpt.perform();
+  return MemOPSizeOpt.isChanged();
+}
+
+bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
+  BlockFrequencyInfo &BFI =
+      getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+  auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
+}
+
+namespace llvm {
+char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
+
+PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
+                                       FunctionAnalysisManager &FAM) {
+  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+  bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
+  if (!Changed)
+    return PreservedAnalyses::all();
+  auto PA = PreservedAnalyses();
+  PA.preserve<GlobalsAA>();
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
new file mode 100644
index 000000000000..81d92e724c7d
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
@@ -0,0 +1,357 @@
+//===- PoisonChecking.cpp - -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements a transform pass which instruments IR such that poison semantics
+// are made explicit. That is, it provides a (possibly partial) executable
+// semantics for every instruction w.r.t. poison as specified in the LLVM
+// LangRef. There are obvious parallels to the sanitizer tools, but this pass
+// is focused purely on the semantics of LLVM IR, not any particular source
+// language. If you're looking for something to check whether your C/C++
+// contains UB, this is not it.
+//
+// The rewritten semantics of each instruction will include the following
+// components:
+//
+// 1) The original instruction, unmodified.
+// 2) A propagation rule which translates dynamic information about the poison
+//    state of each input to whether the dynamic output of the instruction
+//    produces poison.
+// 3) A flag validation rule which validates any poison producing flags on the
+//    instruction itself (e.g. checks for overflow on nsw).
+// 4) A check rule which traps (to a handler function) if this instruction
+//    must execute undefined behavior given the poison state of its inputs.
+//
+// At the moment, the UB detection is done in a best-effort manner; that is,
+// the resulting code may produce a false negative result (not report UB when
+// it actually exists according to the LangRef spec), but should never produce
+// a false positive (report UB where it doesn't exist). The intention is to
+// eventually support a "strict" mode which never dynamically reports a false
+// negative, at the cost of rejecting some valid inputs to translation.
+//
+// Use cases for this pass include:
+// - Understanding (and testing!) the implications of the definition of poison
+//   from the LangRef.
+// - Validating the output of an IR fuzzer to ensure that all programs
+//   produced are well defined on the specific input used.
+// - Finding/confirming poison specific miscompiles by checking that the
+//   poison status of an input/IR pair is the same before and after an
+//   optimization transform.
+// - Checking that a bugpoint reduction does not introduce UB which didn't
+//   exist in the original program being reduced.
+//
+// The major sources of inaccuracy are currently:
+// - Most validation rules are not yet implemented for instructions with
+//   poison relevant flags. At the moment, only nsw/nuw on add/sub are
+//   supported.
+// - UB which is control dependent on a branch on poison is not yet
+//   reported. Currently, only data flow dependence is modeled.
+// - Poison which is propagated through memory is not modeled. As such,
+//   storing poison to memory and then reloading it will cause a false
+//   negative, as we consider the reloaded value not to be poisoned.
+// - Poison propagation across function boundaries is not modeled. At the
+//   moment, all arguments and return values are assumed not to be poison.
+// - Undef is not modeled. In particular, the optimizer's freedom to pick
+//   concrete values for undef bits so as to maximize potential for producing
+//   poison is not modeled.
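+//
+// For example (an editor's sketch of the instrumented form; the names are
+// illustrative), for
+//   %res = add nsw i32 %a, %b
+// the pass emits roughly:
+//   %wo     = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+//   %ov     = extractvalue { i32, i1 } %wo, 1   ; was the nsw flag violated?
+//   %t      = or i1 %a.poison, %b.poison        ; propagation from inputs
+//   %res.p  = or i1 %t, %ov                     ; combined poison flag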
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/PoisonChecking.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "poison-checking" + +static cl::opt<bool> +LocalCheck("poison-checking-function-local", + cl::init(false), + cl::desc("Check that returns are non-poison (for testing)")); + + +static bool isConstantFalse(Value* V) { + assert(V->getType()->isIntegerTy(1)); + if (auto *CI = dyn_cast<ConstantInt>(V)) + return CI->isZero(); + return false; +} + +static Value *buildOrChain(IRBuilder<> &B, ArrayRef<Value*> Ops) { + if (Ops.size() == 0) + return B.getFalse(); + unsigned i = 0; + for (; i < Ops.size() && isConstantFalse(Ops[i]); i++) {} + if (i == Ops.size()) + return B.getFalse(); + Value *Accum = Ops[i++]; + for (; i < Ops.size(); i++) + if (!isConstantFalse(Ops[i])) + Accum = B.CreateOr(Accum, Ops[i]); + return Accum; +} + +static void generatePoisonChecksForBinOp(Instruction &I, + SmallVector<Value*, 2> &Checks) { + assert(isa<BinaryOperator>(I)); + + IRBuilder<> B(&I); + Value *LHS = I.getOperand(0); + Value *RHS = I.getOperand(1); + switch (I.getOpcode()) { + default: + return; + case Instruction::Add: { + if (I.hasNoSignedWrap()) { + auto *OverflowOp = + B.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, LHS, RHS); + Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); + } + if (I.hasNoUnsignedWrap()) { + auto *OverflowOp = + B.CreateBinaryIntrinsic(Intrinsic::uadd_with_overflow, LHS, RHS); + Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); + } + break; + } + case Instruction::Sub: { + if (I.hasNoSignedWrap()) { + auto *OverflowOp = + B.CreateBinaryIntrinsic(Intrinsic::ssub_with_overflow, LHS, RHS); + Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); + } + if (I.hasNoUnsignedWrap()) { + auto *OverflowOp = + B.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, LHS, RHS); + Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); + } + break; + } + case Instruction::Mul: { + if (I.hasNoSignedWrap()) { + auto *OverflowOp = + B.CreateBinaryIntrinsic(Intrinsic::smul_with_overflow, LHS, RHS); + Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); + } + if (I.hasNoUnsignedWrap()) { + auto *OverflowOp = + B.CreateBinaryIntrinsic(Intrinsic::umul_with_overflow, LHS, RHS); + Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); + } + break; + } + case Instruction::UDiv: { + if (I.isExact()) { + auto *Check = + B.CreateICmp(ICmpInst::ICMP_NE, B.CreateURem(LHS, RHS), + ConstantInt::get(LHS->getType(), 0)); + Checks.push_back(Check); + } + break; + } + case Instruction::SDiv: { + if (I.isExact()) { + auto *Check = + B.CreateICmp(ICmpInst::ICMP_NE, B.CreateSRem(LHS, RHS), + ConstantInt::get(LHS->getType(), 0)); + Checks.push_back(Check); + } + break; + } + case Instruction::AShr: + case Instruction::LShr: + case Instruction::Shl: { + Value *ShiftCheck = + B.CreateICmp(ICmpInst::ICMP_UGE, RHS, + ConstantInt::get(RHS->getType(), + LHS->getType()->getScalarSizeInBits())); + Checks.push_back(ShiftCheck); + break; + } + }; +} + +static Value* generatePoisonChecks(Instruction &I) { + IRBuilder<> B(&I); + SmallVector<Value*, 2> Checks; + if (isa<BinaryOperator>(I) && 
!I.getType()->isVectorTy())
+    generatePoisonChecksForBinOp(I, Checks);
+
+  // Handle non-binops separately
+  switch (I.getOpcode()) {
+  default:
+    break;
+  case Instruction::ExtractElement: {
+    Value *Vec = I.getOperand(0);
+    if (Vec->getType()->getVectorIsScalable())
+      break;
+    Value *Idx = I.getOperand(1);
+    unsigned NumElts = Vec->getType()->getVectorNumElements();
+    Value *Check =
+      B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
+                   ConstantInt::get(Idx->getType(), NumElts));
+    Checks.push_back(Check);
+    break;
+  }
+  case Instruction::InsertElement: {
+    Value *Vec = I.getOperand(0);
+    if (Vec->getType()->getVectorIsScalable())
+      break;
+    Value *Idx = I.getOperand(2);
+    unsigned NumElts = Vec->getType()->getVectorNumElements();
+    Value *Check =
+      B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
+                   ConstantInt::get(Idx->getType(), NumElts));
+    Checks.push_back(Check);
+    break;
+  }
+  };
+  return buildOrChain(B, Checks);
+}
+
+static Value *getPoisonFor(DenseMap<Value *, Value *> &ValToPoison, Value *V) {
+  auto Itr = ValToPoison.find(V);
+  if (Itr != ValToPoison.end())
+    return Itr->second;
+  if (isa<Constant>(V)) {
+    return ConstantInt::getFalse(V->getContext());
+  }
+  // Return false for unknown values - this implements a non-strict mode where
+  // unhandled IR constructs are simply considered to never produce poison. At
+  // some point in the future, we probably want a "strict mode" for testing if
+  // nothing else.
+  return ConstantInt::getFalse(V->getContext());
+}
+
+static void CreateAssert(IRBuilder<> &B, Value *Cond) {
+  assert(Cond->getType()->isIntegerTy(1));
+  if (auto *CI = dyn_cast<ConstantInt>(Cond))
+    if (CI->isAllOnesValue())
+      return;
+
+  Module *M = B.GetInsertBlock()->getModule();
+  M->getOrInsertFunction("__poison_checker_assert",
+                         Type::getVoidTy(M->getContext()),
+                         Type::getInt1Ty(M->getContext()));
+  Function *TrapFunc = M->getFunction("__poison_checker_assert");
+  B.CreateCall(TrapFunc, Cond);
+}
+
+static void CreateAssertNot(IRBuilder<> &B, Value *Cond) {
+  assert(Cond->getType()->isIntegerTy(1));
+  CreateAssert(B, B.CreateNot(Cond));
+}
+
+static bool rewrite(Function &F) {
+  auto * const Int1Ty = Type::getInt1Ty(F.getContext());
+
+  DenseMap<Value *, Value *> ValToPoison;
+
+  for (BasicBlock &BB : F)
+    for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
+      auto *OldPHI = cast<PHINode>(&*I);
+      auto *NewPHI = PHINode::Create(Int1Ty,
+                                     OldPHI->getNumIncomingValues());
+      for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++)
+        NewPHI->addIncoming(UndefValue::get(Int1Ty),
+                            OldPHI->getIncomingBlock(i));
+      NewPHI->insertBefore(OldPHI);
+      ValToPoison[OldPHI] = NewPHI;
+    }
+
+  for (BasicBlock &BB : F)
+    for (Instruction &I : BB) {
+      if (isa<PHINode>(I)) continue;
+
+      IRBuilder<> B(cast<Instruction>(&I));
+
+      // Note: There are many more sources of documented UB, but this pass only
+      // attempts to find UB triggered by propagation of poison.
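+      // For instance (schematically; names invented), for `%q = udiv i32 %n,
+      // %d` the divisor must not be poison, so this emits, just before the
+      // udiv:
+      //   %np = xor i1 %d.poison, true
+      //   call void @__poison_checker_assert(i1 %np)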
+      if (Value *Op = const_cast<Value*>(getGuaranteedNonFullPoisonOp(&I)))
+        CreateAssertNot(B, getPoisonFor(ValToPoison, Op));
+
+      if (LocalCheck)
+        if (auto *RI = dyn_cast<ReturnInst>(&I))
+          if (RI->getNumOperands() != 0) {
+            Value *Op = RI->getOperand(0);
+            CreateAssertNot(B, getPoisonFor(ValToPoison, Op));
+          }
+
+      SmallVector<Value*, 4> Checks;
+      if (propagatesFullPoison(&I))
+        for (Value *V : I.operands())
+          Checks.push_back(getPoisonFor(ValToPoison, V));
+
+      if (auto *Check = generatePoisonChecks(I))
+        Checks.push_back(Check);
+      ValToPoison[&I] = buildOrChain(B, Checks);
+    }
+
+  for (BasicBlock &BB : F)
+    for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
+      auto *OldPHI = cast<PHINode>(&*I);
+      if (!ValToPoison.count(OldPHI))
+        continue; // skip the newly inserted phis
+      auto *NewPHI = cast<PHINode>(ValToPoison[OldPHI]);
+      for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++) {
+        auto *OldVal = OldPHI->getIncomingValue(i);
+        NewPHI->setIncomingValue(i, getPoisonFor(ValToPoison, OldVal));
+      }
+    }
+  return true;
+}
+
+
+PreservedAnalyses PoisonCheckingPass::run(Module &M,
+                                          ModuleAnalysisManager &AM) {
+  bool Changed = false;
+  for (auto &F : M)
+    Changed |= rewrite(F);
+
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+PreservedAnalyses PoisonCheckingPass::run(Function &F,
+                                          FunctionAnalysisManager &AM) {
+  return rewrite(F) ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+
+/* Major TODO Items:
+   - Control dependent poison UB
+   - Strict mode - (i.e. must analyze every operand)
+   - Poison through memory
+   - Function ABIs
+   - Full coverage of intrinsics, etc.. (ouch)
+
+   Instructions w/Unclear Semantics:
+   - shufflevector - It would seem reasonable for an out of bounds mask element
+     to produce poison, but the LangRef does not state.
+   - and/or - It would seem reasonable for poison to propagate from both
+     arguments, but LangRef doesn't state and propagatesFullPoison doesn't
+     include these two.
+   - all binary ops w/vector operands - The likely interpretation would be that
+     any element overflowing should produce poison for the entire result, but
+     the LangRef does not state.
+   - Floating point binary ops w/fmf flags other than (nnan, noinfs). It seems
+     strange that only certain flags should be documented as producing poison.
+
+   Cases of clear poison semantics not yet implemented:
+   - Exact flags on ashr/lshr produce poison
+   - NSW/NUW flags on shl produce poison
+   - Inbounds flag on getelementptr produces poison
+   - fptosi/fptoui (out of bounds input) produce poison
+   - Scalable vector types for insertelement/extractelement
+   - Floating point binary ops w/fmf nnan/noinfs flags produce poison
+ */
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
new file mode 100644
index 000000000000..f8fa9cad03b8
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -0,0 +1,947 @@
+//===-- SanitizerCoverage.cpp - coverage instrumentation for sanitizers ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coverage instrumentation done on LLVM IR level, works with Sanitizers.
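+//
+// For example, in trace-pc-guard mode every instrumented basic block receives
+// a call to __sanitizer_cov_trace_pc_guard(&guard) with its own 32-bit guard
+// cell, and the bounds of the guard section are passed to
+// __sanitizer_cov_trace_pc_guard_init at startup. A minimal user-supplied
+// runtime consuming this interface could look roughly like this (a sketch,
+// not part of this pass):
+//
+//   extern "C" void __sanitizer_cov_trace_pc_guard_init(uint32_t *start,
+//                                                       uint32_t *stop) {
+//     static uint32_t N;                    // unique nonzero id per guard
+//     if (start == stop || *start) return;  // already initialized
+//     for (uint32_t *x = start; x < stop; x++) *x = ++N;
+//   }
+//   extern "C" void __sanitizer_cov_trace_pc_guard(uint32_t *guard) {
+//     if (!*guard) return;  // zeroed guards stay silent
+//     *guard = 0;           // report each block at most once
+//     // ... record a coverage hit for this block ...
+//   }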
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "sancov" + +static const char *const SanCovTracePCIndirName = + "__sanitizer_cov_trace_pc_indir"; +static const char *const SanCovTracePCName = "__sanitizer_cov_trace_pc"; +static const char *const SanCovTraceCmp1 = "__sanitizer_cov_trace_cmp1"; +static const char *const SanCovTraceCmp2 = "__sanitizer_cov_trace_cmp2"; +static const char *const SanCovTraceCmp4 = "__sanitizer_cov_trace_cmp4"; +static const char *const SanCovTraceCmp8 = "__sanitizer_cov_trace_cmp8"; +static const char *const SanCovTraceConstCmp1 = + "__sanitizer_cov_trace_const_cmp1"; +static const char *const SanCovTraceConstCmp2 = + "__sanitizer_cov_trace_const_cmp2"; +static const char *const SanCovTraceConstCmp4 = + "__sanitizer_cov_trace_const_cmp4"; +static const char *const SanCovTraceConstCmp8 = + "__sanitizer_cov_trace_const_cmp8"; +static const char *const SanCovTraceDiv4 = "__sanitizer_cov_trace_div4"; +static const char *const SanCovTraceDiv8 = "__sanitizer_cov_trace_div8"; +static const char *const SanCovTraceGep = "__sanitizer_cov_trace_gep"; +static const char *const SanCovTraceSwitchName = "__sanitizer_cov_trace_switch"; +static const char *const SanCovModuleCtorTracePcGuardName = + "sancov.module_ctor_trace_pc_guard"; +static const char *const SanCovModuleCtor8bitCountersName = + "sancov.module_ctor_8bit_counters"; +static const uint64_t SanCtorAndDtorPriority = 2; + +static const char *const SanCovTracePCGuardName = + "__sanitizer_cov_trace_pc_guard"; +static const char *const SanCovTracePCGuardInitName = + "__sanitizer_cov_trace_pc_guard_init"; +static const char *const SanCov8bitCountersInitName = + "__sanitizer_cov_8bit_counters_init"; +static const char *const SanCovPCsInitName = "__sanitizer_cov_pcs_init"; + +static const char *const SanCovGuardsSectionName = "sancov_guards"; +static const char *const SanCovCountersSectionName = "sancov_cntrs"; +static const char *const SanCovPCsSectionName = "sancov_pcs"; + +static const char *const SanCovLowestStackName = "__sancov_lowest_stack"; + +static cl::opt<int> ClCoverageLevel( + "sanitizer-coverage-level", + cl::desc("Sanitizer Coverage. 
0: none, 1: entry block, 2: all blocks, " + "3: all blocks and critical edges"), + cl::Hidden, cl::init(0)); + +static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc", + cl::desc("Experimental pc tracing"), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard", + cl::desc("pc tracing with a guard"), + cl::Hidden, cl::init(false)); + +// If true, we create a global variable that contains PCs of all instrumented +// BBs, put this global into a named section, and pass this section's bounds +// to __sanitizer_cov_pcs_init. +// This way the coverage instrumentation does not need to acquire the PCs +// at run-time. Works with trace-pc-guard and inline-8bit-counters. +static cl::opt<bool> ClCreatePCTable("sanitizer-coverage-pc-table", + cl::desc("create a static PC table"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> + ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters", + cl::desc("increments 8-bit counter for every edge"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> + ClCMPTracing("sanitizer-coverage-trace-compares", + cl::desc("Tracing of CMP and similar instructions"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClDIVTracing("sanitizer-coverage-trace-divs", + cl::desc("Tracing of DIV instructions"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClGEPTracing("sanitizer-coverage-trace-geps", + cl::desc("Tracing of GEP instructions"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> + ClPruneBlocks("sanitizer-coverage-prune-blocks", + cl::desc("Reduce the number of instrumented blocks"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClStackDepth("sanitizer-coverage-stack-depth", + cl::desc("max stack depth tracing"), + cl::Hidden, cl::init(false)); + +namespace { + +SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) { + SanitizerCoverageOptions Res; + switch (LegacyCoverageLevel) { + case 0: + Res.CoverageType = SanitizerCoverageOptions::SCK_None; + break; + case 1: + Res.CoverageType = SanitizerCoverageOptions::SCK_Function; + break; + case 2: + Res.CoverageType = SanitizerCoverageOptions::SCK_BB; + break; + case 3: + Res.CoverageType = SanitizerCoverageOptions::SCK_Edge; + break; + case 4: + Res.CoverageType = SanitizerCoverageOptions::SCK_Edge; + Res.IndirectCalls = true; + break; + } + return Res; +} + +SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { + // Sets CoverageType and IndirectCalls. + SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel); + Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType); + Options.IndirectCalls |= CLOpts.IndirectCalls; + Options.TraceCmp |= ClCMPTracing; + Options.TraceDiv |= ClDIVTracing; + Options.TraceGep |= ClGEPTracing; + Options.TracePC |= ClTracePC; + Options.TracePCGuard |= ClTracePCGuard; + Options.Inline8bitCounters |= ClInline8bitCounters; + Options.PCTable |= ClCreatePCTable; + Options.NoPrune |= !ClPruneBlocks; + Options.StackDepth |= ClStackDepth; + if (!Options.TracePCGuard && !Options.TracePC && + !Options.Inline8bitCounters && !Options.StackDepth) + Options.TracePCGuard = true; // TracePCGuard is default. 
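+  // E.g. -sanitizer-coverage-level=3 together with
+  // -sanitizer-coverage-inline-8bit-counters selects SCK_Edge coverage with
+  // an inline 8-bit counter per instrumented block, and the trace-pc-guard
+  // default above is not triggered.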
+ return Options; +} + +using DomTreeCallback = function_ref<const DominatorTree *(Function &F)>; +using PostDomTreeCallback = + function_ref<const PostDominatorTree *(Function &F)>; + +class ModuleSanitizerCoverage { +public: + ModuleSanitizerCoverage( + const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) + : Options(OverrideFromCL(Options)) {} + bool instrumentModule(Module &M, DomTreeCallback DTCallback, + PostDomTreeCallback PDTCallback); + +private: + void instrumentFunction(Function &F, DomTreeCallback DTCallback, + PostDomTreeCallback PDTCallback); + void InjectCoverageForIndirectCalls(Function &F, + ArrayRef<Instruction *> IndirCalls); + void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); + void InjectTraceForDiv(Function &F, + ArrayRef<BinaryOperator *> DivTraceTargets); + void InjectTraceForGep(Function &F, + ArrayRef<GetElementPtrInst *> GepTraceTargets); + void InjectTraceForSwitch(Function &F, + ArrayRef<Instruction *> SwitchTraceTargets); + bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks, + bool IsLeafFunc = true); + GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements, + Function &F, Type *Ty, + const char *Section); + GlobalVariable *CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks); + void CreateFunctionLocalArrays(Function &F, ArrayRef<BasicBlock *> AllBlocks); + void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, + bool IsLeafFunc = true); + Function *CreateInitCallsForSections(Module &M, const char *CtorName, + const char *InitFunctionName, Type *Ty, + const char *Section); + std::pair<Value *, Value *> CreateSecStartEnd(Module &M, const char *Section, + Type *Ty); + + void SetNoSanitizeMetadata(Instruction *I) { + I->setMetadata(I->getModule()->getMDKindID("nosanitize"), + MDNode::get(*C, None)); + } + + std::string getSectionName(const std::string &Section) const; + std::string getSectionStart(const std::string &Section) const; + std::string getSectionEnd(const std::string &Section) const; + FunctionCallee SanCovTracePCIndir; + FunctionCallee SanCovTracePC, SanCovTracePCGuard; + FunctionCallee SanCovTraceCmpFunction[4]; + FunctionCallee SanCovTraceConstCmpFunction[4]; + FunctionCallee SanCovTraceDivFunction[2]; + FunctionCallee SanCovTraceGepFunction; + FunctionCallee SanCovTraceSwitchFunction; + GlobalVariable *SanCovLowestStack; + InlineAsm *EmptyAsm; + Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy, + *Int16Ty, *Int8Ty, *Int8PtrTy; + Module *CurModule; + std::string CurModuleUniqueId; + Triple TargetTriple; + LLVMContext *C; + const DataLayout *DL; + + GlobalVariable *FunctionGuardArray; // for trace-pc-guard. + GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters. + GlobalVariable *FunctionPCsArray; // for pc-table. 
+ SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed; + SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed; + + SanitizerCoverageOptions Options; +}; + +class ModuleSanitizerCoverageLegacyPass : public ModulePass { +public: + ModuleSanitizerCoverageLegacyPass( + const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) + : ModulePass(ID), Options(Options) { + initializeModuleSanitizerCoverageLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + ModuleSanitizerCoverage ModuleSancov(Options); + auto DTCallback = [this](Function &F) -> const DominatorTree * { + return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); + }; + auto PDTCallback = [this](Function &F) -> const PostDominatorTree * { + return &this->getAnalysis<PostDominatorTreeWrapperPass>(F) + .getPostDomTree(); + }; + return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback); + } + + static char ID; // Pass identification, replacement for typeid + StringRef getPassName() const override { return "ModuleSanitizerCoverage"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); + } + +private: + SanitizerCoverageOptions Options; +}; + +} // namespace + +PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M, + ModuleAnalysisManager &MAM) { + ModuleSanitizerCoverage ModuleSancov(Options); + auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto DTCallback = [&FAM](Function &F) -> const DominatorTree * { + return &FAM.getResult<DominatorTreeAnalysis>(F); + }; + auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * { + return &FAM.getResult<PostDominatorTreeAnalysis>(F); + }; + if (ModuleSancov.instrumentModule(M, DTCallback, PDTCallback)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +std::pair<Value *, Value *> +ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section, + Type *Ty) { + GlobalVariable *SecStart = + new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, nullptr, + getSectionStart(Section)); + SecStart->setVisibility(GlobalValue::HiddenVisibility); + GlobalVariable *SecEnd = + new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, + nullptr, getSectionEnd(Section)); + SecEnd->setVisibility(GlobalValue::HiddenVisibility); + IRBuilder<> IRB(M.getContext()); + Value *SecEndPtr = IRB.CreatePointerCast(SecEnd, Ty); + if (!TargetTriple.isOSBinFormatCOFF()) + return std::make_pair(IRB.CreatePointerCast(SecStart, Ty), SecEndPtr); + + // Account for the fact that on windows-msvc __start_* symbols actually + // point to a uint64_t before the start of the array. 
+  auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
+  auto GEP = IRB.CreateGEP(Int8Ty, SecStartI8Ptr,
+                           ConstantInt::get(IntptrTy, sizeof(uint64_t)));
+  return std::make_pair(IRB.CreatePointerCast(GEP, Ty), SecEndPtr);
+}
+
+Function *ModuleSanitizerCoverage::CreateInitCallsForSections(
+    Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty,
+    const char *Section) {
+  auto SecStartEnd = CreateSecStartEnd(M, Section, Ty);
+  auto SecStart = SecStartEnd.first;
+  auto SecEnd = SecStartEnd.second;
+  Function *CtorFunc;
+  std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
+      M, CtorName, InitFunctionName, {Ty, Ty}, {SecStart, SecEnd});
+  assert(CtorFunc->getName() == CtorName);
+
+  if (TargetTriple.supportsCOMDAT()) {
+    // Use comdat to dedup CtorFunc.
+    CtorFunc->setComdat(M.getOrInsertComdat(CtorName));
+    appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
+  } else {
+    appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+  }
+
+  if (TargetTriple.isOSBinFormatCOFF()) {
+    // In COFF files, if the constructors are set as COMDAT (they are because
+    // COFF supports COMDAT) and the linker flag /OPT:REF (strip unreferenced
+    // functions and data) is used, the constructors get stripped. To prevent
+    // this, give the constructors weak ODR linkage and ensure the linker knows
+    // to include the sancov constructor. This way the linker can deduplicate
+    // the constructors but always leave one copy.
+    CtorFunc->setLinkage(GlobalValue::WeakODRLinkage);
+    appendToUsed(M, CtorFunc);
+  }
+  return CtorFunc;
+}
+
+bool ModuleSanitizerCoverage::instrumentModule(
+    Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
+  if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
+    return false;
+  C = &(M.getContext());
+  DL = &M.getDataLayout();
+  CurModule = &M;
+  CurModuleUniqueId = getUniqueModuleId(CurModule);
+  TargetTriple = Triple(M.getTargetTriple());
+  FunctionGuardArray = nullptr;
+  Function8bitCounterArray = nullptr;
+  FunctionPCsArray = nullptr;
+  IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
+  IntptrPtrTy = PointerType::getUnqual(IntptrTy);
+  Type *VoidTy = Type::getVoidTy(*C);
+  IRBuilder<> IRB(*C);
+  Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
+  Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+  Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
+  Int64Ty = IRB.getInt64Ty();
+  Int32Ty = IRB.getInt32Ty();
+  Int16Ty = IRB.getInt16Ty();
+  Int8Ty = IRB.getInt8Ty();
+
+  SanCovTracePCIndir =
+      M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy);
+  // Make sure smaller parameters are zero-extended to i64 as required by the
+  // x86_64 ABI.
+ AttributeList SanCovTraceCmpZeroExtAL; + if (TargetTriple.getArch() == Triple::x86_64) { + SanCovTraceCmpZeroExtAL = + SanCovTraceCmpZeroExtAL.addParamAttribute(*C, 0, Attribute::ZExt); + SanCovTraceCmpZeroExtAL = + SanCovTraceCmpZeroExtAL.addParamAttribute(*C, 1, Attribute::ZExt); + } + + SanCovTraceCmpFunction[0] = + M.getOrInsertFunction(SanCovTraceCmp1, SanCovTraceCmpZeroExtAL, VoidTy, + IRB.getInt8Ty(), IRB.getInt8Ty()); + SanCovTraceCmpFunction[1] = + M.getOrInsertFunction(SanCovTraceCmp2, SanCovTraceCmpZeroExtAL, VoidTy, + IRB.getInt16Ty(), IRB.getInt16Ty()); + SanCovTraceCmpFunction[2] = + M.getOrInsertFunction(SanCovTraceCmp4, SanCovTraceCmpZeroExtAL, VoidTy, + IRB.getInt32Ty(), IRB.getInt32Ty()); + SanCovTraceCmpFunction[3] = + M.getOrInsertFunction(SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty); + + SanCovTraceConstCmpFunction[0] = M.getOrInsertFunction( + SanCovTraceConstCmp1, SanCovTraceCmpZeroExtAL, VoidTy, Int8Ty, Int8Ty); + SanCovTraceConstCmpFunction[1] = M.getOrInsertFunction( + SanCovTraceConstCmp2, SanCovTraceCmpZeroExtAL, VoidTy, Int16Ty, Int16Ty); + SanCovTraceConstCmpFunction[2] = M.getOrInsertFunction( + SanCovTraceConstCmp4, SanCovTraceCmpZeroExtAL, VoidTy, Int32Ty, Int32Ty); + SanCovTraceConstCmpFunction[3] = + M.getOrInsertFunction(SanCovTraceConstCmp8, VoidTy, Int64Ty, Int64Ty); + + { + AttributeList AL; + if (TargetTriple.getArch() == Triple::x86_64) + AL = AL.addParamAttribute(*C, 0, Attribute::ZExt); + SanCovTraceDivFunction[0] = + M.getOrInsertFunction(SanCovTraceDiv4, AL, VoidTy, IRB.getInt32Ty()); + } + SanCovTraceDivFunction[1] = + M.getOrInsertFunction(SanCovTraceDiv8, VoidTy, Int64Ty); + SanCovTraceGepFunction = + M.getOrInsertFunction(SanCovTraceGep, VoidTy, IntptrTy); + SanCovTraceSwitchFunction = + M.getOrInsertFunction(SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy); + + Constant *SanCovLowestStackConstant = + M.getOrInsertGlobal(SanCovLowestStackName, IntptrTy); + SanCovLowestStack = dyn_cast<GlobalVariable>(SanCovLowestStackConstant); + if (!SanCovLowestStack) { + C->emitError(StringRef("'") + SanCovLowestStackName + + "' should not be declared by the user"); + return true; + } + SanCovLowestStack->setThreadLocalMode( + GlobalValue::ThreadLocalMode::InitialExecTLSModel); + if (Options.StackDepth && !SanCovLowestStack->isDeclaration()) + SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy)); + + // We insert an empty inline asm after cov callbacks to avoid callback merge. 
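+  // (Otherwise the backend may tail-merge or fold identical callback call
+  // sequences from different blocks, collapsing distinct coverage points
+  // into one.)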
+ EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); + + SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy); + SanCovTracePCGuard = + M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy); + + for (auto &F : M) + instrumentFunction(F, DTCallback, PDTCallback); + + Function *Ctor = nullptr; + + if (FunctionGuardArray) + Ctor = CreateInitCallsForSections(M, SanCovModuleCtorTracePcGuardName, + SanCovTracePCGuardInitName, Int32PtrTy, + SanCovGuardsSectionName); + if (Function8bitCounterArray) + Ctor = CreateInitCallsForSections(M, SanCovModuleCtor8bitCountersName, + SanCov8bitCountersInitName, Int8PtrTy, + SanCovCountersSectionName); + if (Ctor && Options.PCTable) { + auto SecStartEnd = CreateSecStartEnd(M, SanCovPCsSectionName, IntptrPtrTy); + FunctionCallee InitFunction = declareSanitizerInitFunction( + M, SanCovPCsInitName, {IntptrPtrTy, IntptrPtrTy}); + IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator()); + IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second}); + } + // We don't reference these arrays directly in any of our runtime functions, + // so we need to prevent them from being dead stripped. + if (TargetTriple.isOSBinFormatMachO()) + appendToUsed(M, GlobalsToAppendToUsed); + appendToCompilerUsed(M, GlobalsToAppendToCompilerUsed); + return true; +} + +// True if block has successors and it dominates all of them. +static bool isFullDominator(const BasicBlock *BB, const DominatorTree *DT) { + if (succ_begin(BB) == succ_end(BB)) + return false; + + for (const BasicBlock *SUCC : make_range(succ_begin(BB), succ_end(BB))) { + if (!DT->dominates(BB, SUCC)) + return false; + } + + return true; +} + +// True if block has predecessors and it postdominates all of them. +static bool isFullPostDominator(const BasicBlock *BB, + const PostDominatorTree *PDT) { + if (pred_begin(BB) == pred_end(BB)) + return false; + + for (const BasicBlock *PRED : make_range(pred_begin(BB), pred_end(BB))) { + if (!PDT->dominates(BB, PRED)) + return false; + } + + return true; +} + +static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB, + const DominatorTree *DT, + const PostDominatorTree *PDT, + const SanitizerCoverageOptions &Options) { + // Don't insert coverage for blocks containing nothing but unreachable: we + // will never call __sanitizer_cov() for them, so counting them in + // NumberOfInstrumentedBlocks() might complicate calculation of code coverage + // percentage. Also, unreachable instructions frequently have no debug + // locations. + if (isa<UnreachableInst>(BB->getFirstNonPHIOrDbgOrLifetime())) + return false; + + // Don't insert coverage into blocks without a valid insertion point + // (catchswitch blocks). + if (BB->getFirstInsertionPt() == BB->end()) + return false; + + if (Options.NoPrune || &F.getEntryBlock() == BB) + return true; + + if (Options.CoverageType == SanitizerCoverageOptions::SCK_Function && + &F.getEntryBlock() != BB) + return false; + + // Do not instrument full dominators, or full post-dominators with multiple + // predecessors. + return !isFullDominator(BB, DT) + && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor()); +} + + +// Returns true iff From->To is a backedge. 
+// A twist here is that we treat From->To as a backedge if
+//   * To dominates From or
+//   * To->UniqueSuccessor dominates From
+static bool IsBackEdge(BasicBlock *From, BasicBlock *To,
+                       const DominatorTree *DT) {
+  if (DT->dominates(To, From))
+    return true;
+  if (auto Next = To->getUniqueSuccessor())
+    if (DT->dominates(Next, From))
+      return true;
+  return false;
+}
+
+// Prunes uninteresting Cmp instrumentation:
+//   * CMP instructions that feed into loop backedge branch.
+//
+// Note that Cmp pruning is controlled by the same flag as the
+// BB pruning.
+static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT,
+                             const SanitizerCoverageOptions &Options) {
+  if (!Options.NoPrune)
+    if (CMP->hasOneUse())
+      if (auto BR = dyn_cast<BranchInst>(CMP->user_back()))
+        for (BasicBlock *B : BR->successors())
+          if (IsBackEdge(BR->getParent(), B, DT))
+            return false;
+  return true;
+}
+
+void ModuleSanitizerCoverage::instrumentFunction(
+    Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
+  if (F.empty())
+    return;
+  if (F.getName().find(".module_ctor") != std::string::npos)
+    return; // Should not instrument sanitizer init functions.
+  if (F.getName().startswith("__sanitizer_"))
+    return; // Don't instrument __sanitizer_* callbacks.
+  // Don't touch available_externally functions, their actual body is
+  // elsewhere.
+  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
+    return;
+  // Don't instrument MSVC CRT configuration helpers. They may run before
+  // normal initialization.
+  if (F.getName() == "__local_stdio_printf_options" ||
+      F.getName() == "__local_stdio_scanf_options")
+    return;
+  if (isa<UnreachableInst>(F.getEntryBlock().getTerminator()))
+    return;
+  // Don't instrument functions using SEH for now. Splitting basic blocks like
+  // we do for coverage breaks WinEHPrepare.
+  // FIXME: Remove this when SEH no longer uses landingpad pattern matching.
+ if (F.hasPersonalityFn() && + isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) + return; + if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) + SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests()); + SmallVector<Instruction *, 8> IndirCalls; + SmallVector<BasicBlock *, 16> BlocksToInstrument; + SmallVector<Instruction *, 8> CmpTraceTargets; + SmallVector<Instruction *, 8> SwitchTraceTargets; + SmallVector<BinaryOperator *, 8> DivTraceTargets; + SmallVector<GetElementPtrInst *, 8> GepTraceTargets; + + const DominatorTree *DT = DTCallback(F); + const PostDominatorTree *PDT = PDTCallback(F); + bool IsLeafFunc = true; + + for (auto &BB : F) { + if (shouldInstrumentBlock(F, &BB, DT, PDT, Options)) + BlocksToInstrument.push_back(&BB); + for (auto &Inst : BB) { + if (Options.IndirectCalls) { + CallSite CS(&Inst); + if (CS && !CS.getCalledFunction()) + IndirCalls.push_back(&Inst); + } + if (Options.TraceCmp) { + if (ICmpInst *CMP = dyn_cast<ICmpInst>(&Inst)) + if (IsInterestingCmp(CMP, DT, Options)) + CmpTraceTargets.push_back(&Inst); + if (isa<SwitchInst>(&Inst)) + SwitchTraceTargets.push_back(&Inst); + } + if (Options.TraceDiv) + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&Inst)) + if (BO->getOpcode() == Instruction::SDiv || + BO->getOpcode() == Instruction::UDiv) + DivTraceTargets.push_back(BO); + if (Options.TraceGep) + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Inst)) + GepTraceTargets.push_back(GEP); + if (Options.StackDepth) + if (isa<InvokeInst>(Inst) || + (isa<CallInst>(Inst) && !isa<IntrinsicInst>(Inst))) + IsLeafFunc = false; + } + } + + InjectCoverage(F, BlocksToInstrument, IsLeafFunc); + InjectCoverageForIndirectCalls(F, IndirCalls); + InjectTraceForCmp(F, CmpTraceTargets); + InjectTraceForSwitch(F, SwitchTraceTargets); + InjectTraceForDiv(F, DivTraceTargets); + InjectTraceForGep(F, GepTraceTargets); +} + +GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection( + size_t NumElements, Function &F, Type *Ty, const char *Section) { + ArrayType *ArrayTy = ArrayType::get(Ty, NumElements); + auto Array = new GlobalVariable( + *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage, + Constant::getNullValue(ArrayTy), "__sancov_gen_"); + + if (TargetTriple.supportsCOMDAT() && !F.isInterposable()) + if (auto Comdat = + GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId)) + Array->setComdat(Comdat); + Array->setSection(getSectionName(Section)); + Array->setAlignment(Align(Ty->isPointerTy() + ? 
DL->getPointerSize() + : Ty->getPrimitiveSizeInBits() / 8)); + GlobalsToAppendToUsed.push_back(Array); + GlobalsToAppendToCompilerUsed.push_back(Array); + MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F)); + Array->addMetadata(LLVMContext::MD_associated, *MD); + + return Array; +} + +GlobalVariable * +ModuleSanitizerCoverage::CreatePCArray(Function &F, + ArrayRef<BasicBlock *> AllBlocks) { + size_t N = AllBlocks.size(); + assert(N); + SmallVector<Constant *, 32> PCs; + IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt()); + for (size_t i = 0; i < N; i++) { + if (&F.getEntryBlock() == AllBlocks[i]) { + PCs.push_back((Constant *)IRB.CreatePointerCast(&F, IntptrPtrTy)); + PCs.push_back((Constant *)IRB.CreateIntToPtr( + ConstantInt::get(IntptrTy, 1), IntptrPtrTy)); + } else { + PCs.push_back((Constant *)IRB.CreatePointerCast( + BlockAddress::get(AllBlocks[i]), IntptrPtrTy)); + PCs.push_back((Constant *)IRB.CreateIntToPtr( + ConstantInt::get(IntptrTy, 0), IntptrPtrTy)); + } + } + auto *PCArray = CreateFunctionLocalArrayInSection(N * 2, F, IntptrPtrTy, + SanCovPCsSectionName); + PCArray->setInitializer( + ConstantArray::get(ArrayType::get(IntptrPtrTy, N * 2), PCs)); + PCArray->setConstant(true); + + return PCArray; +} + +void ModuleSanitizerCoverage::CreateFunctionLocalArrays( + Function &F, ArrayRef<BasicBlock *> AllBlocks) { + if (Options.TracePCGuard) + FunctionGuardArray = CreateFunctionLocalArrayInSection( + AllBlocks.size(), F, Int32Ty, SanCovGuardsSectionName); + + if (Options.Inline8bitCounters) + Function8bitCounterArray = CreateFunctionLocalArrayInSection( + AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName); + + if (Options.PCTable) + FunctionPCsArray = CreatePCArray(F, AllBlocks); +} + +bool ModuleSanitizerCoverage::InjectCoverage(Function &F, + ArrayRef<BasicBlock *> AllBlocks, + bool IsLeafFunc) { + if (AllBlocks.empty()) return false; + CreateFunctionLocalArrays(F, AllBlocks); + for (size_t i = 0, N = AllBlocks.size(); i < N; i++) + InjectCoverageAtBlock(F, *AllBlocks[i], i, IsLeafFunc); + return true; +} + +// On every indirect call we call a run-time function +// __sanitizer_cov_indir_call* with two parameters: +// - callee address, +// - global cache array that contains CacheSize pointers (zero-initialized). +// The cache is used to speed up recording the caller-callee pairs. +// The address of the caller is passed implicitly via caller PC. +// CacheSize is encoded in the name of the run-time function. +void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls( + Function &F, ArrayRef<Instruction *> IndirCalls) { + if (IndirCalls.empty()) + return; + assert(Options.TracePC || Options.TracePCGuard || Options.Inline8bitCounters); + for (auto I : IndirCalls) { + IRBuilder<> IRB(I); + CallSite CS(I); + Value *Callee = CS.getCalledValue(); + if (isa<InlineAsm>(Callee)) + continue; + IRB.CreateCall(SanCovTracePCIndir, IRB.CreatePointerCast(Callee, IntptrTy)); + } +} + +// For every switch statement we insert a call: +// __sanitizer_cov_trace_switch(CondValue, +// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... 
}) + +void ModuleSanitizerCoverage::InjectTraceForSwitch( + Function &, ArrayRef<Instruction *> SwitchTraceTargets) { + for (auto I : SwitchTraceTargets) { + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + IRBuilder<> IRB(I); + SmallVector<Constant *, 16> Initializers; + Value *Cond = SI->getCondition(); + if (Cond->getType()->getScalarSizeInBits() > + Int64Ty->getScalarSizeInBits()) + continue; + Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases())); + Initializers.push_back( + ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits())); + if (Cond->getType()->getScalarSizeInBits() < + Int64Ty->getScalarSizeInBits()) + Cond = IRB.CreateIntCast(Cond, Int64Ty, false); + for (auto It : SI->cases()) { + Constant *C = It.getCaseValue(); + if (C->getType()->getScalarSizeInBits() < + Int64Ty->getScalarSizeInBits()) + C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty); + Initializers.push_back(C); + } + llvm::sort(Initializers.begin() + 2, Initializers.end(), + [](const Constant *A, const Constant *B) { + return cast<ConstantInt>(A)->getLimitedValue() < + cast<ConstantInt>(B)->getLimitedValue(); + }); + ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size()); + GlobalVariable *GV = new GlobalVariable( + *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage, + ConstantArray::get(ArrayOfInt64Ty, Initializers), + "__sancov_gen_cov_switch_values"); + IRB.CreateCall(SanCovTraceSwitchFunction, + {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)}); + } + } +} + +void ModuleSanitizerCoverage::InjectTraceForDiv( + Function &, ArrayRef<BinaryOperator *> DivTraceTargets) { + for (auto BO : DivTraceTargets) { + IRBuilder<> IRB(BO); + Value *A1 = BO->getOperand(1); + if (isa<ConstantInt>(A1)) continue; + if (!A1->getType()->isIntegerTy()) + continue; + uint64_t TypeSize = DL->getTypeStoreSizeInBits(A1->getType()); + int CallbackIdx = TypeSize == 32 ? 0 : + TypeSize == 64 ? 1 : -1; + if (CallbackIdx < 0) continue; + auto Ty = Type::getIntNTy(*C, TypeSize); + IRB.CreateCall(SanCovTraceDivFunction[CallbackIdx], + {IRB.CreateIntCast(A1, Ty, true)}); + } +} + +void ModuleSanitizerCoverage::InjectTraceForGep( + Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) { + for (auto GEP : GepTraceTargets) { + IRBuilder<> IRB(GEP); + for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) + if (!isa<ConstantInt>(*I) && (*I)->getType()->isIntegerTy()) + IRB.CreateCall(SanCovTraceGepFunction, + {IRB.CreateIntCast(*I, IntptrTy, true)}); + } +} + +void ModuleSanitizerCoverage::InjectTraceForCmp( + Function &, ArrayRef<Instruction *> CmpTraceTargets) { + for (auto I : CmpTraceTargets) { + if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) { + IRBuilder<> IRB(ICMP); + Value *A0 = ICMP->getOperand(0); + Value *A1 = ICMP->getOperand(1); + if (!A0->getType()->isIntegerTy()) + continue; + uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType()); + int CallbackIdx = TypeSize == 8 ? 0 : + TypeSize == 16 ? 1 : + TypeSize == 32 ? 2 : + TypeSize == 64 ? 3 : -1; + if (CallbackIdx < 0) continue; + // __sanitizer_cov_trace_cmp((type_size << 32) | predicate, A0, A1); + auto CallbackFunc = SanCovTraceCmpFunction[CallbackIdx]; + bool FirstIsConst = isa<ConstantInt>(A0); + bool SecondIsConst = isa<ConstantInt>(A1); + // If both are const, then we don't need such a comparison. + if (FirstIsConst && SecondIsConst) continue; + // If only one is const, then make it the first callback argument. 
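+      // E.g. `icmp eq i32 %x, 42` is lowered (schematically) to
+      //   __sanitizer_cov_trace_const_cmp4(42, %x)
+      // so the constant operand is always the first argument.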
+ if (FirstIsConst || SecondIsConst) { + CallbackFunc = SanCovTraceConstCmpFunction[CallbackIdx]; + if (SecondIsConst) + std::swap(A0, A1); + } + + auto Ty = Type::getIntNTy(*C, TypeSize); + IRB.CreateCall(CallbackFunc, {IRB.CreateIntCast(A0, Ty, true), + IRB.CreateIntCast(A1, Ty, true)}); + } + } +} + +void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, + size_t Idx, + bool IsLeafFunc) { + BasicBlock::iterator IP = BB.getFirstInsertionPt(); + bool IsEntryBB = &BB == &F.getEntryBlock(); + DebugLoc EntryLoc; + if (IsEntryBB) { + if (auto SP = F.getSubprogram()) + EntryLoc = DebugLoc::get(SP->getScopeLine(), 0, SP); + // Keep static allocas and llvm.localescape calls in the entry block. Even + // if we aren't splitting the block, it's nice for allocas to be before + // calls. + IP = PrepareToSplitEntryBlock(BB, IP); + } else { + EntryLoc = IP->getDebugLoc(); + } + + IRBuilder<> IRB(&*IP); + IRB.SetCurrentDebugLocation(EntryLoc); + if (Options.TracePC) { + IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC. + IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. + } + if (Options.TracePCGuard) { + auto GuardPtr = IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy), + ConstantInt::get(IntptrTy, Idx * 4)), + Int32PtrTy); + IRB.CreateCall(SanCovTracePCGuard, GuardPtr); + IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. + } + if (Options.Inline8bitCounters) { + auto CounterPtr = IRB.CreateGEP( + Function8bitCounterArray->getValueType(), Function8bitCounterArray, + {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)}); + auto Load = IRB.CreateLoad(Int8Ty, CounterPtr); + auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1)); + auto Store = IRB.CreateStore(Inc, CounterPtr); + SetNoSanitizeMetadata(Load); + SetNoSanitizeMetadata(Store); + } + if (Options.StackDepth && IsEntryBB && !IsLeafFunc) { + // Check stack depth. If it's the deepest so far, record it. + Module *M = F.getParent(); + Function *GetFrameAddr = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); + auto FrameAddrPtr = + IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)}); + auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy); + auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack); + auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack); + auto ThenTerm = SplitBlockAndInsertIfThen(IsStackLower, &*IP, false); + IRBuilder<> ThenIRB(ThenTerm); + auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack); + SetNoSanitizeMetadata(LowestStack); + SetNoSanitizeMetadata(Store); + } +} + +std::string +ModuleSanitizerCoverage::getSectionName(const std::string &Section) const { + if (TargetTriple.isOSBinFormatCOFF()) { + if (Section == SanCovCountersSectionName) + return ".SCOV$CM"; + if (Section == SanCovPCsSectionName) + return ".SCOVP$M"; + return ".SCOV$GM"; // For SanCovGuardsSectionName. 
+ } + if (TargetTriple.isOSBinFormatMachO()) + return "__DATA,__" + Section; + return "__" + Section; +} + +std::string +ModuleSanitizerCoverage::getSectionStart(const std::string &Section) const { + if (TargetTriple.isOSBinFormatMachO()) + return "\1section$start$__DATA$__" + Section; + return "__start___" + Section; +} + +std::string +ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const { + if (TargetTriple.isOSBinFormatMachO()) + return "\1section$end$__DATA$__" + Section; + return "__stop___" + Section; +} + +char ModuleSanitizerCoverageLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov", + "Pass for instrumenting coverage on functions", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov", + "Pass for instrumenting coverage on functions", false, + false) +ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass( + const SanitizerCoverageOptions &Options) { + return new ModuleSanitizerCoverageLegacyPass(Options); +} diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp new file mode 100644 index 000000000000..ac274a155a80 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -0,0 +1,735 @@ +//===-- ThreadSanitizer.cpp - race detector -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer, a race detector. +// +// The tool is under development, for the details about previous versions see +// http://code.google.com/p/data-race-test +// +// The instrumentation phase is quite simple: +// - Insert calls to run-time library before every memory access. +// - Optimizations may apply to avoid instrumenting some of the accesses. +// - Insert calls at function entry/exit. +// The rest is handled by the run-time library. 
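+//
+// Conceptually (a sketch; the real callback depends on access size and
+// alignment, and the rewriting happens on IR, not source), a function like
+//   int read(int *p) { return *p; }
+// becomes
+//   int read(int *p) {
+//     __tsan_func_entry(__builtin_return_address(0));
+//     __tsan_read4(p);
+//     int v = *p;
+//     __tsan_func_exit();
+//     return v;
+//   }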
+//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "tsan" + +static cl::opt<bool> ClInstrumentMemoryAccesses( + "tsan-instrument-memory-accesses", cl::init(true), + cl::desc("Instrument memory accesses"), cl::Hidden); +static cl::opt<bool> ClInstrumentFuncEntryExit( + "tsan-instrument-func-entry-exit", cl::init(true), + cl::desc("Instrument function entry and exit"), cl::Hidden); +static cl::opt<bool> ClHandleCxxExceptions( + "tsan-handle-cxx-exceptions", cl::init(true), + cl::desc("Handle C++ exceptions (insert cleanup blocks for unwinding)"), + cl::Hidden); +static cl::opt<bool> ClInstrumentAtomics( + "tsan-instrument-atomics", cl::init(true), + cl::desc("Instrument atomics"), cl::Hidden); +static cl::opt<bool> ClInstrumentMemIntrinsics( + "tsan-instrument-memintrinsics", cl::init(true), + cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden); + +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumOmittedReadsBeforeWrite, + "Number of reads ignored due to following writes"); +STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); +STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes"); +STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads"); +STATISTIC(NumOmittedReadsFromConstantGlobals, + "Number of reads from constant globals"); +STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads"); +STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing"); + +static const char *const kTsanModuleCtorName = "tsan.module_ctor"; +static const char *const kTsanInitName = "__tsan_init"; + +namespace { + +/// ThreadSanitizer: instrument the code in module to find races. +/// +/// Instantiating ThreadSanitizer inserts the tsan runtime library API function +/// declarations into the module if they don't exist already. Instantiating +/// ensures the __tsan_init function is in the list of global constructors for +/// the module. 
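+///
+/// Typical usage, as wired up by the passes below (a sketch):
+///   ThreadSanitizer TSan;
+///   bool Changed = TSan.sanitizeFunction(F, TLI);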
+struct ThreadSanitizer { + bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI); + +private: + void initialize(Module &M); + bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); + bool instrumentAtomic(Instruction *I, const DataLayout &DL); + bool instrumentMemIntrinsic(Instruction *I); + void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local, + SmallVectorImpl<Instruction *> &All, + const DataLayout &DL); + bool addrPointsToConstantData(Value *Addr); + int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL); + void InsertRuntimeIgnores(Function &F); + + Type *IntptrTy; + FunctionCallee TsanFuncEntry; + FunctionCallee TsanFuncExit; + FunctionCallee TsanIgnoreBegin; + FunctionCallee TsanIgnoreEnd; + // Accesses sizes are powers of two: 1, 2, 4, 8, 16. + static const size_t kNumberOfAccessSizes = 5; + FunctionCallee TsanRead[kNumberOfAccessSizes]; + FunctionCallee TsanWrite[kNumberOfAccessSizes]; + FunctionCallee TsanUnalignedRead[kNumberOfAccessSizes]; + FunctionCallee TsanUnalignedWrite[kNumberOfAccessSizes]; + FunctionCallee TsanAtomicLoad[kNumberOfAccessSizes]; + FunctionCallee TsanAtomicStore[kNumberOfAccessSizes]; + FunctionCallee TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1] + [kNumberOfAccessSizes]; + FunctionCallee TsanAtomicCAS[kNumberOfAccessSizes]; + FunctionCallee TsanAtomicThreadFence; + FunctionCallee TsanAtomicSignalFence; + FunctionCallee TsanVptrUpdate; + FunctionCallee TsanVptrLoad; + FunctionCallee MemmoveFn, MemcpyFn, MemsetFn; +}; + +struct ThreadSanitizerLegacyPass : FunctionPass { + ThreadSanitizerLegacyPass() : FunctionPass(ID) {} + StringRef getPassName() const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; + bool doInitialization(Module &M) override; + static char ID; // Pass identification, replacement for typeid. +private: + Optional<ThreadSanitizer> TSan; +}; + +void insertModuleCtor(Module &M) { + getOrCreateSanitizerCtorAndInitFunctions( + M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. 
Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); }); +} + +} // namespace + +PreservedAnalyses ThreadSanitizerPass::run(Function &F, + FunctionAnalysisManager &FAM) { + ThreadSanitizer TSan; + if (TSan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F))) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +PreservedAnalyses ThreadSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { + insertModuleCtor(M); + return PreservedAnalyses::none(); +} + +char ThreadSanitizerLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan", + "ThreadSanitizer: detects data races.", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ThreadSanitizerLegacyPass, "tsan", + "ThreadSanitizer: detects data races.", false, false) + +StringRef ThreadSanitizerLegacyPass::getPassName() const { + return "ThreadSanitizerLegacyPass"; +} + +void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfoWrapperPass>(); +} + +bool ThreadSanitizerLegacyPass::doInitialization(Module &M) { + insertModuleCtor(M); + TSan.emplace(); + return true; +} + +bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) { + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + TSan->sanitizeFunction(F, TLI); + return true; +} + +FunctionPass *llvm::createThreadSanitizerLegacyPassPass() { + return new ThreadSanitizerLegacyPass(); +} + +void ThreadSanitizer::initialize(Module &M) { + const DataLayout &DL = M.getDataLayout(); + IntptrTy = DL.getIntPtrType(M.getContext()); + + IRBuilder<> IRB(M.getContext()); + AttributeList Attr; + Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex, + Attribute::NoUnwind); + // Initialize the callbacks. 
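+  // The size-indexed callback arrays use index i for a (1 << i)-byte access;
+  // e.g. TsanRead[2] binds __tsan_read4 and TsanAtomicLoad[3] binds
+  // __tsan_atomic64_load.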
+ TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", Attr, + IRB.getVoidTy(), IRB.getInt8PtrTy()); + TsanFuncExit = + M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy()); + TsanIgnoreBegin = M.getOrInsertFunction("__tsan_ignore_thread_begin", Attr, + IRB.getVoidTy()); + TsanIgnoreEnd = + M.getOrInsertFunction("__tsan_ignore_thread_end", Attr, IRB.getVoidTy()); + IntegerType *OrdTy = IRB.getInt32Ty(); + for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { + const unsigned ByteSize = 1U << i; + const unsigned BitSize = ByteSize * 8; + std::string ByteSizeStr = utostr(ByteSize); + std::string BitSizeStr = utostr(BitSize); + SmallString<32> ReadName("__tsan_read" + ByteSizeStr); + TsanRead[i] = M.getOrInsertFunction(ReadName, Attr, IRB.getVoidTy(), + IRB.getInt8PtrTy()); + + SmallString<32> WriteName("__tsan_write" + ByteSizeStr); + TsanWrite[i] = M.getOrInsertFunction(WriteName, Attr, IRB.getVoidTy(), + IRB.getInt8PtrTy()); + + SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr); + TsanUnalignedRead[i] = M.getOrInsertFunction( + UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()); + + SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr); + TsanUnalignedWrite[i] = M.getOrInsertFunction( + UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()); + + Type *Ty = Type::getIntNTy(M.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load"); + TsanAtomicLoad[i] = + M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy); + + SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store"); + TsanAtomicStore[i] = M.getOrInsertFunction( + AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy); + + for (int op = AtomicRMWInst::FIRST_BINOP; + op <= AtomicRMWInst::LAST_BINOP; ++op) { + TsanAtomicRMW[op][i] = nullptr; + const char *NamePart = nullptr; + if (op == AtomicRMWInst::Xchg) + NamePart = "_exchange"; + else if (op == AtomicRMWInst::Add) + NamePart = "_fetch_add"; + else if (op == AtomicRMWInst::Sub) + NamePart = "_fetch_sub"; + else if (op == AtomicRMWInst::And) + NamePart = "_fetch_and"; + else if (op == AtomicRMWInst::Or) + NamePart = "_fetch_or"; + else if (op == AtomicRMWInst::Xor) + NamePart = "_fetch_xor"; + else if (op == AtomicRMWInst::Nand) + NamePart = "_fetch_nand"; + else + continue; + SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart); + TsanAtomicRMW[op][i] = + M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy); + } + + SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr + + "_compare_exchange_val"); + TsanAtomicCAS[i] = M.getOrInsertFunction(AtomicCASName, Attr, Ty, PtrTy, Ty, + Ty, OrdTy, OrdTy); + } + TsanVptrUpdate = + M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy()); + TsanVptrLoad = M.getOrInsertFunction("__tsan_vptr_read", Attr, + IRB.getVoidTy(), IRB.getInt8PtrTy()); + TsanAtomicThreadFence = M.getOrInsertFunction("__tsan_atomic_thread_fence", + Attr, IRB.getVoidTy(), OrdTy); + TsanAtomicSignalFence = M.getOrInsertFunction("__tsan_atomic_signal_fence", + Attr, IRB.getVoidTy(), OrdTy); + + MemmoveFn = + M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy); + MemcpyFn = + M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy); + MemsetFn = + M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(), + 
IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy);
+}
+
+static bool isVtableAccess(Instruction *I) {
+  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+    return Tag->isTBAAVtableAccess();
+  return false;
+}
+
+// Do not instrument known races/"benign races" that come from compiler
+// instrumentation. The user has no way of suppressing them.
+static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
+  // Peel off GEPs and BitCasts.
+  Addr = Addr->stripInBoundsOffsets();
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    if (GV->hasSection()) {
+      StringRef SectionName = GV->getSection();
+      // Check if the global is in the PGO counters section.
+      auto OF = Triple(M->getTargetTriple()).getObjectFormat();
+      if (SectionName.endswith(
+              getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
+        return false;
+    }
+
+    // Check if the global is private gcov data.
+    if (GV->getName().startswith("__llvm_gcov") ||
+        GV->getName().startswith("__llvm_gcda"))
+      return false;
+  }
+
+  // Do not instrument accesses from different address spaces; we cannot deal
+  // with them.
+  if (Addr) {
+    Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+    if (PtrTy->getPointerAddressSpace() != 0)
+      return false;
+  }
+
+  return true;
+}
+
+bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
+  // If this is a GEP, just analyze its pointer operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
+    Addr = GEP->getPointerOperand();
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    if (GV->isConstant()) {
+      // Reads from constant globals cannot race with any writes.
+      NumOmittedReadsFromConstantGlobals++;
+      return true;
+    }
+  } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+    if (isVtableAccess(L)) {
+      // Reads from a vtable pointer cannot race with any writes.
+      NumOmittedReadsFromVtable++;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Instrumenting some of the accesses may be proven redundant.
+// Currently handled:
+//  - read-before-write (within same BB, no calls between)
+//  - not captured variables
+//
+// We do not handle some of the patterns that should not survive
+// after the classic compiler optimizations.
+// E.g. two reads from the same temp should be eliminated by CSE,
+// two writes should be eliminated by DSE, etc.
+//
+// 'Local' is a vector of insns within the same BB (no calls between).
+// 'All' is a vector of insns that will be instrumented.
+void ThreadSanitizer::chooseInstructionsToInstrument(
+    SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All,
+    const DataLayout &DL) {
+  SmallPtrSet<Value*, 8> WriteTargets;
+  // Iterate from the end.
+  for (Instruction *I : reverse(Local)) {
+    if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+      Value *Addr = Store->getPointerOperand();
+      if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
+        continue;
+      WriteTargets.insert(Addr);
+    } else {
+      LoadInst *Load = cast<LoadInst>(I);
+      Value *Addr = Load->getPointerOperand();
+      if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
+        continue;
+      if (WriteTargets.count(Addr)) {
+        // We will write to this temp, so no reason to analyze the read.
+        NumOmittedReadsBeforeWrite++;
+        continue;
+      }
+      if (addrPointsToConstantData(Addr)) {
+        // Addr points to some constant data -- it cannot race with any writes.
+        continue;
+      }
+    }
+    Value *Addr = isa<StoreInst>(*I)
+        ?
cast<StoreInst>(I)->getPointerOperand()
+                      : cast<LoadInst>(I)->getPointerOperand();
+    if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
+        !PointerMayBeCaptured(Addr, true, true)) {
+      // The variable is addressable but not captured, so it cannot be
+      // referenced from a different thread and participate in a data race
+      // (see llvm/Analysis/CaptureTracking.h for details).
+      NumOmittedNonCaptured++;
+      continue;
+    }
+    All.push_back(I);
+  }
+  Local.clear();
+}
+
+static bool isAtomic(Instruction *I) {
+  // TODO: Ask TTI whether synchronization scope is between threads.
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
+  if (isa<AtomicRMWInst>(I))
+    return true;
+  if (isa<AtomicCmpXchgInst>(I))
+    return true;
+  if (isa<FenceInst>(I))
+    return true;
+  return false;
+}
+
+void ThreadSanitizer::InsertRuntimeIgnores(Function &F) {
+  IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+  IRB.CreateCall(TsanIgnoreBegin);
+  EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions);
+  while (IRBuilder<> *AtExit = EE.Next()) {
+    AtExit->CreateCall(TsanIgnoreEnd);
+  }
+}
+
+bool ThreadSanitizer::sanitizeFunction(Function &F,
+                                       const TargetLibraryInfo &TLI) {
+  // This is required to prevent instrumenting the call to __tsan_init from
+  // within the module constructor.
+  if (F.getName() == kTsanModuleCtorName)
+    return false;
+  initialize(*F.getParent());
+  SmallVector<Instruction*, 8> AllLoadsAndStores;
+  SmallVector<Instruction*, 8> LocalLoadsAndStores;
+  SmallVector<Instruction*, 8> AtomicAccesses;
+  SmallVector<Instruction*, 8> MemIntrinCalls;
+  bool Res = false;
+  bool HasCalls = false;
+  bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // Traverse all instructions, collect loads/stores/returns, check for calls.
+  for (auto &BB : F) {
+    for (auto &Inst : BB) {
+      if (isAtomic(&Inst))
+        AtomicAccesses.push_back(&Inst);
+      else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+        LocalLoadsAndStores.push_back(&Inst);
+      else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+        if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+          maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
+        if (isa<MemIntrinsic>(Inst))
+          MemIntrinCalls.push_back(&Inst);
+        HasCalls = true;
+        chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores,
+                                       DL);
+      }
+    }
+    chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL);
+  }
+
+  // We have collected all loads and stores.
+  // FIXME: many of these accesses do not need to be checked for races
+  // (e.g. variables that do not escape, etc.).
+
+  // Instrument memory accesses only if we want to report bugs in the function.
+  if (ClInstrumentMemoryAccesses && SanitizeFunction)
+    for (auto Inst : AllLoadsAndStores) {
+      Res |= instrumentLoadOrStore(Inst, DL);
+    }
+
+  // Instrument atomic memory accesses in any case (they can be used to
+  // implement synchronization).
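+  // For example (illustrative IR, not from this patch): a flag-publishing
+  // store such as
+  //   store atomic i32 1, i32* @flag seq_cst, align 4
+  // synchronizes other threads even when this function is not attributed
+  // sanitize_thread, so it is rewritten to a call like
+  //   __tsan_atomic32_store(ptr, 1, 5)
+  // where 5 is the seq_cst constant produced by createOrdering() below.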
+ if (ClInstrumentAtomics) + for (auto Inst : AtomicAccesses) { + Res |= instrumentAtomic(Inst, DL); + } + + if (ClInstrumentMemIntrinsics && SanitizeFunction) + for (auto Inst : MemIntrinCalls) { + Res |= instrumentMemIntrinsic(Inst); + } + + if (F.hasFnAttribute("sanitize_thread_no_checking_at_run_time")) { + assert(!F.hasFnAttribute(Attribute::SanitizeThread)); + if (HasCalls) + InsertRuntimeIgnores(F); + } + + // Instrument function entry/exit points if there were instrumented accesses. + if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + Value *ReturnAddress = IRB.CreateCall( + Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), + IRB.getInt32(0)); + IRB.CreateCall(TsanFuncEntry, ReturnAddress); + + EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); + while (IRBuilder<> *AtExit = EE.Next()) { + AtExit->CreateCall(TsanFuncExit, {}); + } + Res = true; + } + return Res; +} + +bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I, + const DataLayout &DL) { + IRBuilder<> IRB(I); + bool IsWrite = isa<StoreInst>(*I); + Value *Addr = IsWrite + ? cast<StoreInst>(I)->getPointerOperand() + : cast<LoadInst>(I)->getPointerOperand(); + + // swifterror memory addresses are mem2reg promoted by instruction selection. + // As such they cannot have regular uses like an instrumentation function and + // it makes no sense to track them as memory. + if (Addr->isSwiftError()) + return false; + + int Idx = getMemoryAccessFuncIndex(Addr, DL); + if (Idx < 0) + return false; + if (IsWrite && isVtableAccess(I)) { + LLVM_DEBUG(dbgs() << " VPTR : " << *I << "\n"); + Value *StoredValue = cast<StoreInst>(I)->getValueOperand(); + // StoredValue may be a vector type if we are storing several vptrs at once. + // In this case, just take the first element of the vector since this is + // enough to find vptr races. + if (isa<VectorType>(StoredValue->getType())) + StoredValue = IRB.CreateExtractElement( + StoredValue, ConstantInt::get(IRB.getInt32Ty(), 0)); + if (StoredValue->getType()->isIntegerTy()) + StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy()); + // Call TsanVptrUpdate. + IRB.CreateCall(TsanVptrUpdate, + {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())}); + NumInstrumentedVtableWrites++; + return true; + } + if (!IsWrite && isVtableAccess(I)) { + IRB.CreateCall(TsanVptrLoad, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); + NumInstrumentedVtableReads++; + return true; + } + const unsigned Alignment = IsWrite + ? cast<StoreInst>(I)->getAlignment() + : cast<LoadInst>(I)->getAlignment(); + Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType(); + const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy); + FunctionCallee OnAccessFunc = nullptr; + if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) + OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; + else + OnAccessFunc = IsWrite ? 
TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
+  IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+  if (IsWrite) NumInstrumentedWrites++;
+  else NumInstrumentedReads++;
+  return true;
+}
+
+static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+  uint32_t v = 0;
+  switch (ord) {
+    case AtomicOrdering::NotAtomic:
+      llvm_unreachable("unexpected atomic ordering!");
+    case AtomicOrdering::Unordered: LLVM_FALLTHROUGH;
+    case AtomicOrdering::Monotonic: v = 0; break;
+    // Not specified yet:
+    // case AtomicOrdering::Consume: v = 1; break;
+    case AtomicOrdering::Acquire: v = 2; break;
+    case AtomicOrdering::Release: v = 3; break;
+    case AtomicOrdering::AcquireRelease: v = 4; break;
+    case AtomicOrdering::SequentiallyConsistent: v = 5; break;
+  }
+  return IRB->getInt32(v);
+}
+
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated);
+// instead, we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since tsan is running after everyone else, the calls should not be
+// replaced back with intrinsics. If that becomes wrong at some point,
+// we will need to call e.g. __tsan_memset to avoid the intrinsics.
+bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
+  IRBuilder<> IRB(I);
+  if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+    IRB.CreateCall(
+        MemsetFn,
+        {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+         IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    I->eraseFromParent();
+  } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+    IRB.CreateCall(
+        isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+        {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    I->eraseFromParent();
+  }
+  return false;
+}
+
+// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
+// standards. For background, see the C++11 standard.
A slightly older, publicly +// available draft of the standard (not entirely up-to-date, but close enough +// for casual browsing) is available here: +// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf +// The following page contains more background information: +// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/ + +bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { + IRBuilder<> IRB(I); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Value *Addr = LI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr, DL); + if (Idx < 0) + return false; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + createOrdering(&IRB, LI->getOrdering())}; + Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType(); + Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args); + Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy); + I->replaceAllUsesWith(Cast); + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Value *Addr = SI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr, DL); + if (Idx < 0) + return false; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty), + createOrdering(&IRB, SI->getOrdering())}; + CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args); + ReplaceInstWithInst(I, C); + } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) { + Value *Addr = RMWI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr, DL); + if (Idx < 0) + return false; + FunctionCallee F = TsanAtomicRMW[RMWI->getOperation()][Idx]; + if (!F) + return false; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + IRB.CreateIntCast(RMWI->getValOperand(), Ty, false), + createOrdering(&IRB, RMWI->getOrdering())}; + CallInst *C = CallInst::Create(F, Args); + ReplaceInstWithInst(I, C); + } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) { + Value *Addr = CASI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr, DL); + if (Idx < 0) + return false; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *CmpOperand = + IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty); + Value *NewOperand = + IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + CmpOperand, + NewOperand, + createOrdering(&IRB, CASI->getSuccessOrdering()), + createOrdering(&IRB, CASI->getFailureOrdering())}; + CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args); + Value *Success = IRB.CreateICmpEQ(C, CmpOperand); + Value *OldVal = C; + Type *OrigOldValTy = CASI->getNewValOperand()->getType(); + if (Ty != OrigOldValTy) { + // The value is a pointer, so we need to cast the return value. 
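+      // An illustrative case (hypothetical, not from this patch): for a
+      // cmpxchg on an i8** slot, the runtime call returns the old value as
+      // an i64, and the inttoptr below turns it back into i8* before the
+      // { i8*, i1 } result pair is rebuilt by the insertvalues that follow.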
+ OldVal = IRB.CreateIntToPtr(C, OrigOldValTy); + } + + Value *Res = + IRB.CreateInsertValue(UndefValue::get(CASI->getType()), OldVal, 0); + Res = IRB.CreateInsertValue(Res, Success, 1); + + I->replaceAllUsesWith(Res); + I->eraseFromParent(); + } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) { + Value *Args[] = {createOrdering(&IRB, FI->getOrdering())}; + FunctionCallee F = FI->getSyncScopeID() == SyncScope::SingleThread + ? TsanAtomicSignalFence + : TsanAtomicThreadFence; + CallInst *C = CallInst::Create(F, Args); + ReplaceInstWithInst(I, C); + } + return true; +} + +int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr, + const DataLayout &DL) { + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); + assert(OrigTy->isSized()); + uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy); + if (TypeSize != 8 && TypeSize != 16 && + TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + NumAccessesWithBadSize++; + // Ignore all unusual sizes. + return -1; + } + size_t Idx = countTrailingZeros(TypeSize / 8); + assert(Idx < kNumberOfAccessSizes); + return Idx; +} diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp new file mode 100644 index 000000000000..604726d4f40f --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -0,0 +1,78 @@ +//===- ValueProfileCollector.cpp - determine what to value profile --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl +// +//===----------------------------------------------------------------------===// + +#include "ValueProfilePlugins.inc" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" + +#include <cassert> + +using namespace llvm; + +namespace { + +/// A plugin-based class that takes an arbitrary number of Plugin types. +/// Each plugin type must satisfy the following API: +/// 1) the constructor must take a `Function &f`. Typically, the plugin would +/// scan the function looking for candidates. +/// 2) contain a member function with the following signature and name: +/// void run(std::vector<CandidateInfo> &Candidates); +/// such that the plugin would append its result into the vector parameter. +/// +/// Plugins are defined in ValueProfilePlugins.inc +template <class... Ts> class PluginChain; + +/// The type PluginChainFinal is the final chain of plugins that will be used by +/// ValueProfileCollectorImpl. +using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>; + +template <> class PluginChain<> { +public: + PluginChain(Function &F) {} + void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {} +}; + +template <class PluginT, class... 
Ts>
+class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> {
+  PluginT Plugin;
+  using Base = PluginChain<Ts...>;
+
+public:
+  PluginChain(Function &F) : PluginChain<Ts...>(F), Plugin(F) {}
+
+  void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {
+    if (K == PluginT::Kind)
+      Plugin.run(Candidates);
+    Base::get(K, Candidates);
+  }
+};
+
+} // end anonymous namespace
+
+/// ValueProfileCollectorImpl inherits the API of PluginChainFinal.
+class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal {
+public:
+  using PluginChainFinal::PluginChainFinal;
+};
+
+ValueProfileCollector::ValueProfileCollector(Function &F)
+    : PImpl(new ValueProfileCollectorImpl(F)) {}
+
+ValueProfileCollector::~ValueProfileCollector() = default;
+
+std::vector<CandidateInfo>
+ValueProfileCollector::get(InstrProfValueKind Kind) const {
+  std::vector<CandidateInfo> Result;
+  PImpl->get(Kind, Result);
+  return Result;
+}
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
new file mode 100644
index 000000000000..ff883c8d0c77
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
@@ -0,0 +1,79 @@
+//===- ValueProfileCollector.h - determine what to value profile ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a utility class, ValueProfileCollector, that is used to
+// determine what kind of llvm::Value's are worth value-profiling, at which
+// point in the program, and which instruction holds the Value Profile metadata.
+// Currently, the only users of this utility are the PGOInstrumentation[Gen|Use]
+// passes.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+
+namespace llvm {
+
+/// Utility analysis that determines what values are worth profiling.
+/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to
+/// populate the Candidates vector.
+///
+/// Value profiling an expression means tracking the values that this
+/// expression takes at runtime and the frequency of each value.
+/// It is important to distinguish between two sets of value profiles for a
+/// particular expression:
+///  1) The set of values at the point of evaluation.
+///  2) The set of values at the point of use.
+/// In some cases, the two sets are identical, but it's not unusual for the two
+/// to differ.
+///
+/// To elaborate more, consider this C code, and focus on the expression `nn`:
+///  void foo(int nn, bool b) {
+///    if (b) memcpy(x, y, nn);
+///  }
+/// The point of evaluation can be as early as the start of the function, and
+/// let's say the value profile for `nn` is:
+///  total=100; (value,freq) set = {(8,10), (32,50)}
+/// The point of use is right before we call memcpy, and since we execute the
+/// memcpy conditionally, the value profile of `nn` can be:
+///  total=15; (value,freq) set = {(8,10), (4,5)}
+///
+/// For this reason, a plugin is responsible for computing the insertion point
+/// for each value to be profiled.
The `CandidateInfo` structure encapsulates
+/// all the information needed for each value profile site.
+class ValueProfileCollector {
+public:
+  struct CandidateInfo {
+    Value *V;                   // The value to profile.
+    Instruction *InsertPt;      // Insert the VP lib call before this instr.
+    Instruction *AnnotatedInst; // Where metadata is attached.
+  };
+
+  ValueProfileCollector(Function &Fn);
+  ValueProfileCollector(ValueProfileCollector &&) = delete;
+  ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
+
+  ValueProfileCollector(const ValueProfileCollector &) = delete;
+  ValueProfileCollector &operator=(const ValueProfileCollector &) = delete;
+  ~ValueProfileCollector();
+
+  /// Returns a list of value profiling candidates of the given kind.
+  std::vector<CandidateInfo> get(InstrProfValueKind Kind) const;
+
+private:
+  class ValueProfileCollectorImpl;
+  std::unique_ptr<ValueProfileCollectorImpl> PImpl;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
new file mode 100644
index 000000000000..4cc4c6c848c3
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -0,0 +1,75 @@
+//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a set of plugin classes used in ValueProfileCollectorImpl.
+// Each plugin is responsible for collecting Value Profiling candidates for a
+// particular optimization.
+// Each plugin must satisfy the interface described in ValueProfileCollector.cpp
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueProfileCollector.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+using CandidateInfo = ValueProfileCollector::CandidateInfo;
+
+///--------------------------- MemIntrinsicPlugin ------------------------------
+class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
+  Function &F;
+  std::vector<CandidateInfo> *Candidates;
+
+public:
+  static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
+
+  MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {}
+
+  void run(std::vector<CandidateInfo> &Cs) {
+    Candidates = &Cs;
+    visit(F);
+    Candidates = nullptr;
+  }
+  void visitMemIntrinsic(MemIntrinsic &MI) {
+    Value *Length = MI.getLength();
+    // Do not instrument constant-length calls.
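+    // For instance (hypothetical call sites): memcpy(d, s, 16) is skipped
+    // here, while memcpy(d, s, n) makes `n` an IPVK_MemOPSize candidate.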
+    if (isa<ConstantInt>(Length))
+      return;
+
+    Instruction *InsertPt = &MI;
+    Instruction *AnnotatedInst = &MI;
+    Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+  }
+};
+
+///------------------------ IndirectCallPromotionPlugin ------------------------
+class IndirectCallPromotionPlugin {
+  Function &F;
+
+public:
+  static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
+
+  IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {}
+
+  void run(std::vector<CandidateInfo> &Candidates) {
+    std::vector<Instruction *> Result = findIndirectCalls(F);
+    for (Instruction *I : Result) {
+      Value *Callee = CallSite(I).getCalledValue();
+      Instruction *InsertPt = I;
+      Instruction *AnnotatedInst = I;
+      Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst});
+    }
+  }
+};
+
+///----------------------- Registration of the plugins -------------------------
+/// For now, registering a plugin with the ValueProfileCollector is done by
+/// adding the plugin type to the VP_PLUGIN_LIST macro.
+#define VP_PLUGIN_LIST \
+  MemIntrinsicPlugin, \
+  IndirectCallPromotionPlugin
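The chain in ValueProfileCollector.cpp is duck-typed: any class with a `Function &` constructor, a static `Kind` member, and a `run(std::vector<CandidateInfo> &)` method can be appended to VP_PLUGIN_LIST. As a minimal sketch of a third plugin (the class name and the memset-size idea are hypothetical, not part of this patch, and assume the same headers as ValueProfilePlugins.inc; only the three interface members are prescribed):

  // Sketch only: a hypothetical plugin that profiles the size operand of
  // memset calls. MemIntrinsicPlugin already covers this case; the class
  // exists purely to illustrate the required shape.
  class HypotheticalMemSetSizePlugin
      : public InstVisitor<HypotheticalMemSetSizePlugin> {
    Function &F;
    std::vector<CandidateInfo> *Candidates = nullptr;

  public:
    // The collector dispatches on this kind; IPVK_MemOPSize is reused here
    // purely for illustration.
    static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;

    HypotheticalMemSetSizePlugin(Function &Fn) : F(Fn) {}

    void run(std::vector<CandidateInfo> &Cs) {
      Candidates = &Cs;
      visit(F);
      Candidates = nullptr;
    }

    void visitMemSetInst(MemSetInst &MS) {
      Value *Length = MS.getLength();
      if (isa<ConstantInt>(Length)) // constant sizes carry no profile value
        return;
      Candidates->emplace_back(CandidateInfo{Length, &MS, &MS});
    }
  };

Registration would then be a one-line change: append HypotheticalMemSetSizePlugin to VP_PLUGIN_LIST, after which PluginChainFinal constructs and dispatches it exactly like the two plugins above.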

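For reference, the slot arithmetic in ThreadSanitizer::getMemoryAccessFuncIndex above reduces to log2 of the access size in bytes, pairing each slot with the __tsan_read1/2/4/8/16 and __tsan_write1/2/4/8/16 families declared earlier in that file. A self-contained C++14 sketch of that mapping (plain standard library instead of LLVM's countTrailingZeros; the function name is invented):

  #include <cstdint>

  // Maps a power-of-two access size in bits (8..128) to the runtime-call
  // slot index, mirroring countTrailingZeros(TypeSize / 8).
  constexpr int accessFuncIndex(uint32_t TypeSizeInBits) {
    uint32_t Bytes = TypeSizeInBits / 8;
    int Idx = 0;
    while (Bytes > 1) { // trailing-zero count for a power of two
      Bytes >>= 1;
      ++Idx;
    }
    return Idx;
  }

  static_assert(accessFuncIndex(8) == 0, "i8   -> __tsan_read1/__tsan_write1");
  static_assert(accessFuncIndex(32) == 2, "i32  -> __tsan_read4/__tsan_write4");
  static_assert(accessFuncIndex(128) == 4, "i128 -> __tsan_read16/__tsan_write16");

Any other size trips the "unusual sizes" bailout in getMemoryAccessFuncIndex, which returns -1 and bumps NumAccessesWithBadSize instead of instrumenting.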