author | Dimitry Andric <dim@FreeBSD.org> | 2017-04-20 21:20:59 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-04-20 21:20:59 +0000
commit | f351c8a560ddc5b5df9ee5ba4ccc1cfb9029146d (patch)
tree | a1af403c7ce4e7447ee7e01c045d260dba9a409b /lib
parent | ab0bf875a5f328a6710f4e48258979ae1bc8da1c (diff)
download | src-test2-f351c8a560ddc5b5df9ee5ba4ccc1cfb9029146d.tar.gz, src-test2-f351c8a560ddc5b5df9ee5ba4ccc1cfb9029146d.zip
Diffstat (limited to 'lib')
43 files changed, 774 insertions, 832 deletions
diff --git a/lib/asan/asan_thread.cc b/lib/asan/asan_thread.cc index aaa32d6ea6da..f41ee2df2d96 100644 --- a/lib/asan/asan_thread.cc +++ b/lib/asan/asan_thread.cc @@ -237,7 +237,7 @@ void AsanThread::Init() { } thread_return_t AsanThread::ThreadStart( - uptr os_id, atomic_uintptr_t *signal_thread_is_registered) { + tid_t os_id, atomic_uintptr_t *signal_thread_is_registered) { Init(); asanThreadRegistry().StartThread(tid(), os_id, /*workerthread*/ false, nullptr); @@ -395,7 +395,7 @@ void EnsureMainThreadIDIsCorrect() { context->os_id = GetTid(); } -__asan::AsanThread *GetAsanThreadByOsIDLocked(uptr os_id) { +__asan::AsanThread *GetAsanThreadByOsIDLocked(tid_t os_id) { __asan::AsanThreadContext *context = static_cast<__asan::AsanThreadContext *>( __asan::asanThreadRegistry().FindThreadContextByOsIDLocked(os_id)); if (!context) return nullptr; @@ -405,7 +405,7 @@ __asan::AsanThread *GetAsanThreadByOsIDLocked(uptr os_id) { // --- Implementation of LSan-specific functions --- {{{1 namespace __lsan { -bool GetThreadRangesLocked(uptr os_id, uptr *stack_begin, uptr *stack_end, +bool GetThreadRangesLocked(tid_t os_id, uptr *stack_begin, uptr *stack_end, uptr *tls_begin, uptr *tls_end, uptr *cache_begin, uptr *cache_end, DTLS **dtls) { __asan::AsanThread *t = __asan::GetAsanThreadByOsIDLocked(os_id); @@ -421,7 +421,7 @@ bool GetThreadRangesLocked(uptr os_id, uptr *stack_begin, uptr *stack_end, return true; } -void ForEachExtraStackRange(uptr os_id, RangeIteratorCallback callback, +void ForEachExtraStackRange(tid_t os_id, RangeIteratorCallback callback, void *arg) { __asan::AsanThread *t = __asan::GetAsanThreadByOsIDLocked(os_id); if (t && t->has_fake_stack()) diff --git a/lib/asan/asan_thread.h b/lib/asan/asan_thread.h index f53dfb712449..424f9e68dfea 100644 --- a/lib/asan/asan_thread.h +++ b/lib/asan/asan_thread.h @@ -63,7 +63,7 @@ class AsanThread { void Destroy(); void Init(); // Should be called from the thread itself. - thread_return_t ThreadStart(uptr os_id, + thread_return_t ThreadStart(tid_t os_id, atomic_uintptr_t *signal_thread_is_registered); uptr stack_top(); diff --git a/lib/asan/tests/asan_test_main.cc b/lib/asan/tests/asan_test_main.cc index 1071d4474674..0c1b93c7fda7 100644 --- a/lib/asan/tests/asan_test_main.cc +++ b/lib/asan/tests/asan_test_main.cc @@ -13,15 +13,23 @@ #include "asan_test_utils.h" #include "sanitizer_common/sanitizer_platform.h" -// Default ASAN_OPTIONS for the unit tests. Let's turn symbolication off to -// speed up testing (unit tests don't use it anyway). +// Default ASAN_OPTIONS for the unit tests. extern "C" const char* __asan_default_options() { #if SANITIZER_MAC // On Darwin, we default to `abort_on_error=1`, which would make tests run - // much slower. Let's override this and run lit tests with 'abort_on_error=0'. - // Also, make sure we do not overwhelm the syslog while testing. + // much slower. Let's override this and run lit tests with 'abort_on_error=0' + // and make sure we do not overwhelm the syslog while testing. Also, let's + // turn symbolization off to speed up testing, especially when not running + // with llvm-symbolizer but with atos. return "symbolize=false:abort_on_error=0:log_to_syslog=0"; +#elif SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT + // On PowerPC and ARM Thumb, a couple tests involving pthread_exit fail due to + // leaks detected by LSan. Symbolized leak report is required to apply a + // suppression for this known problem. 
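The `__asan_default_options()` hunk above (continued just below) relies on a weak hook from ASan's public interface: a program may define this function to bake in default options, and `ASAN_OPTIONS` set at run time still overrides them. A minimal sketch of the same hook in user code; the option string here is only an example, not the runtime's default:

```cpp
// Build with: clang++ -fsanitize=address default_options_demo.cc
// The weak hook is real ASan interface; the runtime queries it at startup,
// before applying ASAN_OPTIONS from the environment.
extern "C" const char *__asan_default_options() {
  // Skip symbolization for speed, as the unit-test harness above does.
  return "symbolize=false";
}

int main() { return 0; }
```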
+ return ""; #else + // Let's turn symbolization off to speed up testing (more than 3 times speedup + // observed). return "symbolize=false"; #endif } diff --git a/lib/dfsan/done_abilist.txt b/lib/dfsan/done_abilist.txt index a00dc5426cd0..cbbedbc33601 100644 --- a/lib/dfsan/done_abilist.txt +++ b/lib/dfsan/done_abilist.txt @@ -285,22 +285,8 @@ fun:__sanitizer_cov_module_init=uninstrumented fun:__sanitizer_cov_module_init=discard fun:__sanitizer_cov_with_check=uninstrumented fun:__sanitizer_cov_with_check=discard -fun:__sanitizer_cov_indir_call16=uninstrumented -fun:__sanitizer_cov_indir_call16=discard -fun:__sanitizer_cov_indir_call16=uninstrumented -fun:__sanitizer_cov_indir_call16=discard -fun:__sanitizer_reset_coverage=uninstrumented -fun:__sanitizer_reset_coverage=discard fun:__sanitizer_set_death_callback=uninstrumented fun:__sanitizer_set_death_callback=discard -fun:__sanitizer_get_coverage_guards=uninstrumented -fun:__sanitizer_get_coverage_guards=discard -fun:__sanitizer_get_number_of_counters=uninstrumented -fun:__sanitizer_get_number_of_counters=discard -fun:__sanitizer_update_counter_bitset_and_clear_counters=uninstrumented -fun:__sanitizer_update_counter_bitset_and_clear_counters=discard -fun:__sanitizer_get_total_unique_coverage=uninstrumented -fun:__sanitizer_get_total_unique_coverage=discard fun:__sanitizer_get_total_unique_coverage=uninstrumented fun:__sanitizer_get_total_unique_coverage=discard fun:__sanitizer_update_counter_bitset_and_clear_counters=uninstrumented diff --git a/lib/lsan/lsan_common.cc b/lib/lsan/lsan_common.cc index 6cc73749812b..200f16a594fa 100644 --- a/lib/lsan/lsan_common.cc +++ b/lib/lsan/lsan_common.cc @@ -68,6 +68,14 @@ ALIGNED(64) static char suppression_placeholder[sizeof(SuppressionContext)]; static SuppressionContext *suppression_ctx = nullptr; static const char kSuppressionLeak[] = "leak"; static const char *kSuppressionTypes[] = { kSuppressionLeak }; +static const char kStdSuppressions[] = +#if SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT + // The actual string allocation happens here (for more details refer to the + // SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT definition). + "leak:*_dl_map_object_deps*"; +#else + ""; +#endif // SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT void InitializeSuppressions() { CHECK_EQ(nullptr, suppression_ctx); @@ -76,6 +84,7 @@ void InitializeSuppressions() { suppression_ctx->ParseFromFile(flags()->suppressions); if (&__lsan_default_suppressions) suppression_ctx->Parse(__lsan_default_suppressions()); + suppression_ctx->Parse(kStdSuppressions); } static SuppressionContext *GetSuppressionContext() { @@ -83,12 +92,9 @@ static SuppressionContext *GetSuppressionContext() { return suppression_ctx; } -struct RootRegion { - const void *begin; - uptr size; -}; +static InternalMmapVector<RootRegion> *root_regions; -InternalMmapVector<RootRegion> *root_regions; +InternalMmapVector<RootRegion> const *GetRootRegions() { return root_regions; } void InitializeRootRegions() { CHECK(!root_regions); @@ -200,11 +206,11 @@ void ForEachExtraStackRangeCb(uptr begin, uptr end, void* arg) { // Scans thread data (stacks and TLS) for heap pointers. 
static void ProcessThreads(SuspendedThreadsList const &suspended_threads, Frontier *frontier) { - InternalScopedBuffer<uptr> registers(SuspendedThreadsList::RegisterCount()); + InternalScopedBuffer<uptr> registers(suspended_threads.RegisterCount()); uptr registers_begin = reinterpret_cast<uptr>(registers.data()); uptr registers_end = registers_begin + registers.size(); - for (uptr i = 0; i < suspended_threads.thread_count(); i++) { - uptr os_id = static_cast<uptr>(suspended_threads.GetThreadID(i)); + for (uptr i = 0; i < suspended_threads.ThreadCount(); i++) { + tid_t os_id = static_cast<tid_t>(suspended_threads.GetThreadID(i)); LOG_THREADS("Processing thread %d.\n", os_id); uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end; DTLS *dtls; @@ -291,23 +297,29 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, } } -static void ProcessRootRegion(Frontier *frontier, uptr root_begin, - uptr root_end) { - MemoryMappingLayout proc_maps(/*cache_enabled*/true); +void ScanRootRegion(Frontier *frontier, const RootRegion &root_region, + uptr region_begin, uptr region_end, uptr prot) { + uptr intersection_begin = Max(root_region.begin, region_begin); + uptr intersection_end = Min(region_end, root_region.begin + root_region.size); + if (intersection_begin >= intersection_end) return; + bool is_readable = prot & MemoryMappingLayout::kProtectionRead; + LOG_POINTERS("Root region %p-%p intersects with mapped region %p-%p (%s)\n", + root_region.begin, root_region.begin + root_region.size, + region_begin, region_end, + is_readable ? "readable" : "unreadable"); + if (is_readable) + ScanRangeForPointers(intersection_begin, intersection_end, frontier, "ROOT", + kReachable); +} + +static void ProcessRootRegion(Frontier *frontier, + const RootRegion &root_region) { + MemoryMappingLayout proc_maps(/*cache_enabled*/ true); uptr begin, end, prot; while (proc_maps.Next(&begin, &end, /*offset*/ nullptr, /*filename*/ nullptr, /*filename_size*/ 0, &prot)) { - uptr intersection_begin = Max(root_begin, begin); - uptr intersection_end = Min(end, root_end); - if (intersection_begin >= intersection_end) continue; - bool is_readable = prot & MemoryMappingLayout::kProtectionRead; - LOG_POINTERS("Root region %p-%p intersects with mapped region %p-%p (%s)\n", - root_begin, root_end, begin, end, - is_readable ? "readable" : "unreadable"); - if (is_readable) - ScanRangeForPointers(intersection_begin, intersection_end, frontier, - "ROOT", kReachable); + ScanRootRegion(frontier, root_region, begin, end, prot); } } @@ -316,9 +328,7 @@ static void ProcessRootRegions(Frontier *frontier) { if (!flags()->use_root_regions) return; CHECK(root_regions); for (uptr i = 0; i < root_regions->size(); i++) { - RootRegion region = (*root_regions)[i]; - uptr begin_addr = reinterpret_cast<uptr>(region.begin); - ProcessRootRegion(frontier, begin_addr, begin_addr + region.size); + ProcessRootRegion(frontier, (*root_regions)[i]); } } @@ -356,6 +366,72 @@ static void CollectIgnoredCb(uptr chunk, void *arg) { } } +static uptr GetCallerPC(u32 stack_id, StackDepotReverseMap *map) { + CHECK(stack_id); + StackTrace stack = map->Get(stack_id); + // The top frame is our malloc/calloc/etc. The next frame is the caller. + if (stack.size >= 2) + return stack.trace[1]; + return 0; +} + +struct InvalidPCParam { + Frontier *frontier; + StackDepotReverseMap *stack_depot_reverse_map; + bool skip_linker_allocations; +}; + +// ForEachChunk callback. If the caller pc is invalid or is within the linker, +// mark as reachable. 
Called by ProcessPlatformSpecificAllocations. +static void MarkInvalidPCCb(uptr chunk, void *arg) { + CHECK(arg); + InvalidPCParam *param = reinterpret_cast<InvalidPCParam *>(arg); + chunk = GetUserBegin(chunk); + LsanMetadata m(chunk); + if (m.allocated() && m.tag() != kReachable && m.tag() != kIgnored) { + u32 stack_id = m.stack_trace_id(); + uptr caller_pc = 0; + if (stack_id > 0) + caller_pc = GetCallerPC(stack_id, param->stack_depot_reverse_map); + // If caller_pc is unknown, this chunk may be allocated in a coroutine. Mark + // it as reachable, as we can't properly report its allocation stack anyway. + if (caller_pc == 0 || (param->skip_linker_allocations && + GetLinker()->containsAddress(caller_pc))) { + m.set_tag(kReachable); + param->frontier->push_back(chunk); + } + } +} + +// On Linux, handles dynamically allocated TLS blocks by treating all chunks +// allocated from ld-linux.so as reachable. +// Dynamic TLS blocks contain the TLS variables of dynamically loaded modules. +// They are allocated with a __libc_memalign() call in allocate_and_init() +// (elf/dl-tls.c). Glibc won't tell us the address ranges occupied by those +// blocks, but we can make sure they come from our own allocator by intercepting +// __libc_memalign(). On top of that, there is no easy way to reach them. Their +// addresses are stored in a dynamically allocated array (the DTV) which is +// referenced from the static TLS. Unfortunately, we can't just rely on the DTV +// being reachable from the static TLS, and the dynamic TLS being reachable from +// the DTV. This is because the initial DTV is allocated before our interception +// mechanism kicks in, and thus we don't recognize it as allocated memory. We +// can't special-case it either, since we don't know its size. +// Our solution is to include in the root set all allocations made from +// ld-linux.so (which is where allocate_and_init() is implemented). This is +// guaranteed to include all dynamic TLS blocks (and possibly other allocations +// which we don't care about). +// On all other platforms, this simply checks to ensure that the caller pc is +// valid before reporting chunks as leaked. +void ProcessPC(Frontier *frontier) { + StackDepotReverseMap stack_depot_reverse_map; + InvalidPCParam arg; + arg.frontier = frontier; + arg.stack_depot_reverse_map = &stack_depot_reverse_map; + arg.skip_linker_allocations = + flags()->use_tls && flags()->use_ld_allocations && GetLinker() != nullptr; + ForEachChunk(MarkInvalidPCCb, &arg); +} + // Sets the appropriate tag on each chunk. static void ClassifyAllChunks(SuspendedThreadsList const &suspended_threads) { // Holds the flood fill frontier. @@ -367,11 +443,13 @@ static void ClassifyAllChunks(SuspendedThreadsList const &suspended_threads) { ProcessRootRegions(&frontier); FloodFillTag(&frontier, kReachable); + CHECK_EQ(0, frontier.size()); + ProcessPC(&frontier); + // The check here is relatively expensive, so we do this in a separate flood // fill. That way we can skip the check for chunks that are reachable // otherwise. 
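`GetCallerPC()` above relies on a fixed layout of the depot trace: frame 0 is the intercepted allocator entry point, frame 1 is its caller, and anything shorter than two frames yields no usable caller. A self-contained sketch of that lookup, with a simplified `StackTrace` standing in for the sanitizer type:

```cpp
#include <cstdint>
#include <cstdio>

// Simplified stand-in for __sanitizer::StackTrace (assumption: only the
// fields the lookup needs).
struct StackTrace {
  const uintptr_t *trace;  // PCs, innermost frame first
  unsigned size;
};

// Frame 0 is the intercepted malloc/calloc/..., frame 1 its caller; return 0
// when the unwind was too short to identify a caller.
uintptr_t GetCallerPC(const StackTrace &stack) {
  return stack.size >= 2 ? stack.trace[1] : 0;
}

int main() {
  const uintptr_t pcs[] = {0x1000 /* malloc */, 0x2000 /* caller */};
  StackTrace st = {pcs, 2};
  printf("caller pc: 0x%lx\n", (unsigned long)GetCallerPC(st));
}
```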
LOG_POINTERS("Processing platform-specific allocations.\n"); - CHECK_EQ(0, frontier.size()); ProcessPlatformSpecificAllocations(&frontier); FloodFillTag(&frontier, kReachable); @@ -707,7 +785,7 @@ void __lsan_register_root_region(const void *begin, uptr size) { #if CAN_SANITIZE_LEAKS BlockingMutexLock l(&global_mutex); CHECK(root_regions); - RootRegion region = {begin, size}; + RootRegion region = {reinterpret_cast<uptr>(begin), size}; root_regions->push_back(region); VReport(1, "Registered root region at %p of size %llu\n", begin, size); #endif // CAN_SANITIZE_LEAKS @@ -721,7 +799,7 @@ void __lsan_unregister_root_region(const void *begin, uptr size) { bool removed = false; for (uptr i = 0; i < root_regions->size(); i++) { RootRegion region = (*root_regions)[i]; - if (region.begin == begin && region.size == size) { + if (region.begin == reinterpret_cast<uptr>(begin) && region.size == size) { removed = true; uptr last_index = root_regions->size() - 1; (*root_regions)[i] = (*root_regions)[last_index]; diff --git a/lib/lsan/lsan_common.h b/lib/lsan/lsan_common.h index 919be0ec2662..121b9c082983 100644 --- a/lib/lsan/lsan_common.h +++ b/lib/lsan/lsan_common.h @@ -118,6 +118,15 @@ typedef InternalMmapVector<uptr> Frontier; void InitializePlatformSpecificModules(); void ProcessGlobalRegions(Frontier *frontier); void ProcessPlatformSpecificAllocations(Frontier *frontier); + +struct RootRegion { + uptr begin; + uptr size; +}; + +InternalMmapVector<RootRegion> const *GetRootRegions(); +void ScanRootRegion(Frontier *frontier, RootRegion const ®ion, + uptr region_begin, uptr region_end, uptr prot); // Run stoptheworld while holding any platform-specific locks. void DoStopTheWorld(StopTheWorldCallback callback, void* argument); @@ -193,10 +202,10 @@ bool WordIsPoisoned(uptr addr); // Wrappers for ThreadRegistry access. void LockThreadRegistry(); void UnlockThreadRegistry(); -bool GetThreadRangesLocked(uptr os_id, uptr *stack_begin, uptr *stack_end, +bool GetThreadRangesLocked(tid_t os_id, uptr *stack_begin, uptr *stack_end, uptr *tls_begin, uptr *tls_end, uptr *cache_begin, uptr *cache_end, DTLS **dtls); -void ForEachExtraStackRange(uptr os_id, RangeIteratorCallback callback, +void ForEachExtraStackRange(tid_t os_id, RangeIteratorCallback callback, void *arg); // If called from the main thread, updates the main thread's TID in the thread // registry. We need this to handle processes that fork() without a subsequent @@ -212,6 +221,10 @@ uptr PointsIntoChunk(void *p); uptr GetUserBegin(uptr chunk); // Helper for __lsan_ignore_object(). IgnoreObjectResult IgnoreObjectLocked(const void *p); + +// Return the linker module, if valid for the platform. +LoadedModule *GetLinker(); + // Wrapper for chunk metadata operations. class LsanMetadata { public: diff --git a/lib/lsan/lsan_common_linux.cc b/lib/lsan/lsan_common_linux.cc index 0d1e998a5cfe..fadd0263de73 100644 --- a/lib/lsan/lsan_common_linux.cc +++ b/lib/lsan/lsan_common_linux.cc @@ -89,70 +89,9 @@ void ProcessGlobalRegions(Frontier *frontier) { dl_iterate_phdr(ProcessGlobalRegionsCallback, frontier); } -static uptr GetCallerPC(u32 stack_id, StackDepotReverseMap *map) { - CHECK(stack_id); - StackTrace stack = map->Get(stack_id); - // The top frame is our malloc/calloc/etc. The next frame is the caller. 
- if (stack.size >= 2) - return stack.trace[1]; - return 0; -} +LoadedModule *GetLinker() { return linker; } -struct ProcessPlatformAllocParam { - Frontier *frontier; - StackDepotReverseMap *stack_depot_reverse_map; - bool skip_linker_allocations; -}; - -// ForEachChunk callback. Identifies unreachable chunks which must be treated as -// reachable. Marks them as reachable and adds them to the frontier. -static void ProcessPlatformSpecificAllocationsCb(uptr chunk, void *arg) { - CHECK(arg); - ProcessPlatformAllocParam *param = - reinterpret_cast<ProcessPlatformAllocParam *>(arg); - chunk = GetUserBegin(chunk); - LsanMetadata m(chunk); - if (m.allocated() && m.tag() != kReachable && m.tag() != kIgnored) { - u32 stack_id = m.stack_trace_id(); - uptr caller_pc = 0; - if (stack_id > 0) - caller_pc = GetCallerPC(stack_id, param->stack_depot_reverse_map); - // If caller_pc is unknown, this chunk may be allocated in a coroutine. Mark - // it as reachable, as we can't properly report its allocation stack anyway. - if (caller_pc == 0 || (param->skip_linker_allocations && - linker->containsAddress(caller_pc))) { - m.set_tag(kReachable); - param->frontier->push_back(chunk); - } - } -} - -// Handles dynamically allocated TLS blocks by treating all chunks allocated -// from ld-linux.so as reachable. -// Dynamic TLS blocks contain the TLS variables of dynamically loaded modules. -// They are allocated with a __libc_memalign() call in allocate_and_init() -// (elf/dl-tls.c). Glibc won't tell us the address ranges occupied by those -// blocks, but we can make sure they come from our own allocator by intercepting -// __libc_memalign(). On top of that, there is no easy way to reach them. Their -// addresses are stored in a dynamically allocated array (the DTV) which is -// referenced from the static TLS. Unfortunately, we can't just rely on the DTV -// being reachable from the static TLS, and the dynamic TLS being reachable from -// the DTV. This is because the initial DTV is allocated before our interception -// mechanism kicks in, and thus we don't recognize it as allocated memory. We -// can't special-case it either, since we don't know its size. -// Our solution is to include in the root set all allocations made from -// ld-linux.so (which is where allocate_and_init() is implemented). This is -// guaranteed to include all dynamic TLS blocks (and possibly other allocations -// which we don't care about). -void ProcessPlatformSpecificAllocations(Frontier *frontier) { - StackDepotReverseMap stack_depot_reverse_map; - ProcessPlatformAllocParam arg; - arg.frontier = frontier; - arg.stack_depot_reverse_map = &stack_depot_reverse_map; - arg.skip_linker_allocations = - flags()->use_tls && flags()->use_ld_allocations && linker != nullptr; - ForEachChunk(ProcessPlatformSpecificAllocationsCb, &arg); -} +void ProcessPlatformSpecificAllocations(Frontier *frontier) {} struct DoStopTheWorldParam { StopTheWorldCallback callback; diff --git a/lib/lsan/lsan_common_mac.cc b/lib/lsan/lsan_common_mac.cc index 022e73937895..a9adcdfff37f 100644 --- a/lib/lsan/lsan_common_mac.cc +++ b/lib/lsan/lsan_common_mac.cc @@ -22,6 +22,8 @@ #include <pthread.h> +#include <mach/mach.h> + namespace __lsan { typedef struct { @@ -85,6 +87,8 @@ void SetCurrentThread(u32 tid) { get_tls_val(true)->current_thread_id = tid; } AllocatorCache *GetAllocatorCache() { return &get_tls_val(true)->cache; } +LoadedModule *GetLinker() { return nullptr; } + // Required on Linux for initialization of TLS behavior, but should not be // required on Darwin. 
void InitializePlatformSpecificModules() { @@ -106,7 +110,7 @@ void ProcessGlobalRegions(Frontier *frontier) { for (const __sanitizer::LoadedModule::AddressRange &range : modules[i].ranges()) { - if (range.executable) continue; + if (range.executable || !range.readable) continue; ScanGlobalRange(range.beg, range.end, frontier); } @@ -114,11 +118,54 @@ void ProcessGlobalRegions(Frontier *frontier) { } void ProcessPlatformSpecificAllocations(Frontier *frontier) { - CHECK(0 && "unimplemented"); + mach_port_name_t port; + if (task_for_pid(mach_task_self(), internal_getpid(), &port) + != KERN_SUCCESS) { + return; + } + + unsigned depth = 1; + vm_size_t size = 0; + vm_address_t address = 0; + kern_return_t err = KERN_SUCCESS; + mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64; + + InternalMmapVector<RootRegion> const *root_regions = GetRootRegions(); + + while (err == KERN_SUCCESS) { + struct vm_region_submap_info_64 info; + err = vm_region_recurse_64(port, &address, &size, &depth, + (vm_region_info_t)&info, &count); + + uptr end_address = address + size; + + // libxpc stashes some pointers in the Kernel Alloc Once page, + // make sure not to report those as leaks. + if (info.user_tag == VM_MEMORY_OS_ALLOC_ONCE) { + ScanRangeForPointers(address, end_address, frontier, "GLOBAL", + kReachable); + } + + // This additional root region scan is required on Darwin in order to + // detect root regions contained within mmap'd memory regions, because + // the Darwin implementation of sanitizer_procmaps traverses images + // as loaded by dyld, and not the complete set of all memory regions. + // + // TODO(fjricci) - remove this once sanitizer_procmaps_mac has the same + // behavior as sanitizer_procmaps_linux and traverses all memory regions + if (flags()->use_root_regions) { + for (uptr i = 0; i < root_regions->size(); i++) { + ScanRootRegion(frontier, (*root_regions)[i], address, end_address, + info.protection); + } + } + + address = end_address; + } } void DoStopTheWorld(StopTheWorldCallback callback, void *argument) { - CHECK(0 && "unimplemented"); + StopTheWorld(callback, argument); } } // namespace __lsan diff --git a/lib/lsan/lsan_thread.cc b/lib/lsan/lsan_thread.cc index 09eeb9c24982..0ea7a6e97497 100644 --- a/lib/lsan/lsan_thread.cc +++ b/lib/lsan/lsan_thread.cc @@ -77,7 +77,7 @@ u32 ThreadCreate(u32 parent_tid, uptr user_id, bool detached) { /* arg */ nullptr); } -void ThreadStart(u32 tid, uptr os_id) { +void ThreadStart(u32 tid, tid_t os_id) { OnStartedArgs args; uptr stack_size = 0; uptr tls_size = 0; @@ -127,7 +127,7 @@ void EnsureMainThreadIDIsCorrect() { ///// Interface to the common LSan module. 
///// -bool GetThreadRangesLocked(uptr os_id, uptr *stack_begin, uptr *stack_end, +bool GetThreadRangesLocked(tid_t os_id, uptr *stack_begin, uptr *stack_end, uptr *tls_begin, uptr *tls_end, uptr *cache_begin, uptr *cache_end, DTLS **dtls) { ThreadContext *context = static_cast<ThreadContext *>( @@ -143,7 +143,7 @@ bool GetThreadRangesLocked(uptr os_id, uptr *stack_begin, uptr *stack_end, return true; } -void ForEachExtraStackRange(uptr os_id, RangeIteratorCallback callback, +void ForEachExtraStackRange(tid_t os_id, RangeIteratorCallback callback, void *arg) { } diff --git a/lib/lsan/lsan_thread.h b/lib/lsan/lsan_thread.h index 10b7b5796c51..73e080e26f76 100644 --- a/lib/lsan/lsan_thread.h +++ b/lib/lsan/lsan_thread.h @@ -45,7 +45,7 @@ class ThreadContext : public ThreadContextBase { void InitializeThreadRegistry(); -void ThreadStart(u32 tid, uptr os_id); +void ThreadStart(u32 tid, tid_t os_id); void ThreadFinish(); u32 ThreadCreate(u32 tid, uptr uid, bool detached); void ThreadJoin(u32 tid); diff --git a/lib/sanitizer_common/sanitizer_common.cc b/lib/sanitizer_common/sanitizer_common.cc index 3ef366f4f328..471c3ded2115 100644 --- a/lib/sanitizer_common/sanitizer_common.cc +++ b/lib/sanitizer_common/sanitizer_common.cc @@ -284,9 +284,10 @@ void LoadedModule::clear() { } } -void LoadedModule::addAddressRange(uptr beg, uptr end, bool executable) { +void LoadedModule::addAddressRange(uptr beg, uptr end, bool executable, + bool readable) { void *mem = InternalAlloc(sizeof(AddressRange)); - AddressRange *r = new(mem) AddressRange(beg, end, executable); + AddressRange *r = new(mem) AddressRange(beg, end, executable, readable); ranges_.push_back(r); if (executable && end > max_executable_address_) max_executable_address_ = end; diff --git a/lib/sanitizer_common/sanitizer_common.h b/lib/sanitizer_common/sanitizer_common.h index 9d367ca80144..bbe7aebf3279 100644 --- a/lib/sanitizer_common/sanitizer_common.h +++ b/lib/sanitizer_common/sanitizer_common.h @@ -72,7 +72,7 @@ INLINE uptr GetPageSizeCached() { uptr GetMmapGranularity(); uptr GetMaxVirtualAddress(); // Threads -uptr GetTid(); +tid_t GetTid(); uptr GetThreadSelf(); void GetThreadStackTopAndBottom(bool at_initialization, uptr *stack_top, uptr *stack_bottom); @@ -717,7 +717,7 @@ class LoadedModule { void set(const char *module_name, uptr base_address, ModuleArch arch, u8 uuid[kModuleUUIDSize], bool instrumented); void clear(); - void addAddressRange(uptr beg, uptr end, bool executable); + void addAddressRange(uptr beg, uptr end, bool executable, bool readable); bool containsAddress(uptr address) const; const char *full_name() const { return full_name_; } @@ -732,9 +732,14 @@ class LoadedModule { uptr beg; uptr end; bool executable; - - AddressRange(uptr beg, uptr end, bool executable) - : next(nullptr), beg(beg), end(end), executable(executable) {} + bool readable; + + AddressRange(uptr beg, uptr end, bool executable, bool readable) + : next(nullptr), + beg(beg), + end(end), + executable(executable), + readable(readable) {} }; const IntrusiveList<AddressRange> &ranges() const { return ranges_; } diff --git a/lib/sanitizer_common/sanitizer_common_interceptors.inc b/lib/sanitizer_common/sanitizer_common_interceptors.inc index 7b4e6d27df3d..d1c793c551f7 100644 --- a/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -139,12 +139,9 @@ bool PlatformHasDifferentMemcpyAndMemmove(); #define COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED (0) #endif -#define 
COMMON_INTERCEPTOR_READ_STRING_OF_LEN(ctx, s, len, n) \ - COMMON_INTERCEPTOR_READ_RANGE((ctx), (s), \ - common_flags()->strict_string_checks ? (len) + 1 : (n) ) - #define COMMON_INTERCEPTOR_READ_STRING(ctx, s, n) \ - COMMON_INTERCEPTOR_READ_STRING_OF_LEN((ctx), (s), REAL(strlen)(s), (n)) + COMMON_INTERCEPTOR_READ_RANGE((ctx), (s), \ + common_flags()->strict_string_checks ? (REAL(strlen)(s)) + 1 : (n) ) #ifndef COMMON_INTERCEPTOR_ON_DLOPEN #define COMMON_INTERCEPTOR_ON_DLOPEN(filename, flag) \ @@ -450,8 +447,7 @@ static inline void StrstrCheck(void *ctx, char *r, const char *s1, const char *s2) { uptr len1 = REAL(strlen)(s1); uptr len2 = REAL(strlen)(s2); - COMMON_INTERCEPTOR_READ_STRING_OF_LEN(ctx, s1, len1, - r ? r - s1 + len2 : len1 + 1); + COMMON_INTERCEPTOR_READ_STRING(ctx, s1, r ? r - s1 + len2 : len1 + 1); COMMON_INTERCEPTOR_READ_RANGE(ctx, s2, len2 + 1); } #endif @@ -577,10 +573,11 @@ INTERCEPTOR(char*, strchr, const char *s, int c) { return internal_strchr(s, c); COMMON_INTERCEPTOR_ENTER(ctx, strchr, s, c); char *result = REAL(strchr)(s, c); - uptr len = internal_strlen(s); - uptr n = result ? result - s + 1 : len + 1; - if (common_flags()->intercept_strchr) - COMMON_INTERCEPTOR_READ_STRING_OF_LEN(ctx, s, len, n); + if (common_flags()->intercept_strchr) { + // Keep strlen as macro argument, as macro may ignore it. + COMMON_INTERCEPTOR_READ_STRING(ctx, s, + (result ? result - s : REAL(strlen)(s)) + 1); + } return result; } #define INIT_STRCHR COMMON_INTERCEPT_FUNCTION(strchr) @@ -609,9 +606,8 @@ INTERCEPTOR(char*, strrchr, const char *s, int c) { if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) return internal_strrchr(s, c); COMMON_INTERCEPTOR_ENTER(ctx, strrchr, s, c); - uptr len = internal_strlen(s); if (common_flags()->intercept_strchr) - COMMON_INTERCEPTOR_READ_STRING_OF_LEN(ctx, s, len, len + 1); + COMMON_INTERCEPTOR_READ_RANGE(ctx, s, REAL(strlen)(s) + 1); return REAL(strrchr)(s, c); } #define INIT_STRRCHR COMMON_INTERCEPT_FUNCTION(strrchr) diff --git a/lib/sanitizer_common/sanitizer_coverage_interface.inc b/lib/sanitizer_common/sanitizer_coverage_interface.inc index ae691bd9dd27..42b4d3aba01b 100644 --- a/lib/sanitizer_common/sanitizer_coverage_interface.inc +++ b/lib/sanitizer_common/sanitizer_coverage_interface.inc @@ -10,21 +10,13 @@ //===----------------------------------------------------------------------===// INTERFACE_FUNCTION(__sanitizer_cov) INTERFACE_FUNCTION(__sanitizer_cov_dump) -INTERFACE_FUNCTION(__sanitizer_cov_indir_call16) INTERFACE_FUNCTION(__sanitizer_cov_init) INTERFACE_FUNCTION(__sanitizer_cov_module_init) -INTERFACE_FUNCTION(__sanitizer_cov_trace_basic_block) -INTERFACE_FUNCTION(__sanitizer_cov_trace_func_enter) INTERFACE_FUNCTION(__sanitizer_cov_with_check) INTERFACE_FUNCTION(__sanitizer_dump_coverage) INTERFACE_FUNCTION(__sanitizer_dump_trace_pc_guard_coverage) -INTERFACE_FUNCTION(__sanitizer_get_coverage_guards) -INTERFACE_FUNCTION(__sanitizer_get_number_of_counters) -INTERFACE_FUNCTION(__sanitizer_get_total_unique_caller_callee_pairs) INTERFACE_FUNCTION(__sanitizer_get_total_unique_coverage) INTERFACE_FUNCTION(__sanitizer_maybe_open_cov_file) -INTERFACE_FUNCTION(__sanitizer_reset_coverage) -INTERFACE_FUNCTION(__sanitizer_update_counter_bitset_and_clear_counters) INTERFACE_WEAK_FUNCTION(__sancov_default_options) INTERFACE_WEAK_FUNCTION(__sanitizer_cov_trace_cmp) INTERFACE_WEAK_FUNCTION(__sanitizer_cov_trace_cmp1) diff --git a/lib/sanitizer_common/sanitizer_coverage_libcdep.cc b/lib/sanitizer_common/sanitizer_coverage_libcdep.cc index 
e934af3ed975..bb59c344edc2 100644 --- a/lib/sanitizer_common/sanitizer_coverage_libcdep.cc +++ b/lib/sanitizer_common/sanitizer_coverage_libcdep.cc @@ -57,12 +57,6 @@ static const u64 kMagic = SANITIZER_WORDSIZE == 64 ? kMagic64 : kMagic32; static atomic_uint32_t dump_once_guard; // Ensure that CovDump runs only once. static atomic_uintptr_t coverage_counter; -static atomic_uintptr_t caller_callee_counter; - -static void ResetGlobalCounters() { - return atomic_store(&coverage_counter, 0, memory_order_relaxed); - return atomic_store(&caller_callee_counter, 0, memory_order_relaxed); -} // pc_array is the array containing the covered PCs. // To make the pc_array thread- and async-signal-safe it has to be large enough. @@ -90,25 +84,14 @@ class CoverageData { void AfterFork(int child_pid); void Extend(uptr npcs); void Add(uptr pc, u32 *guard); - void IndirCall(uptr caller, uptr callee, uptr callee_cache[], - uptr cache_size); - void DumpCallerCalleePairs(); - void DumpTrace(); void DumpAsBitSet(); - void DumpCounters(); void DumpOffsets(); void DumpAll(); - ALWAYS_INLINE - void TraceBasicBlock(u32 *id); - void InitializeGuardArray(s32 *guards); void InitializeGuards(s32 *guards, uptr n, const char *module_name, uptr caller_pc); - void InitializeCounters(u8 *counters, uptr n); void ReinitializeGuards(); - uptr GetNumberOf8bitCounters(); - uptr Update8bitCounterBitsetAndClearCounters(u8 *bitset); uptr *data(); uptr size() const; @@ -150,37 +133,6 @@ class CoverageData { InternalMmapVectorNoCtor<NamedPcRange> comp_unit_name_vec; InternalMmapVectorNoCtor<NamedPcRange> module_name_vec; - struct CounterAndSize { - u8 *counters; - uptr n; - }; - - InternalMmapVectorNoCtor<CounterAndSize> counters_vec; - uptr num_8bit_counters; - - // Caller-Callee (cc) array, size and current index. - static const uptr kCcArrayMaxSize = FIRST_32_SECOND_64(1 << 18, 1 << 24); - uptr **cc_array; - atomic_uintptr_t cc_array_index; - atomic_uintptr_t cc_array_size; - - // Tracing event array, size and current pointer. - // We record all events (basic block entries) in a global buffer of u32 - // values. Each such value is the index in pc_array. - // So far the tracing is highly experimental: - // - not thread-safe; - // - does not support long traces; - // - not tuned for performance. - // Windows doesn't do overcommit (committed virtual memory costs swap), so - // programs can't reliably map such large amounts of virtual memory. - // TODO(etienneb): Find a way to support coverage of larger executable -static const uptr kTrEventArrayMaxSize = - (SANITIZER_WORDSIZE == 32 || SANITIZER_WINDOWS) ? 1 << 22 : 1 << 30; - u32 *tr_event_array; - uptr tr_event_array_size; - u32 *tr_event_pointer; - static const uptr kTrPcArrayMaxSize = FIRST_32_SECOND_64(1 << 22, 1 << 27); - StaticSpinMutex mu; }; @@ -217,23 +169,6 @@ void CoverageData::Enable() { } else { atomic_store(&pc_array_size, kPcArrayMaxSize, memory_order_relaxed); } - - cc_array = reinterpret_cast<uptr **>(MmapNoReserveOrDie( - sizeof(uptr *) * kCcArrayMaxSize, "CovInit::cc_array")); - atomic_store(&cc_array_size, kCcArrayMaxSize, memory_order_relaxed); - atomic_store(&cc_array_index, 0, memory_order_relaxed); - - // Allocate tr_event_array with a guard page at the end. 
- tr_event_array = reinterpret_cast<u32 *>(MmapNoReserveOrDie( - sizeof(tr_event_array[0]) * kTrEventArrayMaxSize + GetMmapGranularity(), - "CovInit::tr_event_array")); - MprotectNoAccess( - reinterpret_cast<uptr>(&tr_event_array[kTrEventArrayMaxSize]), - GetMmapGranularity()); - tr_event_array_size = kTrEventArrayMaxSize; - tr_event_pointer = tr_event_array; - - num_8bit_counters = 0; } void CoverageData::InitializeGuardArray(s32 *guards) { @@ -251,17 +186,6 @@ void CoverageData::Disable() { UnmapOrDie(pc_array, sizeof(uptr) * kPcArrayMaxSize); pc_array = nullptr; } - if (cc_array) { - UnmapOrDie(cc_array, sizeof(uptr *) * kCcArrayMaxSize); - cc_array = nullptr; - } - if (tr_event_array) { - UnmapOrDie(tr_event_array, - sizeof(tr_event_array[0]) * kTrEventArrayMaxSize + - GetMmapGranularity()); - tr_event_array = nullptr; - tr_event_pointer = nullptr; - } if (pc_fd != kInvalidFd) { CloseFile(pc_fd); pc_fd = kInvalidFd; @@ -341,15 +265,6 @@ void CoverageData::Extend(uptr npcs) { atomic_store(&pc_array_size, size, memory_order_release); } -void CoverageData::InitializeCounters(u8 *counters, uptr n) { - if (!counters) return; - CHECK_EQ(reinterpret_cast<uptr>(counters) % 16, 0); - n = RoundUpTo(n, 16); // The compiler must ensure that counters is 16-aligned. - SpinMutexLock l(&mu); - counters_vec.push_back({counters, n}); - num_8bit_counters += n; -} - void CoverageData::UpdateModuleNameVec(uptr caller_pc, uptr range_beg, uptr range_end) { auto sym = Symbolizer::GetOrInit(); @@ -424,98 +339,6 @@ void CoverageData::Add(uptr pc, u32 *guard) { pc_array[idx] = BundlePcAndCounter(pc, counter); } -// Registers a pair caller=>callee. -// When a given caller is seen for the first time, the callee_cache is added -// to the global array cc_array, callee_cache[0] is set to caller and -// callee_cache[1] is set to cache_size. -// Then we are trying to add callee to callee_cache [2,cache_size) if it is -// not there yet. -// If the cache is full we drop the callee (may want to fix this later). -void CoverageData::IndirCall(uptr caller, uptr callee, uptr callee_cache[], - uptr cache_size) { - if (!cc_array) return; - atomic_uintptr_t *atomic_callee_cache = - reinterpret_cast<atomic_uintptr_t *>(callee_cache); - uptr zero = 0; - if (atomic_compare_exchange_strong(&atomic_callee_cache[0], &zero, caller, - memory_order_seq_cst)) { - uptr idx = atomic_fetch_add(&cc_array_index, 1, memory_order_relaxed); - CHECK_LT(idx * sizeof(uptr), - atomic_load(&cc_array_size, memory_order_acquire)); - callee_cache[1] = cache_size; - cc_array[idx] = callee_cache; - } - CHECK_EQ(atomic_load(&atomic_callee_cache[0], memory_order_relaxed), caller); - for (uptr i = 2; i < cache_size; i++) { - uptr was = 0; - if (atomic_compare_exchange_strong(&atomic_callee_cache[i], &was, callee, - memory_order_seq_cst)) { - atomic_fetch_add(&caller_callee_counter, 1, memory_order_relaxed); - return; - } - if (was == callee) // Already have this callee. - return; - } -} - -uptr CoverageData::GetNumberOf8bitCounters() { - return num_8bit_counters; -} - -// Map every 8bit counter to a 8-bit bitset and clear the counter. -uptr CoverageData::Update8bitCounterBitsetAndClearCounters(u8 *bitset) { - uptr num_new_bits = 0; - uptr cur = 0; - // For better speed we map 8 counters to 8 bytes of bitset at once. 
- static const uptr kBatchSize = 8; - CHECK_EQ(reinterpret_cast<uptr>(bitset) % kBatchSize, 0); - for (uptr i = 0, len = counters_vec.size(); i < len; i++) { - u8 *c = counters_vec[i].counters; - uptr n = counters_vec[i].n; - CHECK_EQ(n % 16, 0); - CHECK_EQ(cur % kBatchSize, 0); - CHECK_EQ(reinterpret_cast<uptr>(c) % kBatchSize, 0); - if (!bitset) { - internal_bzero_aligned16(c, n); - cur += n; - continue; - } - for (uptr j = 0; j < n; j += kBatchSize, cur += kBatchSize) { - CHECK_LT(cur, num_8bit_counters); - u64 *pc64 = reinterpret_cast<u64*>(c + j); - u64 *pb64 = reinterpret_cast<u64*>(bitset + cur); - u64 c64 = *pc64; - u64 old_bits_64 = *pb64; - u64 new_bits_64 = old_bits_64; - if (c64) { - *pc64 = 0; - for (uptr k = 0; k < kBatchSize; k++) { - u64 x = (c64 >> (8 * k)) & 0xff; - if (x) { - u64 bit = 0; - /**/ if (x >= 128) bit = 128; - else if (x >= 32) bit = 64; - else if (x >= 16) bit = 32; - else if (x >= 8) bit = 16; - else if (x >= 4) bit = 8; - else if (x >= 3) bit = 4; - else if (x >= 2) bit = 2; - else if (x >= 1) bit = 1; - u64 mask = bit << (8 * k); - if (!(new_bits_64 & mask)) { - num_new_bits++; - new_bits_64 |= mask; - } - } - } - *pb64 = new_bits_64; - } - } - } - CHECK_EQ(cur, num_8bit_counters); - return num_new_bits; -} - uptr *CoverageData::data() { return pc_array; } @@ -596,132 +419,6 @@ static fd_t CovOpenFile(InternalScopedString *path, bool packed, return fd; } -// Dump trace PCs and trace events into two separate files. -void CoverageData::DumpTrace() { - uptr max_idx = tr_event_pointer - tr_event_array; - if (!max_idx) return; - auto sym = Symbolizer::GetOrInit(); - if (!sym) - return; - InternalScopedString out(32 << 20); - for (uptr i = 0, n = size(); i < n; i++) { - const char *module_name = "<unknown>"; - uptr module_address = 0; - sym->GetModuleNameAndOffsetForPC(UnbundlePc(pc_array[i]), &module_name, - &module_address); - out.append("%s 0x%zx\n", module_name, module_address); - } - InternalScopedString path(kMaxPathLength); - fd_t fd = CovOpenFile(&path, false, "trace-points"); - if (fd == kInvalidFd) return; - WriteToFile(fd, out.data(), out.length()); - CloseFile(fd); - - fd = CovOpenFile(&path, false, "trace-compunits"); - if (fd == kInvalidFd) return; - out.clear(); - for (uptr i = 0; i < comp_unit_name_vec.size(); i++) - out.append("%s\n", comp_unit_name_vec[i].copied_module_name); - WriteToFile(fd, out.data(), out.length()); - CloseFile(fd); - - fd = CovOpenFile(&path, false, "trace-events"); - if (fd == kInvalidFd) return; - uptr bytes_to_write = max_idx * sizeof(tr_event_array[0]); - u8 *event_bytes = reinterpret_cast<u8*>(tr_event_array); - // The trace file could be huge, and may not be written with a single syscall. - while (bytes_to_write) { - uptr actually_written; - if (WriteToFile(fd, event_bytes, bytes_to_write, &actually_written) && - actually_written <= bytes_to_write) { - bytes_to_write -= actually_written; - event_bytes += actually_written; - } else { - break; - } - } - CloseFile(fd); - VReport(1, " CovDump: Trace: %zd PCs written\n", size()); - VReport(1, " CovDump: Trace: %zd Events written\n", max_idx); -} - -// This function dumps the caller=>callee pairs into a file as a sequence of -// lines like "module_name offset". 
-void CoverageData::DumpCallerCalleePairs() { - uptr max_idx = atomic_load(&cc_array_index, memory_order_relaxed); - if (!max_idx) return; - auto sym = Symbolizer::GetOrInit(); - if (!sym) - return; - InternalScopedString out(32 << 20); - uptr total = 0; - for (uptr i = 0; i < max_idx; i++) { - uptr *cc_cache = cc_array[i]; - CHECK(cc_cache); - uptr caller = cc_cache[0]; - uptr n_callees = cc_cache[1]; - const char *caller_module_name = "<unknown>"; - uptr caller_module_address = 0; - sym->GetModuleNameAndOffsetForPC(caller, &caller_module_name, - &caller_module_address); - for (uptr j = 2; j < n_callees; j++) { - uptr callee = cc_cache[j]; - if (!callee) break; - total++; - const char *callee_module_name = "<unknown>"; - uptr callee_module_address = 0; - sym->GetModuleNameAndOffsetForPC(callee, &callee_module_name, - &callee_module_address); - out.append("%s 0x%zx\n%s 0x%zx\n", caller_module_name, - caller_module_address, callee_module_name, - callee_module_address); - } - } - InternalScopedString path(kMaxPathLength); - fd_t fd = CovOpenFile(&path, false, "caller-callee"); - if (fd == kInvalidFd) return; - WriteToFile(fd, out.data(), out.length()); - CloseFile(fd); - VReport(1, " CovDump: %zd caller-callee pairs written\n", total); -} - -// Record the current PC into the event buffer. -// Every event is a u32 value (index in tr_pc_array_index) so we compute -// it once and then cache in the provided 'cache' storage. -// -// This function will eventually be inlined by the compiler. -void CoverageData::TraceBasicBlock(u32 *id) { - // Will trap here if - // 1. coverage is not enabled at run-time. - // 2. The array tr_event_array is full. - *tr_event_pointer = *id - 1; - tr_event_pointer++; -} - -void CoverageData::DumpCounters() { - if (!common_flags()->coverage_counters) return; - uptr n = coverage_data.GetNumberOf8bitCounters(); - if (!n) return; - InternalScopedBuffer<u8> bitset(n); - coverage_data.Update8bitCounterBitsetAndClearCounters(bitset.data()); - InternalScopedString path(kMaxPathLength); - - for (uptr m = 0; m < module_name_vec.size(); m++) { - auto r = module_name_vec[m]; - CHECK(r.copied_module_name); - CHECK_LE(r.beg, r.end); - CHECK_LE(r.end, size()); - const char *base_name = StripModuleName(r.copied_module_name); - fd_t fd = - CovOpenFile(&path, /* packed */ false, base_name, "counters-sancov"); - if (fd == kInvalidFd) return; - WriteToFile(fd, bitset.data() + r.beg, r.end - r.beg); - CloseFile(fd); - VReport(1, " CovDump: %zd counters written for '%s'\n", r.end - r.beg, - base_name); - } -} - void CoverageData::DumpAsBitSet() { if (!common_flags()->coverage_bitset) return; if (!size()) return; @@ -869,10 +566,7 @@ void CoverageData::DumpAll() { if (atomic_fetch_add(&dump_once_guard, 1, memory_order_relaxed)) return; DumpAsBitSet(); - DumpCounters(); - DumpTrace(); DumpOffsets(); - DumpCallerCalleePairs(); } void CovPrepareForSandboxing(__sanitizer_sandbox_arguments *args) { @@ -946,11 +640,6 @@ SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_cov_with_check(u32 *guard) { coverage_data.Add(StackTrace::GetPreviousInstructionPc(GET_CALLER_PC()), guard); } -SANITIZER_INTERFACE_ATTRIBUTE void -__sanitizer_cov_indir_call16(uptr callee, uptr callee_cache16[]) { - coverage_data.IndirCall(StackTrace::GetPreviousInstructionPc(GET_CALLER_PC()), - callee, callee_cache16, 16); -} SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_cov_init() { coverage_enabled = true; coverage_dir = common_flags()->coverage_dir; @@ -964,7 +653,6 @@ SANITIZER_INTERFACE_ATTRIBUTE void 
__sanitizer_cov_module_init(s32 *guards, uptr npcs, u8 *counters, const char *comp_unit_name) { coverage_data.InitializeGuards(guards, npcs, comp_unit_name, GET_CALLER_PC()); - coverage_data.InitializeCounters(counters, npcs); if (!common_flags()->coverage_direct) return; if (SANITIZER_ANDROID && coverage_enabled) { // dlopen/dlclose interceptors do not work on Android, so we rely on @@ -982,45 +670,6 @@ uptr __sanitizer_get_total_unique_coverage() { return atomic_load(&coverage_counter, memory_order_relaxed); } -SANITIZER_INTERFACE_ATTRIBUTE -uptr __sanitizer_get_total_unique_caller_callee_pairs() { - return atomic_load(&caller_callee_counter, memory_order_relaxed); -} - -SANITIZER_INTERFACE_ATTRIBUTE -void __sanitizer_cov_trace_func_enter(u32 *id) { - __sanitizer_cov_with_check(id); - coverage_data.TraceBasicBlock(id); -} -SANITIZER_INTERFACE_ATTRIBUTE -void __sanitizer_cov_trace_basic_block(u32 *id) { - __sanitizer_cov_with_check(id); - coverage_data.TraceBasicBlock(id); -} -SANITIZER_INTERFACE_ATTRIBUTE -void __sanitizer_reset_coverage() { - ResetGlobalCounters(); - coverage_data.ReinitializeGuards(); - internal_bzero_aligned16( - coverage_data.data(), - RoundUpTo(coverage_data.size() * sizeof(coverage_data.data()[0]), 16)); -} -SANITIZER_INTERFACE_ATTRIBUTE -uptr __sanitizer_get_coverage_guards(uptr **data) { - *data = coverage_data.data(); - return coverage_data.size(); -} - -SANITIZER_INTERFACE_ATTRIBUTE -uptr __sanitizer_get_number_of_counters() { - return coverage_data.GetNumberOf8bitCounters(); -} - -SANITIZER_INTERFACE_ATTRIBUTE -uptr __sanitizer_update_counter_bitset_and_clear_counters(u8 *bitset) { - return coverage_data.Update8bitCounterBitsetAndClearCounters(bitset); -} - // Default empty implementations (weak). Users should redefine them. SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_cov_trace_cmp, void) {} SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_cov_trace_cmp1, void) {} diff --git a/lib/sanitizer_common/sanitizer_internal_defs.h b/lib/sanitizer_common/sanitizer_internal_defs.h index ea5022e31bc3..f35b095ee94e 100644 --- a/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/lib/sanitizer_common/sanitizer_internal_defs.h @@ -152,6 +152,12 @@ typedef u32 operator_new_size_type; # endif #endif +#if SANITIZER_MAC +// On Darwin, thread IDs are 64-bit even on 32-bit systems. +typedef u64 tid_t; +#else +typedef uptr tid_t; +#endif // ----------- ATTENTION ------------- // This header should NOT include any other headers to avoid portability issues. 
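The `tid_t` typedef above exists because Darwin's `pthread_threadid_np()` reports a 64-bit ID even on 32-bit targets, so the previous `uptr`-based storage could truncate it. A small sketch of the difference, where the 32-bit `uptr` stand-in is an assumption for illustration:

```cpp
#include <cstdint>
#include <cstdio>

typedef uint32_t uptr32;  // uptr on a hypothetical 32-bit target (assumption)
typedef uint64_t tid_t;   // the Darwin definition from the hunk above

int main() {
  uint64_t darwin_tid = 0x100002A2AULL;             // a thread ID above 2^32
  uptr32 old_id = static_cast<uptr32>(darwin_tid);  // old storage: truncated
  tid_t new_id = darwin_tid;                        // new storage: intact
  printf("as uptr: 0x%x, as tid_t: 0x%llx\n", old_id,
         (unsigned long long)new_id);
}
```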
diff --git a/lib/sanitizer_common/sanitizer_linux.cc b/lib/sanitizer_common/sanitizer_linux.cc index 0b5473d95336..fce78fe590d5 100644 --- a/lib/sanitizer_common/sanitizer_linux.cc +++ b/lib/sanitizer_common/sanitizer_linux.cc @@ -384,7 +384,7 @@ bool FileExists(const char *filename) { return S_ISREG(st.st_mode); } -uptr GetTid() { +tid_t GetTid() { #if SANITIZER_FREEBSD return (uptr)pthread_self(); #else diff --git a/lib/sanitizer_common/sanitizer_linux_libcdep.cc b/lib/sanitizer_common/sanitizer_linux_libcdep.cc index 6fde671f882d..25f1e12c0374 100644 --- a/lib/sanitizer_common/sanitizer_linux_libcdep.cc +++ b/lib/sanitizer_common/sanitizer_linux_libcdep.cc @@ -447,7 +447,9 @@ static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *arg) { uptr cur_beg = info->dlpi_addr + phdr->p_vaddr; uptr cur_end = cur_beg + phdr->p_memsz; bool executable = phdr->p_flags & PF_X; - cur_module.addAddressRange(cur_beg, cur_end, executable); + bool readable = phdr->p_flags & PF_R; + cur_module.addAddressRange(cur_beg, cur_end, executable, + readable); } } data->modules->push_back(cur_module); diff --git a/lib/sanitizer_common/sanitizer_mac.cc b/lib/sanitizer_common/sanitizer_mac.cc index 34af4e91876e..2f990b805ff9 100644 --- a/lib/sanitizer_common/sanitizer_mac.cc +++ b/lib/sanitizer_common/sanitizer_mac.cc @@ -252,9 +252,8 @@ bool FileExists(const char *filename) { return S_ISREG(st.st_mode); } -uptr GetTid() { - // FIXME: This can potentially get truncated on 32-bit, where uptr is 4 bytes. - uint64_t tid; +tid_t GetTid() { + tid_t tid; pthread_threadid_np(nullptr, &tid); return tid; } diff --git a/lib/sanitizer_common/sanitizer_platform.h b/lib/sanitizer_common/sanitizer_platform.h index 1a6410878579..49732aa32323 100644 --- a/lib/sanitizer_common/sanitizer_platform.h +++ b/lib/sanitizer_common/sanitizer_platform.h @@ -259,4 +259,15 @@ # define SANITIZER_GO 0 #endif +// On PowerPC and ARM Thumb, calling pthread_exit() causes LSan to detect leaks. +// pthread_exit() performs unwinding that leads to dlopen'ing libgcc_s.so. +// dlopen mallocs "libgcc_s.so" string which confuses LSan, it fails to realize +// that this allocation happens in dynamic linker and should be ignored. +#if SANITIZER_PPC || defined(__thumb__) +# define SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT 1 +#else +# define SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT 0 +#endif + + #endif // SANITIZER_PLATFORM_H diff --git a/lib/sanitizer_common/sanitizer_procmaps_common.cc b/lib/sanitizer_common/sanitizer_procmaps_common.cc index fac3fbdad07a..67a659010aaf 100644 --- a/lib/sanitizer_common/sanitizer_procmaps_common.cc +++ b/lib/sanitizer_common/sanitizer_procmaps_common.cc @@ -141,7 +141,8 @@ void MemoryMappingLayout::DumpListOfModules( uptr base_address = (i ? 
cur_beg : 0) - cur_offset; LoadedModule cur_module; cur_module.set(cur_name, base_address); - cur_module.addAddressRange(cur_beg, cur_end, prot & kProtectionExecute); + cur_module.addAddressRange(cur_beg, cur_end, prot & kProtectionExecute, + prot & kProtectionRead); modules->push_back(cur_module); } } diff --git a/lib/sanitizer_common/sanitizer_procmaps_mac.cc b/lib/sanitizer_common/sanitizer_procmaps_mac.cc index 2831f286932f..be59b481f5a1 100644 --- a/lib/sanitizer_common/sanitizer_procmaps_mac.cc +++ b/lib/sanitizer_common/sanitizer_procmaps_mac.cc @@ -262,7 +262,8 @@ void MemoryMappingLayout::DumpListOfModules( cur_module->set(cur_name, cur_beg, cur_arch, cur_uuid, current_instrumented_); } - cur_module->addAddressRange(cur_beg, cur_end, prot & kProtectionExecute); + cur_module->addAddressRange(cur_beg, cur_end, prot & kProtectionExecute, + prot & kProtectionRead); } } diff --git a/lib/sanitizer_common/sanitizer_stoptheworld.h b/lib/sanitizer_common/sanitizer_stoptheworld.h index 41752d8f66e7..20b49ae78b85 100644 --- a/lib/sanitizer_common/sanitizer_stoptheworld.h +++ b/lib/sanitizer_common/sanitizer_stoptheworld.h @@ -18,7 +18,6 @@ #include "sanitizer_common.h" namespace __sanitizer { -typedef int SuspendedThreadID; enum PtraceRegistersStatus { REGISTERS_UNAVAILABLE_FATAL = -1, @@ -30,31 +29,21 @@ enum PtraceRegistersStatus { // register contexts. class SuspendedThreadsList { public: - SuspendedThreadsList() - : thread_ids_(1024) {} - SuspendedThreadID GetThreadID(uptr index) const { - CHECK_LT(index, thread_ids_.size()); - return thread_ids_[index]; + SuspendedThreadsList() = default; + + // Can't declare pure virtual functions in sanitizer runtimes: + // __cxa_pure_virtual might be unavailable. Use UNIMPLEMENTED() instead. + virtual PtraceRegistersStatus GetRegistersAndSP(uptr index, uptr *buffer, + uptr *sp) const { + UNIMPLEMENTED(); } - PtraceRegistersStatus GetRegistersAndSP(uptr index, uptr *buffer, - uptr *sp) const; + // The buffer in GetRegistersAndSP should be at least this big. - static uptr RegisterCount(); - uptr thread_count() const { return thread_ids_.size(); } - bool Contains(SuspendedThreadID thread_id) const { - for (uptr i = 0; i < thread_ids_.size(); i++) { - if (thread_ids_[i] == thread_id) - return true; - } - return false; - } - void Append(SuspendedThreadID thread_id) { - thread_ids_.push_back(thread_id); - } + virtual uptr RegisterCount() const { UNIMPLEMENTED(); } + virtual uptr ThreadCount() const { UNIMPLEMENTED(); } + virtual tid_t GetThreadID(uptr index) const { UNIMPLEMENTED(); } private: - InternalMmapVector<SuspendedThreadID> thread_ids_; - // Prohibit copy and assign. 
SuspendedThreadsList(const SuspendedThreadsList&); void operator=(const SuspendedThreadsList&); diff --git a/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc b/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc index 6e4baeecaffd..03f73ae88308 100644 --- a/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc +++ b/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc @@ -32,17 +32,13 @@ #include <sys/types.h> // for pid_t #include <sys/uio.h> // for iovec #include <elf.h> // for NT_PRSTATUS -#if SANITIZER_ANDROID && defined(__arm__) -# include <linux/user.h> // for pt_regs -#else -# ifdef __aarch64__ +#if defined(__aarch64__) && !SANITIZER_ANDROID // GLIBC 2.20+ sys/user does not include asm/ptrace.h -# include <asm/ptrace.h> -# endif -# include <sys/user.h> // for user_regs_struct -# if SANITIZER_ANDROID && SANITIZER_MIPS -# include <asm/reg.h> // for mips SP register in sys/user.h -# endif +# include <asm/ptrace.h> +#endif +#include <sys/user.h> // for user_regs_struct +#if SANITIZER_ANDROID && SANITIZER_MIPS +# include <asm/reg.h> // for mips SP register in sys/user.h #endif #include <sys/wait.h> // for signal-related stuff @@ -82,7 +78,22 @@ namespace __sanitizer { -COMPILER_CHECK(sizeof(SuspendedThreadID) == sizeof(pid_t)); +class SuspendedThreadsListLinux : public SuspendedThreadsList { + public: + SuspendedThreadsListLinux() : thread_ids_(1024) {} + + tid_t GetThreadID(uptr index) const; + uptr ThreadCount() const; + bool ContainsTid(tid_t thread_id) const; + void Append(tid_t tid); + + PtraceRegistersStatus GetRegistersAndSP(uptr index, uptr *buffer, + uptr *sp) const; + uptr RegisterCount() const; + + private: + InternalMmapVector<tid_t> thread_ids_; +}; // Structure for passing arguments into the tracer thread. struct TracerThreadArgument { @@ -107,31 +118,31 @@ class ThreadSuspender { bool SuspendAllThreads(); void ResumeAllThreads(); void KillAllThreads(); - SuspendedThreadsList &suspended_threads_list() { + SuspendedThreadsListLinux &suspended_threads_list() { return suspended_threads_list_; } TracerThreadArgument *arg; private: - SuspendedThreadsList suspended_threads_list_; + SuspendedThreadsListLinux suspended_threads_list_; pid_t pid_; - bool SuspendThread(SuspendedThreadID thread_id); + bool SuspendThread(tid_t thread_id); }; -bool ThreadSuspender::SuspendThread(SuspendedThreadID tid) { +bool ThreadSuspender::SuspendThread(tid_t tid) { // Are we already attached to this thread? // Currently this check takes linear time, however the number of threads is // usually small. - if (suspended_threads_list_.Contains(tid)) - return false; + if (suspended_threads_list_.ContainsTid(tid)) return false; int pterrno; if (internal_iserror(internal_ptrace(PTRACE_ATTACH, tid, nullptr, nullptr), &pterrno)) { // Either the thread is dead, or something prevented us from attaching. // Log this event and move on. - VReport(1, "Could not attach to thread %d (errno %d).\n", tid, pterrno); + VReport(1, "Could not attach to thread %zu (errno %d).\n", (uptr)tid, + pterrno); return false; } else { - VReport(2, "Attached to thread %d.\n", tid); + VReport(2, "Attached to thread %zu.\n", (uptr)tid); // The thread is not guaranteed to stop before ptrace returns, so we must // wait on it. Note: if the thread receives a signal concurrently, // we can get notification about the signal before notification about stop. @@ -149,8 +160,8 @@ bool ThreadSuspender::SuspendThread(SuspendedThreadID tid) { if (internal_iserror(waitpid_status, &wperrno)) { // Got a ECHILD error. 
I don't think this situation is possible, but it // doesn't hurt to report it. - VReport(1, "Waiting on thread %d failed, detaching (errno %d).\n", - tid, wperrno); + VReport(1, "Waiting on thread %zu failed, detaching (errno %d).\n", + (uptr)tid, wperrno); internal_ptrace(PTRACE_DETACH, tid, nullptr, nullptr); return false; } @@ -167,7 +178,7 @@ bool ThreadSuspender::SuspendThread(SuspendedThreadID tid) { } void ThreadSuspender::ResumeAllThreads() { - for (uptr i = 0; i < suspended_threads_list_.thread_count(); i++) { + for (uptr i = 0; i < suspended_threads_list_.ThreadCount(); i++) { pid_t tid = suspended_threads_list_.GetThreadID(i); int pterrno; if (!internal_iserror(internal_ptrace(PTRACE_DETACH, tid, nullptr, nullptr), @@ -183,7 +194,7 @@ void ThreadSuspender::ResumeAllThreads() { } void ThreadSuspender::KillAllThreads() { - for (uptr i = 0; i < suspended_threads_list_.thread_count(); i++) + for (uptr i = 0; i < suspended_threads_list_.ThreadCount(); i++) internal_ptrace(PTRACE_KILL, suspended_threads_list_.GetThreadID(i), nullptr, nullptr); } @@ -494,9 +505,28 @@ typedef _user_regs_struct regs_struct; #error "Unsupported architecture" #endif // SANITIZER_ANDROID && defined(__arm__) -PtraceRegistersStatus SuspendedThreadsList::GetRegistersAndSP(uptr index, - uptr *buffer, - uptr *sp) const { +tid_t SuspendedThreadsListLinux::GetThreadID(uptr index) const { + CHECK_LT(index, thread_ids_.size()); + return thread_ids_[index]; +} + +uptr SuspendedThreadsListLinux::ThreadCount() const { + return thread_ids_.size(); +} + +bool SuspendedThreadsListLinux::ContainsTid(tid_t thread_id) const { + for (uptr i = 0; i < thread_ids_.size(); i++) { + if (thread_ids_[i] == thread_id) return true; + } + return false; +} + +void SuspendedThreadsListLinux::Append(tid_t tid) { + thread_ids_.push_back(tid); +} + +PtraceRegistersStatus SuspendedThreadsListLinux::GetRegistersAndSP( + uptr index, uptr *buffer, uptr *sp) const { pid_t tid = GetThreadID(index); regs_struct regs; int pterrno; @@ -526,7 +556,7 @@ PtraceRegistersStatus SuspendedThreadsList::GetRegistersAndSP(uptr index, return REGISTERS_AVAILABLE; } -uptr SuspendedThreadsList::RegisterCount() { +uptr SuspendedThreadsListLinux::RegisterCount() const { return sizeof(regs_struct) / sizeof(uptr); } } // namespace __sanitizer diff --git a/lib/sanitizer_common/sanitizer_stoptheworld_mac.cc b/lib/sanitizer_common/sanitizer_stoptheworld_mac.cc index 047472a657a6..20b8760935bd 100644 --- a/lib/sanitizer_common/sanitizer_stoptheworld_mac.cc +++ b/lib/sanitizer_common/sanitizer_stoptheworld_mac.cc @@ -14,27 +14,169 @@ #include "sanitizer_platform.h" #if SANITIZER_MAC && (defined(__x86_64__) || defined(__aarch64__) || \ - defined(__mips64) || defined(__i386)) + defined(__i386)) + +#include <mach/mach.h> +#include <mach/thread_info.h> +#include <pthread.h> #include "sanitizer_stoptheworld.h" namespace __sanitizer { +typedef struct { + tid_t tid; + thread_t thread; +} SuspendedThreadInfo; + +class SuspendedThreadsListMac : public SuspendedThreadsList { + public: + SuspendedThreadsListMac() : threads_(1024) {} + + tid_t GetThreadID(uptr index) const; + thread_t GetThread(uptr index) const; + uptr ThreadCount() const; + bool ContainsThread(thread_t thread) const; + void Append(thread_t thread); + + PtraceRegistersStatus GetRegistersAndSP(uptr index, uptr *buffer, + uptr *sp) const; + uptr RegisterCount() const; + + private: + InternalMmapVector<SuspendedThreadInfo> threads_; +}; + +struct RunThreadArgs { + StopTheWorldCallback callback; + void *argument; +}; + 
+void RunThread(void *arg) { + struct RunThreadArgs *run_args = (struct RunThreadArgs *)arg; + SuspendedThreadsListMac suspended_threads_list; + + mach_port_t task; + kern_return_t err = task_for_pid(mach_task_self(), internal_getpid(), &task); + if (err != KERN_SUCCESS) { + VReport(1, "Getting task from pid failed (errno %d).\n", err); + return; + } + + thread_array_t threads; + mach_msg_type_number_t num_threads; + + err = task_threads(task, &threads, &num_threads); + if (err != KERN_SUCCESS) { + VReport(1, "Failed to get threads for task (errno %d).\n", err); + return; + } + + thread_t thread_self = mach_thread_self(); + for (unsigned int i = 0; i < num_threads; ++i) { + if (threads[i] == thread_self) continue; + + thread_suspend(threads[i]); + suspended_threads_list.Append(threads[i]); + } + + run_args->callback(suspended_threads_list, run_args->argument); + + uptr num_suspended = suspended_threads_list.ThreadCount(); + for (unsigned int i = 0; i < num_suspended; ++i) { + thread_resume(suspended_threads_list.GetThread(i)); + } +} + void StopTheWorld(StopTheWorldCallback callback, void *argument) { - CHECK(0 && "unimplemented"); + struct RunThreadArgs arg = {callback, argument}; + pthread_t run_thread = (pthread_t)internal_start_thread(RunThread, &arg); + internal_join_thread(run_thread); } -PtraceRegistersStatus SuspendedThreadsList::GetRegistersAndSP(uptr index, - uptr *buffer, - uptr *sp) const { - CHECK(0 && "unimplemented"); - return REGISTERS_UNAVAILABLE_FATAL; +#if defined(__x86_64__) +typedef x86_thread_state64_t regs_struct; + +#define SP_REG __rsp + +#elif defined(__aarch64__) +typedef arm_thread_state64_t regs_struct; + +# if __DARWIN_UNIX03 +# define SP_REG __sp +# else +# define SP_REG sp +# endif + +#elif defined(__i386) +typedef x86_thread_state32_t regs_struct; + +#define SP_REG __esp + +#else +#error "Unsupported architecture" +#endif + +tid_t SuspendedThreadsListMac::GetThreadID(uptr index) const { + CHECK_LT(index, threads_.size()); + return threads_[index].tid; +} + +thread_t SuspendedThreadsListMac::GetThread(uptr index) const { + CHECK_LT(index, threads_.size()); + return threads_[index].thread; +} + +uptr SuspendedThreadsListMac::ThreadCount() const { + return threads_.size(); +} + +bool SuspendedThreadsListMac::ContainsThread(thread_t thread) const { + for (uptr i = 0; i < threads_.size(); i++) { + if (threads_[i].thread == thread) return true; + } + return false; +} + +void SuspendedThreadsListMac::Append(thread_t thread) { + thread_identifier_info_data_t info; + mach_msg_type_number_t info_count = THREAD_IDENTIFIER_INFO_COUNT; + kern_return_t err = thread_info(thread, THREAD_IDENTIFIER_INFO, + (thread_info_t)&info, &info_count); + if (err != KERN_SUCCESS) { + VReport(1, "Error - unable to get thread ident for a thread\n"); + return; + } + threads_.push_back({info.thread_id, thread}); +} + +PtraceRegistersStatus SuspendedThreadsListMac::GetRegistersAndSP( + uptr index, uptr *buffer, uptr *sp) const { + thread_t thread = GetThread(index); + regs_struct regs; + int err; + mach_msg_type_number_t reg_count = MACHINE_THREAD_STATE_COUNT; + err = thread_get_state(thread, MACHINE_THREAD_STATE, (thread_state_t)®s, + ®_count); + if (err != KERN_SUCCESS) { + VReport(1, "Error - unable to get registers for a thread\n"); + // KERN_INVALID_ARGUMENT indicates that either the flavor is invalid, + // or the thread does not exist. The other possible error case, + // MIG_ARRAY_TOO_LARGE, means that the state is too large, but it's + // still safe to proceed. 
+ return err == KERN_INVALID_ARGUMENT ? REGISTERS_UNAVAILABLE_FATAL + : REGISTERS_UNAVAILABLE; + } + + internal_memcpy(buffer, ®s, sizeof(regs)); + *sp = regs.SP_REG; + + return REGISTERS_AVAILABLE; } -uptr SuspendedThreadsList::RegisterCount() { - CHECK(0 && "unimplemented"); - return 0; +uptr SuspendedThreadsListMac::RegisterCount() const { + return MACHINE_THREAD_STATE_COUNT; } } // namespace __sanitizer #endif // SANITIZER_MAC && (defined(__x86_64__) || defined(__aarch64__)) || - // defined(__mips64) || defined(__i386)) + // defined(__i386)) diff --git a/lib/sanitizer_common/sanitizer_thread_registry.cc b/lib/sanitizer_common/sanitizer_thread_registry.cc index c5b2e0946282..439e33a08d01 100644 --- a/lib/sanitizer_common/sanitizer_thread_registry.cc +++ b/lib/sanitizer_common/sanitizer_thread_registry.cc @@ -59,7 +59,8 @@ void ThreadContextBase::SetFinished() { OnFinished(); } -void ThreadContextBase::SetStarted(uptr _os_id, bool _workerthread, void *arg) { +void ThreadContextBase::SetStarted(tid_t _os_id, bool _workerthread, + void *arg) { status = ThreadStatusRunning; os_id = _os_id; workerthread = _workerthread; @@ -193,7 +194,7 @@ static bool FindThreadContextByOsIdCallback(ThreadContextBase *tctx, tctx->status != ThreadStatusDead); } -ThreadContextBase *ThreadRegistry::FindThreadContextByOsIDLocked(uptr os_id) { +ThreadContextBase *ThreadRegistry::FindThreadContextByOsIDLocked(tid_t os_id) { return FindThreadContextLocked(FindThreadContextByOsIdCallback, (void *)os_id); } @@ -267,7 +268,7 @@ void ThreadRegistry::FinishThread(u32 tid) { } } -void ThreadRegistry::StartThread(u32 tid, uptr os_id, bool workerthread, +void ThreadRegistry::StartThread(u32 tid, tid_t os_id, bool workerthread, void *arg) { BlockingMutexLock l(&mtx_); running_threads_++; diff --git a/lib/sanitizer_common/sanitizer_thread_registry.h b/lib/sanitizer_common/sanitizer_thread_registry.h index 17b1d5d90962..9aae875c7360 100644 --- a/lib/sanitizer_common/sanitizer_thread_registry.h +++ b/lib/sanitizer_common/sanitizer_thread_registry.h @@ -39,7 +39,7 @@ class ThreadContextBase { const u32 tid; // Thread ID. Main thread should have tid = 0. u64 unique_id; // Unique thread ID. u32 reuse_count; // Number of times this tid was reused. - uptr os_id; // PID (used for reporting). + tid_t os_id; // PID (used for reporting). uptr user_id; // Some opaque user thread id (e.g. pthread_t). char name[64]; // As annotated by user. @@ -55,7 +55,7 @@ class ThreadContextBase { void SetDead(); void SetJoined(void *arg); void SetFinished(); - void SetStarted(uptr _os_id, bool _workerthread, void *arg); + void SetStarted(tid_t _os_id, bool _workerthread, void *arg); void SetCreated(uptr _user_id, u64 _unique_id, bool _detached, u32 _parent_tid, void *arg); void Reset(); @@ -109,14 +109,14 @@ class ThreadRegistry { // is found. 
ThreadContextBase *FindThreadContextLocked(FindThreadCallback cb, void *arg); - ThreadContextBase *FindThreadContextByOsIDLocked(uptr os_id); + ThreadContextBase *FindThreadContextByOsIDLocked(tid_t os_id); void SetThreadName(u32 tid, const char *name); void SetThreadNameByUserId(uptr user_id, const char *name); void DetachThread(u32 tid, void *arg); void JoinThread(u32 tid, void *arg); void FinishThread(u32 tid); - void StartThread(u32 tid, uptr os_id, bool workerthread, void *arg); + void StartThread(u32 tid, tid_t os_id, bool workerthread, void *arg); private: const ThreadContextFactory context_factory_; diff --git a/lib/sanitizer_common/sanitizer_win.cc b/lib/sanitizer_common/sanitizer_win.cc index b1a2a53a3fbf..1a454ba42c8e 100644 --- a/lib/sanitizer_common/sanitizer_win.cc +++ b/lib/sanitizer_common/sanitizer_win.cc @@ -80,7 +80,7 @@ uptr internal_getpid() { // In contrast to POSIX, on Windows GetCurrentThreadId() // returns a system-unique identifier. -uptr GetTid() { +tid_t GetTid() { return GetCurrentThreadId(); } @@ -553,7 +553,8 @@ void ListOfModules::init() { LoadedModule cur_module; cur_module.set(module_name, adjusted_base); // We add the whole module as one single address range. - cur_module.addAddressRange(base_address, end_address, /*executable*/ true); + cur_module.addAddressRange(base_address, end_address, /*executable*/ true, + /*readable*/ true); modules_.push_back(cur_module); } UnmapOrDie(hmodules, modules_buffer_size); diff --git a/lib/scudo/scudo_allocator.cpp b/lib/scudo/scudo_allocator.cpp index dab6abedcb3e..9812fc0f59f8 100644 --- a/lib/scudo/scudo_allocator.cpp +++ b/lib/scudo/scudo_allocator.cpp @@ -22,8 +22,7 @@ #include <limits.h> #include <pthread.h> - -#include <cstring> +#include <string.h> namespace __scudo { @@ -60,9 +59,9 @@ typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 0, SizeClassMap, typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache; typedef ScudoLargeMmapAllocator SecondaryAllocator; typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator> - ScudoAllocator; + ScudoBackendAllocator; -static ScudoAllocator &getAllocator(); +static ScudoBackendAllocator &getBackendAllocator(); static thread_local Xorshift128Plus Prng; // Global static cookie, initialized at start-up. @@ -101,9 +100,10 @@ struct ScudoChunk : UnpackedHeader { // Returns the usable size for a chunk, meaning the amount of bytes from the // beginning of the user data to the end of the backend allocated chunk. uptr getUsableSize(UnpackedHeader *Header) { - uptr Size = getAllocator().GetActuallyAllocatedSize(getAllocBeg(Header)); + uptr Size = getBackendAllocator().GetActuallyAllocatedSize( + getAllocBeg(Header)); if (Size == 0) - return Size; + return 0; return Size - AlignedChunkHeaderSize - (Header->Offset << MinAlignmentLog); } @@ -120,7 +120,8 @@ struct ScudoChunk : UnpackedHeader { return static_cast<u16>(Crc); } - // Checks the validity of a chunk by verifying its checksum. + // Checks the validity of a chunk by verifying its checksum. It doesn't + // incur termination in the event of an invalid chunk. bool isValid() { UnpackedHeader NewUnpackedHeader; const AtomicPackedHeader *AtomicHeader = @@ -130,13 +131,27 @@ struct ScudoChunk : UnpackedHeader { return (NewUnpackedHeader.Checksum == computeChecksum(&NewUnpackedHeader)); } + // Nulls out a chunk header. When returning the chunk to the backend, there + // is no need to store a valid ChunkAvailable header, as this would be + // computationally expensive. 
Zeroing out serves the same purpose by making + // the header invalid. In the extremely rare event where 0 would be a valid + // checksum for the chunk, the state of the chunk is ChunkAvailable anyway. + COMPILER_CHECK(ChunkAvailable == 0); + void eraseHeader() { + PackedHeader NullPackedHeader = 0; + AtomicPackedHeader *AtomicHeader = + reinterpret_cast<AtomicPackedHeader *>(this); + atomic_store_relaxed(AtomicHeader, NullPackedHeader); + } + // Loads and unpacks the header, verifying the checksum in the process. void loadHeader(UnpackedHeader *NewUnpackedHeader) const { const AtomicPackedHeader *AtomicHeader = reinterpret_cast<const AtomicPackedHeader *>(this); PackedHeader NewPackedHeader = atomic_load_relaxed(AtomicHeader); *NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader); - if (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader)) { + if (UNLIKELY(NewUnpackedHeader->Checksum != + computeChecksum(NewUnpackedHeader))) { dieWithMessage("ERROR: corrupted chunk header at address %p\n", this); } } @@ -160,15 +175,19 @@ struct ScudoChunk : UnpackedHeader { PackedHeader OldPackedHeader = bit_cast<PackedHeader>(*OldUnpackedHeader); AtomicPackedHeader *AtomicHeader = reinterpret_cast<AtomicPackedHeader *>(this); - if (!atomic_compare_exchange_strong(AtomicHeader, - &OldPackedHeader, - NewPackedHeader, - memory_order_relaxed)) { + if (UNLIKELY(!atomic_compare_exchange_strong(AtomicHeader, + &OldPackedHeader, + NewPackedHeader, + memory_order_relaxed))) { dieWithMessage("ERROR: race on chunk header at address %p\n", this); } } }; +ScudoChunk *getScudoChunk(uptr UserBeg) { + return reinterpret_cast<ScudoChunk *>(UserBeg - AlignedChunkHeaderSize); +} + static bool ScudoInitIsRunning = false; static pthread_once_t GlobalInited = PTHREAD_ONCE_INIT; @@ -190,7 +209,7 @@ static void teardownThread(void *p) { return; } drainQuarantine(); - getAllocator().DestroyCache(&Cache); + getBackendAllocator().DestroyCache(&Cache); ThreadTornDown = true; } @@ -223,7 +242,7 @@ static void initGlobal() { static void NOINLINE initThread() { pthread_once(&GlobalInited, initGlobal); pthread_setspecific(PThreadKey, reinterpret_cast<void *>(1)); - getAllocator().InitCache(&Cache); + getBackendAllocator().InitCache(&Cache); ThreadInited = true; } @@ -235,38 +254,31 @@ struct QuarantineCallback { void Recycle(ScudoChunk *Chunk) { UnpackedHeader Header; Chunk->loadHeader(&Header); - if (Header.State != ChunkQuarantine) { + if (UNLIKELY(Header.State != ChunkQuarantine)) { dieWithMessage("ERROR: invalid chunk state when recycling address %p\n", Chunk); } + Chunk->eraseHeader(); void *Ptr = Chunk->getAllocBeg(&Header); - getAllocator().Deallocate(Cache_, Ptr); + getBackendAllocator().Deallocate(Cache_, Ptr); } /// Internal quarantine allocation and deallocation functions. void *Allocate(uptr Size) { - // The internal quarantine memory cannot be protected by us. But the only - // structures allocated are QuarantineBatch, that are 8KB for x64. So we - // will use mmap for those, and given that Deallocate doesn't pass a size - // in, we enforce the size of the allocation to be sizeof(QuarantineBatch). - // TODO(kostyak): switching to mmap impacts greatly performances, we have - // to find another solution - // CHECK_EQ(Size, sizeof(QuarantineBatch)); - // return MmapOrDie(Size, "QuarantineBatch"); - return getAllocator().Allocate(Cache_, Size, 1, false); + // TODO(kostyak): figure out the best way to protect the batches. 
+ return getBackendAllocator().Allocate(Cache_, Size, MinAlignment); } void Deallocate(void *Ptr) { - // UnmapOrDie(Ptr, sizeof(QuarantineBatch)); - getAllocator().Deallocate(Cache_, Ptr); + getBackendAllocator().Deallocate(Cache_, Ptr); } AllocatorCache *Cache_; }; typedef Quarantine<QuarantineCallback, ScudoChunk> ScudoQuarantine; -typedef ScudoQuarantine::Cache QuarantineCache; -static thread_local QuarantineCache ThreadQuarantineCache; +typedef ScudoQuarantine::Cache ScudoQuarantineCache; +static thread_local ScudoQuarantineCache ThreadQuarantineCache; void AllocatorOptions::setFrom(const Flags *f, const CommonFlags *cf) { MayReturnNull = cf->allocator_may_return_null; @@ -288,11 +300,11 @@ void AllocatorOptions::copyTo(Flags *f, CommonFlags *cf) const { f->ZeroContents = ZeroContents; } -struct Allocator { +struct ScudoAllocator { static const uptr MaxAllowedMallocSize = FIRST_32_SECOND_64(2UL << 30, 1ULL << 40); - ScudoAllocator BackendAllocator; + ScudoBackendAllocator BackendAllocator; ScudoQuarantine AllocatorQuarantine; // The fallback caches are used when the thread local caches have been @@ -300,13 +312,13 @@ struct Allocator { // be accessed by different threads. StaticSpinMutex FallbackMutex; AllocatorCache FallbackAllocatorCache; - QuarantineCache FallbackQuarantineCache; + ScudoQuarantineCache FallbackQuarantineCache; bool DeallocationTypeMismatch; bool ZeroContents; bool DeleteSizeMismatch; - explicit Allocator(LinkerInitialized) + explicit ScudoAllocator(LinkerInitialized) : AllocatorQuarantine(LINKER_INITIALIZED), FallbackQuarantineCache(LINKER_INITIALIZED) {} @@ -329,14 +341,14 @@ struct Allocator { dieWithMessage("ERROR: the maximum possible offset doesn't fit in the " "header\n"); } - // Verify that we can fit the maximum amount of unused bytes in the header. - // Given that the Secondary fits the allocation to a page, the worst case - // scenario happens in the Primary. It will depend on the second to last - // and last class sizes, as well as the dynamic base for the Primary. The - // following is an over-approximation that works for our needs. - uptr MaxUnusedBytes = SizeClassMap::kMaxSize - 1 - AlignedChunkHeaderSize; - Header.UnusedBytes = MaxUnusedBytes; - if (Header.UnusedBytes != MaxUnusedBytes) { + // Verify that we can fit the maximum size or amount of unused bytes in the + // header. Given that the Secondary fits the allocation to a page, the worst + // case scenario happens in the Primary. It will depend on the second to + // last and last class sizes, as well as the dynamic base for the Primary. + // The following is an over-approximation that works for our needs. + uptr MaxSizeOrUnusedBytes = SizeClassMap::kMaxSize - 1; + Header.SizeOrUnusedBytes = MaxSizeOrUnusedBytes; + if (Header.SizeOrUnusedBytes != MaxSizeOrUnusedBytes) { dieWithMessage("ERROR: the maximum possible unused bytes doesn't fit in " "the header\n"); } @@ -349,37 +361,37 @@ struct Allocator { static_cast<uptr>(Options.QuarantineSizeMb) << 20, static_cast<uptr>(Options.ThreadLocalQuarantineSizeKb) << 10); BackendAllocator.InitCache(&FallbackAllocatorCache); - Cookie = Prng.Next(); + Cookie = Prng.getNext(); } - // Helper function that checks for a valid Scudo chunk. + // Helper function that checks for a valid Scudo chunk. nullptr isn't. 
bool isValidPointer(const void *UserPtr) { if (UNLIKELY(!ThreadInited)) initThread(); - uptr ChunkBeg = reinterpret_cast<uptr>(UserPtr); - if (!IsAligned(ChunkBeg, MinAlignment)) { + if (!UserPtr) return false; - } - ScudoChunk *Chunk = - reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize); - return Chunk->isValid(); + uptr UserBeg = reinterpret_cast<uptr>(UserPtr); + if (!IsAligned(UserBeg, MinAlignment)) + return false; + return getScudoChunk(UserBeg)->isValid(); } // Allocates a chunk. - void *allocate(uptr Size, uptr Alignment, AllocType Type) { + void *allocate(uptr Size, uptr Alignment, AllocType Type, + bool ForceZeroContents = false) { if (UNLIKELY(!ThreadInited)) initThread(); - if (!IsPowerOfTwo(Alignment)) { + if (UNLIKELY(!IsPowerOfTwo(Alignment))) { dieWithMessage("ERROR: alignment is not a power of 2\n"); } if (Alignment > MaxAlignment) return BackendAllocator.ReturnNullOrDieOnBadRequest(); if (Alignment < MinAlignment) Alignment = MinAlignment; - if (Size == 0) - Size = 1; if (Size >= MaxAllowedMallocSize) return BackendAllocator.ReturnNullOrDieOnBadRequest(); + if (Size == 0) + Size = 1; uptr NeededSize = RoundUpTo(Size, MinAlignment) + AlignedChunkHeaderSize; if (Alignment > MinAlignment) @@ -395,13 +407,13 @@ struct Allocator { bool FromPrimary = PrimaryAllocator::CanAllocate(NeededSize, MinAlignment); void *Ptr; + uptr AllocationAlignment = FromPrimary ? MinAlignment : Alignment; if (LIKELY(!ThreadTornDown)) { - Ptr = BackendAllocator.Allocate(&Cache, NeededSize, - FromPrimary ? MinAlignment : Alignment); + Ptr = BackendAllocator.Allocate(&Cache, NeededSize, AllocationAlignment); } else { SpinMutexLock l(&FallbackMutex); Ptr = BackendAllocator.Allocate(&FallbackAllocatorCache, NeededSize, - FromPrimary ? MinAlignment : Alignment); + AllocationAlignment); } if (!Ptr) return BackendAllocator.ReturnNullOrDieOnOOM(); @@ -416,30 +428,34 @@ struct Allocator { NeededSize -= Alignment; } - uptr ActuallyAllocatedSize = BackendAllocator.GetActuallyAllocatedSize( - reinterpret_cast<void *>(AllocBeg)); // If requested, we will zero out the entire contents of the returned chunk. - if (ZeroContents && FromPrimary) - memset(Ptr, 0, ActuallyAllocatedSize); - - uptr ChunkBeg = AllocBeg + AlignedChunkHeaderSize; - if (!IsAligned(ChunkBeg, Alignment)) - ChunkBeg = RoundUpTo(ChunkBeg, Alignment); - CHECK_LE(ChunkBeg + Size, AllocBeg + NeededSize); - ScudoChunk *Chunk = - reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize); + if ((ForceZeroContents || ZeroContents) && FromPrimary) + memset(Ptr, 0, BackendAllocator.GetActuallyAllocatedSize(Ptr)); + + uptr UserBeg = AllocBeg + AlignedChunkHeaderSize; + if (!IsAligned(UserBeg, Alignment)) + UserBeg = RoundUpTo(UserBeg, Alignment); + CHECK_LE(UserBeg + Size, AllocBeg + NeededSize); UnpackedHeader Header = {}; Header.State = ChunkAllocated; - uptr Offset = ChunkBeg - AlignedChunkHeaderSize - AllocBeg; + uptr Offset = UserBeg - AlignedChunkHeaderSize - AllocBeg; Header.Offset = Offset >> MinAlignmentLog; Header.AllocType = Type; - Header.UnusedBytes = ActuallyAllocatedSize - Offset - - AlignedChunkHeaderSize - Size; - Header.Salt = static_cast<u8>(Prng.Next()); - Chunk->storeHeader(&Header); - void *UserPtr = reinterpret_cast<void *>(ChunkBeg); - // TODO(kostyak): hooks sound like a terrible idea security wise but might - // be needed for things to work properly? 
+ if (FromPrimary) { + Header.FromPrimary = FromPrimary; + Header.SizeOrUnusedBytes = Size; + } else { + // The secondary fits the allocations to a page, so the amount of unused + // bytes is the difference between the end of the user allocation and the + // next page boundary. + uptr PageSize = GetPageSizeCached(); + uptr TrailingBytes = (UserBeg + Size) & (PageSize - 1); + if (TrailingBytes) + Header.SizeOrUnusedBytes = PageSize - TrailingBytes; + } + Header.Salt = static_cast<u8>(Prng.getNext()); + getScudoChunk(UserBeg)->storeHeader(&Header); + void *UserPtr = reinterpret_cast<void *>(UserBeg); // if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(UserPtr, Size); return UserPtr; } @@ -449,53 +465,57 @@ struct Allocator { void deallocate(void *UserPtr, uptr DeleteSize, AllocType Type) { if (UNLIKELY(!ThreadInited)) initThread(); - // TODO(kostyak): see hook comment above // if (&__sanitizer_free_hook) __sanitizer_free_hook(UserPtr); if (!UserPtr) return; - uptr ChunkBeg = reinterpret_cast<uptr>(UserPtr); - if (!IsAligned(ChunkBeg, MinAlignment)) { + uptr UserBeg = reinterpret_cast<uptr>(UserPtr); + if (UNLIKELY(!IsAligned(UserBeg, MinAlignment))) { dieWithMessage("ERROR: attempted to deallocate a chunk not properly " "aligned at address %p\n", UserPtr); } - ScudoChunk *Chunk = - reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize); + ScudoChunk *Chunk = getScudoChunk(UserBeg); UnpackedHeader OldHeader; Chunk->loadHeader(&OldHeader); - if (OldHeader.State != ChunkAllocated) { + if (UNLIKELY(OldHeader.State != ChunkAllocated)) { dieWithMessage("ERROR: invalid chunk state when deallocating address " "%p\n", UserPtr); } - uptr UsableSize = Chunk->getUsableSize(&OldHeader); - UnpackedHeader NewHeader = OldHeader; - NewHeader.State = ChunkQuarantine; - Chunk->compareExchangeHeader(&NewHeader, &OldHeader); if (DeallocationTypeMismatch) { // The deallocation type has to match the allocation one. - if (NewHeader.AllocType != Type) { + if (OldHeader.AllocType != Type) { // With the exception of memalign'd Chunks, that can still be free'd. - if (NewHeader.AllocType != FromMemalign || Type != FromMalloc) { + if (OldHeader.AllocType != FromMemalign || Type != FromMalloc) { dieWithMessage("ERROR: allocation type mismatch on address %p\n", - Chunk); + UserPtr); } } } - uptr Size = UsableSize - OldHeader.UnusedBytes; + uptr Size = OldHeader.FromPrimary ? OldHeader.SizeOrUnusedBytes : + Chunk->getUsableSize(&OldHeader) - OldHeader.SizeOrUnusedBytes; if (DeleteSizeMismatch) { if (DeleteSize && DeleteSize != Size) { dieWithMessage("ERROR: invalid sized delete on chunk at address %p\n", - Chunk); + UserPtr); } } + UnpackedHeader NewHeader = OldHeader; + NewHeader.State = ChunkQuarantine; + Chunk->compareExchangeHeader(&NewHeader, &OldHeader); + + // If a small memory amount was allocated with a larger alignment, we want + // to take that into account. Otherwise the Quarantine would be filled with + // tiny chunks, taking a lot of VA memory. This is an approximation of the + // usable size, that allows us to not call GetActuallyAllocatedSize. 
+ uptr LiableSize = Size + (OldHeader.Offset << MinAlignment); if (LIKELY(!ThreadTornDown)) { AllocatorQuarantine.Put(&ThreadQuarantineCache, - QuarantineCallback(&Cache), Chunk, UsableSize); + QuarantineCallback(&Cache), Chunk, LiableSize); } else { SpinMutexLock l(&FallbackMutex); AllocatorQuarantine.Put(&FallbackQuarantineCache, QuarantineCallback(&FallbackAllocatorCache), - Chunk, UsableSize); + Chunk, LiableSize); } } @@ -504,24 +524,30 @@ struct Allocator { void *reallocate(void *OldPtr, uptr NewSize) { if (UNLIKELY(!ThreadInited)) initThread(); - uptr ChunkBeg = reinterpret_cast<uptr>(OldPtr); - ScudoChunk *Chunk = - reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize); + uptr UserBeg = reinterpret_cast<uptr>(OldPtr); + if (UNLIKELY(!IsAligned(UserBeg, MinAlignment))) { + dieWithMessage("ERROR: attempted to reallocate a chunk not properly " + "aligned at address %p\n", OldPtr); + } + ScudoChunk *Chunk = getScudoChunk(UserBeg); UnpackedHeader OldHeader; Chunk->loadHeader(&OldHeader); - if (OldHeader.State != ChunkAllocated) { + if (UNLIKELY(OldHeader.State != ChunkAllocated)) { dieWithMessage("ERROR: invalid chunk state when reallocating address " "%p\n", OldPtr); } - uptr Size = Chunk->getUsableSize(&OldHeader); - if (OldHeader.AllocType != FromMalloc) { + if (UNLIKELY(OldHeader.AllocType != FromMalloc)) { dieWithMessage("ERROR: invalid chunk type when reallocating address %p\n", - Chunk); + OldPtr); } + uptr UsableSize = Chunk->getUsableSize(&OldHeader); UnpackedHeader NewHeader = OldHeader; - // The new size still fits in the current chunk. - if (NewSize <= Size) { - NewHeader.UnusedBytes = Size - NewSize; + // The new size still fits in the current chunk, and the size difference + // is reasonable. + if (NewSize <= UsableSize && + (UsableSize - NewSize) < (SizeClassMap::kMaxSize / 2)) { + NewHeader.SizeOrUnusedBytes = + OldHeader.FromPrimary ? NewSize : UsableSize - NewSize; Chunk->compareExchangeHeader(&NewHeader, &OldHeader); return OldPtr; } @@ -529,18 +555,19 @@ struct Allocator { // old one. void *NewPtr = allocate(NewSize, MinAlignment, FromMalloc); if (NewPtr) { - uptr OldSize = Size - OldHeader.UnusedBytes; + uptr OldSize = OldHeader.FromPrimary ? OldHeader.SizeOrUnusedBytes : + UsableSize - OldHeader.SizeOrUnusedBytes; memcpy(NewPtr, OldPtr, Min(NewSize, OldSize)); NewHeader.State = ChunkQuarantine; Chunk->compareExchangeHeader(&NewHeader, &OldHeader); if (LIKELY(!ThreadTornDown)) { AllocatorQuarantine.Put(&ThreadQuarantineCache, - QuarantineCallback(&Cache), Chunk, Size); + QuarantineCallback(&Cache), Chunk, UsableSize); } else { SpinMutexLock l(&FallbackMutex); AllocatorQuarantine.Put(&FallbackQuarantineCache, QuarantineCallback(&FallbackAllocatorCache), - Chunk, Size); + Chunk, UsableSize); } } return NewPtr; @@ -552,13 +579,12 @@ struct Allocator { initThread(); if (!Ptr) return 0; - uptr ChunkBeg = reinterpret_cast<uptr>(Ptr); - ScudoChunk *Chunk = - reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize); + uptr UserBeg = reinterpret_cast<uptr>(Ptr); + ScudoChunk *Chunk = getScudoChunk(UserBeg); UnpackedHeader Header; Chunk->loadHeader(&Header); // Getting the usable size of a chunk only makes sense if it's allocated. 
- if (Header.State != ChunkAllocated) { + if (UNLIKELY(Header.State != ChunkAllocated)) { dieWithMessage("ERROR: invalid chunk state when sizing address %p\n", Ptr); } @@ -569,13 +595,9 @@ struct Allocator { if (UNLIKELY(!ThreadInited)) initThread(); uptr Total = NMemB * Size; - if (Size != 0 && Total / Size != NMemB) // Overflow check + if (Size != 0 && Total / Size != NMemB) // Overflow check return BackendAllocator.ReturnNullOrDieOnBadRequest(); - void *Ptr = allocate(Total, MinAlignment, FromMalloc); - // If ZeroContents, the content of the chunk has already been zero'd out. - if (!ZeroContents && Ptr && BackendAllocator.FromPrimary(Ptr)) - memset(Ptr, 0, getUsableSize(Ptr)); - return Ptr; + return allocate(Total, MinAlignment, FromMalloc, true); } void drainQuarantine() { @@ -592,9 +614,9 @@ struct Allocator { } }; -static Allocator Instance(LINKER_INITIALIZED); +static ScudoAllocator Instance(LINKER_INITIALIZED); -static ScudoAllocator &getAllocator() { +static ScudoBackendAllocator &getBackendAllocator() { return Instance.BackendAllocator; } diff --git a/lib/scudo/scudo_allocator.h b/lib/scudo/scudo_allocator.h index 5f5225b36286..e7428f170271 100644 --- a/lib/scudo/scudo_allocator.h +++ b/lib/scudo/scudo_allocator.h @@ -41,19 +41,20 @@ enum ChunkState : u8 { // using functions such as GetBlockBegin, that is fairly costly. Our first // implementation used the MetaData as well, which offers the advantage of // being stored away from the chunk itself, but accessing it was costly as -// well. The header will be atomically loaded and stored using the 16-byte -// primitives offered by the platform (likely requires cmpxchg16b support). +// well. The header will be atomically loaded and stored. typedef u64 PackedHeader; struct UnpackedHeader { - u64 Checksum : 16; - u64 UnusedBytes : 20; // Needed for reallocation purposes. - u64 State : 2; // available, allocated, or quarantined - u64 AllocType : 2; // malloc, new, new[], or memalign - u64 Offset : 16; // Offset from the beginning of the backend - // allocation to the beginning of the chunk itself, - // in multiples of MinAlignment. See comment about - // its maximum value and test in init(). - u64 Salt : 8; + u64 Checksum : 16; + u64 SizeOrUnusedBytes : 19; // Size for Primary backed allocations, amount of + // unused bytes in the chunk for Secondary ones. + u64 FromPrimary : 1; + u64 State : 2; // available, allocated, or quarantined + u64 AllocType : 2; // malloc, new, new[], or memalign + u64 Offset : 16; // Offset from the beginning of the backend + // allocation to the beginning of the chunk + // itself, in multiples of MinAlignment. See + // comment about its maximum value and the test + // in init(). + u64 Salt : 8; }; typedef atomic_uint64_t AtomicPackedHeader; diff --git a/lib/scudo/scudo_allocator_secondary.h b/lib/scudo/scudo_allocator_secondary.h index b984f0db4dbd..fbc7f247d708 100644 --- a/lib/scudo/scudo_allocator_secondary.h +++ b/lib/scudo/scudo_allocator_secondary.h @@ -88,8 +88,11 @@ class ScudoLargeMmapAllocator { // The primary adds the whole class size to the stats when allocating a // chunk, so we will do something similar here. But we will not account for // the guard pages. 
- Stats->Add(AllocatorStatAllocated, MapSize - 2 * PageSize); - Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize); + { + SpinMutexLock l(&StatsMutex); + Stats->Add(AllocatorStatAllocated, MapSize - 2 * PageSize); + Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize); + } return reinterpret_cast<void *>(UserBeg); } @@ -112,8 +115,11 @@ class ScudoLargeMmapAllocator { void Deallocate(AllocatorStats *Stats, void *Ptr) { SecondaryHeader *Header = getHeader(Ptr); - Stats->Sub(AllocatorStatAllocated, Header->MapSize - 2 * PageSize); - Stats->Sub(AllocatorStatMapped, Header->MapSize - 2 * PageSize); + { + SpinMutexLock l(&StatsMutex); + Stats->Sub(AllocatorStatAllocated, Header->MapSize - 2 * PageSize); + Stats->Sub(AllocatorStatMapped, Header->MapSize - 2 * PageSize); + } UnmapOrDie(reinterpret_cast<void *>(Header->MapBeg), Header->MapSize); } @@ -127,7 +133,7 @@ class ScudoLargeMmapAllocator { uptr GetActuallyAllocatedSize(void *Ptr) { SecondaryHeader *Header = getHeader(Ptr); - // Deduct PageSize as MapEnd includes the trailing guard page. + // Deduct PageSize as MapSize includes the trailing guard page. uptr MapEnd = Header->MapBeg + Header->MapSize - PageSize; return MapEnd - reinterpret_cast<uptr>(Ptr); } @@ -182,6 +188,7 @@ class ScudoLargeMmapAllocator { const uptr SecondaryHeaderSize = sizeof(SecondaryHeader); const uptr HeadersSize = SecondaryHeaderSize + AlignedChunkHeaderSize; uptr PageSize; + SpinMutex StatsMutex; atomic_uint8_t MayReturnNull; }; diff --git a/lib/scudo/scudo_utils.cpp b/lib/scudo/scudo_utils.cpp index 4e2f6e08e80d..98bd591aa868 100644 --- a/lib/scudo/scudo_utils.cpp +++ b/lib/scudo/scudo_utils.cpp @@ -159,58 +159,4 @@ Xorshift128Plus::Xorshift128Plus() { fillRandom(reinterpret_cast<u8 *>(State), sizeof(State)); } -const static u32 CRC32Table[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 
0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, - 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}; - -u32 computeSoftwareCRC32(u32 Crc, uptr Data) { - for (uptr i = 0; i < sizeof(Data); i++) { - Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8); - Data >>= 8; - } - return Crc; -} - } // namespace __scudo diff --git a/lib/scudo/scudo_utils.h b/lib/scudo/scudo_utils.h index 5082d79f6954..f30c86125799 100644 --- a/lib/scudo/scudo_utils.h +++ b/lib/scudo/scudo_utils.h @@ -41,7 +41,7 @@ bool testCPUFeature(CPUFeature feature); struct Xorshift128Plus { public: Xorshift128Plus(); - u64 Next() { + u64 getNext() { u64 x = State[0]; const u64 y = State[1]; State[0] = y; @@ -58,7 +58,59 @@ enum : u8 { CRC32Hardware = 1, }; -u32 computeSoftwareCRC32(u32 Crc, uptr Data); +const static u32 CRC32Table[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, + 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, + 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, + 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, + 0xbe0b1010, 
0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, + 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, + 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, + 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, + 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +INLINE u32 computeSoftwareCRC32(u32 Crc, uptr Data) { + for (uptr i = 0; i < sizeof(Data); i++) { + Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8); + Data >>= 8; + } + return Crc; +} } // namespace __scudo diff --git a/lib/tsan/rtl/tsan_debugging.cc b/lib/tsan/rtl/tsan_debugging.cc index 06154bc135a9..a44b13632c61 100644 --- a/lib/tsan/rtl/tsan_debugging.cc +++ b/lib/tsan/rtl/tsan_debugging.cc @@ -151,7 +151,7 @@ int __tsan_get_report_mutex(void *report, uptr idx, uptr *mutex_id, void **addr, } SANITIZER_INTERFACE_ATTRIBUTE -int __tsan_get_report_thread(void *report, uptr idx, int *tid, uptr *os_id, +int __tsan_get_report_thread(void *report, uptr idx, int *tid, tid_t *os_id, int *running, const char **name, int *parent_tid, void **trace, uptr trace_size) { const ReportDesc *rep = (ReportDesc *)report; @@ -228,7 +228,7 @@ const char *__tsan_locate_address(uptr addr, char *name, uptr name_size, SANITIZER_INTERFACE_ATTRIBUTE int __tsan_get_alloc_stack(uptr addr, uptr *trace, uptr size, int *thread_id, - uptr *os_id) { + tid_t *os_id) { MBlock *b = 0; Allocator *a = allocator(); if (a->PointerIsMine((void *)addr)) { diff --git a/lib/tsan/rtl/tsan_interface.h b/lib/tsan/rtl/tsan_interface.h index 496a8717f155..71986283ee17 100644 --- a/lib/tsan/rtl/tsan_interface.h +++ b/lib/tsan/rtl/tsan_interface.h @@ -18,6 +18,7 @@ #include <sanitizer_common/sanitizer_internal_defs.h> using __sanitizer::uptr; +using __sanitizer::tid_t; // This header should NOT include any other headers. // All functions in this header are extern "C" and start with __tsan_. @@ -143,7 +144,7 @@ int __tsan_get_report_mutex(void *report, uptr idx, uptr *mutex_id, void **addr, // Returns information about threads included in the report. 
SANITIZER_INTERFACE_ATTRIBUTE -int __tsan_get_report_thread(void *report, uptr idx, int *tid, uptr *os_id, +int __tsan_get_report_thread(void *report, uptr idx, int *tid, tid_t *os_id, int *running, const char **name, int *parent_tid, void **trace, uptr trace_size); @@ -160,7 +161,7 @@ const char *__tsan_locate_address(uptr addr, char *name, uptr name_size, // Returns the allocation stack for a heap pointer. SANITIZER_INTERFACE_ATTRIBUTE int __tsan_get_alloc_stack(uptr addr, uptr *trace, uptr size, int *thread_id, - uptr *os_id); + tid_t *os_id); #endif // SANITIZER_GO diff --git a/lib/tsan/rtl/tsan_report.h b/lib/tsan/rtl/tsan_report.h index 8d8ae0fd8f58..a0473e8dbdad 100644 --- a/lib/tsan/rtl/tsan_report.h +++ b/lib/tsan/rtl/tsan_report.h @@ -90,7 +90,7 @@ struct ReportLocation { struct ReportThread { int id; - uptr os_id; + tid_t os_id; bool running; bool workerthread; char *name; diff --git a/lib/tsan/rtl/tsan_rtl.h b/lib/tsan/rtl/tsan_rtl.h index 0d62af00a05d..3481c31ebb1c 100644 --- a/lib/tsan/rtl/tsan_rtl.h +++ b/lib/tsan/rtl/tsan_rtl.h @@ -720,7 +720,7 @@ void FuncEntry(ThreadState *thr, uptr pc); void FuncExit(ThreadState *thr); int ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached); -void ThreadStart(ThreadState *thr, int tid, uptr os_id, bool workerthread); +void ThreadStart(ThreadState *thr, int tid, tid_t os_id, bool workerthread); void ThreadFinish(ThreadState *thr); int ThreadTid(ThreadState *thr, uptr pc, uptr uid); void ThreadJoin(ThreadState *thr, uptr pc, int tid); diff --git a/lib/tsan/rtl/tsan_rtl_thread.cc b/lib/tsan/rtl/tsan_rtl_thread.cc index 7357d97a264c..6a0943c49588 100644 --- a/lib/tsan/rtl/tsan_rtl_thread.cc +++ b/lib/tsan/rtl/tsan_rtl_thread.cc @@ -236,7 +236,7 @@ int ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached) { return tid; } -void ThreadStart(ThreadState *thr, int tid, uptr os_id, bool workerthread) { +void ThreadStart(ThreadState *thr, int tid, tid_t os_id, bool workerthread) { uptr stk_addr = 0; uptr stk_size = 0; uptr tls_addr = 0; diff --git a/lib/ubsan/ubsan_flags.cc b/lib/ubsan/ubsan_flags.cc index 3d404c1b7d34..8e1f40885a58 100644 --- a/lib/ubsan/ubsan_flags.cc +++ b/lib/ubsan/ubsan_flags.cc @@ -45,6 +45,7 @@ void InitializeFlags() { CommonFlags cf; cf.CopyFrom(*common_flags()); cf.print_summary = false; + cf.external_symbolizer_path = GetEnv("UBSAN_SYMBOLIZER_PATH"); OverrideCommonFlags(cf); } diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc index c5b63b0a564e..e538b477a3de 100644 --- a/lib/xray/xray_fdr_logging.cc +++ b/lib/xray/xray_fdr_logging.cc @@ -118,11 +118,15 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { return Result; } + // Test for required CPU features and cache the cycle frequency + static bool TSCSupported = probeRequiredCPUFeatures(); + static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency() + : __xray::NanosecondsPerSecond; + XRayFileHeader Header; Header.Version = 1; Header.Type = FileTypes::FDR_LOG; - Header.CycleFrequency = probeRequiredCPUFeatures() - ? getTSCFrequency() : __xray::NanosecondsPerSecond; + Header.CycleFrequency = CycleFrequency; // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc' // before setting the values in the header. 
Header.ConstantTSC = 1; @@ -196,7 +200,10 @@ void fdrLoggingHandleArg0(int32_t FuncId, unsigned char CPU; uint64_t TSC; - if(probeRequiredCPUFeatures()) { + // Test once for required CPU features + static bool TSCSupported = probeRequiredCPUFeatures(); + + if(TSCSupported) { TSC = __xray::readTSC(CPU); } else { // FIXME: This code needs refactoring as it appears in multiple locations diff --git a/lib/xray/xray_inmemory_log.cc b/lib/xray/xray_inmemory_log.cc index cdaa6d1b5c86..83aecfaf7700 100644 --- a/lib/xray/xray_inmemory_log.cc +++ b/lib/xray/xray_inmemory_log.cc @@ -79,15 +79,19 @@ static int __xray_OpenLogFile() XRAY_NEVER_INSTRUMENT { int F = getLogFD(); if (F == -1) return -1; + + // Test for required CPU features and cache the cycle frequency + static bool TSCSupported = probeRequiredCPUFeatures(); + static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency() + : __xray::NanosecondsPerSecond; + // Since we're here, we get to write the header. We set it up so that the // header will only be written once, at the start, and let the threads // logging do writes which just append. XRayFileHeader Header; Header.Version = 1; Header.Type = FileTypes::NAIVE_LOG; - Header.CycleFrequency = probeRequiredCPUFeatures() - ? getTSCFrequency() - : __xray::NanosecondsPerSecond; + Header.CycleFrequency = CycleFrequency; // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc' // before setting the values in the header. diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S index da0aae326bdc..847ecef8d425 100644 --- a/lib/xray/xray_trampoline_x86_64.S +++ b/lib/xray/xray_trampoline_x86_64.S @@ -16,41 +16,48 @@ #include "../builtins/assembly.h" .macro SAVE_REGISTERS - subq $200, %rsp - movupd %xmm0, 184(%rsp) - movupd %xmm1, 168(%rsp) - movupd %xmm2, 152(%rsp) - movupd %xmm3, 136(%rsp) - movupd %xmm4, 120(%rsp) - movupd %xmm5, 104(%rsp) - movupd %xmm6, 88(%rsp) - movupd %xmm7, 72(%rsp) - movq %rdi, 64(%rsp) - movq %rax, 56(%rsp) - movq %rdx, 48(%rsp) - movq %rsi, 40(%rsp) - movq %rcx, 32(%rsp) - movq %r8, 24(%rsp) - movq %r9, 16(%rsp) + subq $192, %rsp + .cfi_def_cfa_offset 200 + // At this point, the stack pointer should be aligned to an 8-byte boundary, + // because any call instructions that come after this will add another 8 + // bytes and therefore align it to 16-bytes. 
+ movq %rbp, 184(%rsp) + movupd %xmm0, 168(%rsp) + movupd %xmm1, 152(%rsp) + movupd %xmm2, 136(%rsp) + movupd %xmm3, 120(%rsp) + movupd %xmm4, 104(%rsp) + movupd %xmm5, 88(%rsp) + movupd %xmm6, 72(%rsp) + movupd %xmm7, 56(%rsp) + movq %rdi, 48(%rsp) + movq %rax, 40(%rsp) + movq %rdx, 32(%rsp) + movq %rsi, 24(%rsp) + movq %rcx, 16(%rsp) + movq %r8, 8(%rsp) + movq %r9, 0(%rsp) .endm .macro RESTORE_REGISTERS - movupd 184(%rsp), %xmm0 - movupd 168(%rsp), %xmm1 - movupd 152(%rsp), %xmm2 - movupd 136(%rsp), %xmm3 - movupd 120(%rsp), %xmm4 - movupd 104(%rsp), %xmm5 - movupd 88(%rsp) , %xmm6 - movupd 72(%rsp) , %xmm7 - movq 64(%rsp), %rdi - movq 56(%rsp), %rax - movq 48(%rsp), %rdx - movq 40(%rsp), %rsi - movq 32(%rsp), %rcx - movq 24(%rsp), %r8 - movq 16(%rsp), %r9 - addq $200, %rsp + movq 184(%rsp), %rbp + movupd 168(%rsp), %xmm0 + movupd 152(%rsp), %xmm1 + movupd 136(%rsp), %xmm2 + movupd 120(%rsp), %xmm3 + movupd 104(%rsp), %xmm4 + movupd 88(%rsp), %xmm5 + movupd 72(%rsp) , %xmm6 + movupd 56(%rsp) , %xmm7 + movq 48(%rsp), %rdi + movq 40(%rsp), %rax + movq 32(%rsp), %rdx + movq 24(%rsp), %rsi + movq 16(%rsp), %rcx + movq 8(%rsp), %r8 + movq 0(%rsp), %r9 + addq $192, %rsp + .cfi_def_cfa_offset 8 .endm .text @@ -64,8 +71,6 @@ __xray_FunctionEntry: .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 SAVE_REGISTERS // This load has to be atomic, it's concurrent with __xray_patch(). @@ -80,7 +85,6 @@ __xray_FunctionEntry: callq *%rax .Ltmp0: RESTORE_REGISTERS - popq %rbp retq .Ltmp1: .size __xray_FunctionEntry, .Ltmp1-__xray_FunctionEntry @@ -96,14 +100,13 @@ __xray_FunctionExit: // Save the important registers first. Since we're assuming that this // function is only jumped into, we only preserve the registers for // returning. - pushq %rbp - .cfi_def_cfa_offset 16 subq $56, %rsp - .cfi_def_cfa_offset 32 - movupd %xmm0, 40(%rsp) - movupd %xmm1, 24(%rsp) - movq %rax, 16(%rsp) - movq %rdx, 8(%rsp) + .cfi_def_cfa_offset 64 + movq %rbp, 48(%rsp) + movupd %xmm0, 32(%rsp) + movupd %xmm1, 16(%rsp) + movq %rax, 8(%rsp) + movq %rdx, 0(%rsp) movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax testq %rax,%rax je .Ltmp2 @@ -113,12 +116,13 @@ __xray_FunctionExit: callq *%rax .Ltmp2: // Restore the important registers. - movupd 40(%rsp), %xmm0 - movupd 24(%rsp), %xmm1 - movq 16(%rsp), %rax - movq 8(%rsp), %rdx + movq 48(%rsp), %rbp + movupd 32(%rsp), %xmm0 + movupd 16(%rsp), %xmm1 + movq 8(%rsp), %rax + movq 0(%rsp), %rdx addq $56, %rsp - popq %rbp + .cfi_def_cfa_offset 8 retq .Ltmp3: .size __xray_FunctionExit, .Ltmp3-__xray_FunctionExit @@ -135,8 +139,6 @@ __xray_FunctionTailExit: // this is an exit. In the future, we will introduce a new entry type that // differentiates between a normal exit and a tail exit, but we'd have to do // this and increment the version number for the header. - pushq %rbp - .cfi_def_cfa_offset 16 SAVE_REGISTERS movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax @@ -149,7 +151,6 @@ __xray_FunctionTailExit: .Ltmp4: RESTORE_REGISTERS - popq %rbp retq .Ltmp5: .size __xray_FunctionTailExit, .Ltmp5-__xray_FunctionTailExit @@ -162,8 +163,6 @@ __xray_FunctionTailExit: .type __xray_ArgLoggerEntry,@function __xray_ArgLoggerEntry: .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 SAVE_REGISTERS // Again, these function pointer loads must be atomic; MOV is fine. 
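The atomicity requirement in the comment above can be pictured in C++ as follows. This is an illustrative sketch only: PatchedFunction and trampolineBody are local stand-ins for the real handler slot (the mangled symbol _ZN6__xray19XRayPatchedFunctionE loaded by the trampolines), and XRayEntryType is assumed to come from xray_interface.h:

using XRayHandler = void (*)(int32_t FuncId, XRayEntryType Kind);
static XRayHandler PatchedFunction = nullptr;  // stand-in for the real slot

void trampolineBody(int32_t FuncId) {
  // The assembly performs a single MOV from the handler slot: an aligned
  // pointer-sized load is atomic on x86-64, so no lock is needed even though
  // __xray_patch() may be storing to the slot concurrently.
  XRayHandler Handler = __atomic_load_n(&PatchedFunction, __ATOMIC_ACQUIRE);
  if (Handler)
    Handler(FuncId, XRayEntryType::ENTRY);
}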
@@ -184,7 +183,6 @@ __xray_ArgLoggerEntry: .Larg1entryFail: RESTORE_REGISTERS - popq %rbp retq .Larg1entryEnd: diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc index 8c2a4e313e3a..2e9a8d270c33 100644 --- a/lib/xray/xray_x86_64.cc +++ b/lib/xray/xray_x86_64.cc @@ -214,6 +214,12 @@ bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { Report("Missing rdtscp support.\n"); return false; } + // Also check whether we can determine the CPU frequency, since if we cannot, + // we should use the emulated TSC instead. + if (!getTSCFrequency()) { + Report("Unable to determine CPU frequency.\n"); + return false; + } return true; }
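The XRay logging changes above share one idea: probe the CPU once and cache the result, instead of calling probeRequiredCPUFeatures() on every log record. Condensed into a single helper, as a sketch under the declarations in xray_x86_64.cc; cachedCycleFrequency is a hypothetical name, not part of this commit:

uint64_t cachedCycleFrequency() {
  // Function-local statics are initialized exactly once (thread-safely in
  // C++11), so the cpuid/rdtscp probe runs a single time per process.
  static bool TSCSupported = probeRequiredCPUFeatures();
  static uint64_t Frequency =
      TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
  return Frequency;
}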