| | | |
| --- | --- | --- |
| author | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000 |
| commit | cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch) | |
| tree | 209fb2a2d68f8f277793fc8df46c753d31bc853b /compiler-rt/lib/scudo/standalone | |
| parent | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff) | |
Diffstat (limited to 'compiler-rt/lib/scudo/standalone')
33 files changed, 1997 insertions, 501 deletions
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h index 3a5aaae73674..ad2a17ef7014 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.h +++ b/compiler-rt/lib/scudo/standalone/allocator_config.h @@ -32,20 +32,23 @@ struct DefaultConfig { // 512KB regions typedef SizeClassAllocator32<SizeClassMap, 19U> Primary; #endif - typedef MapAllocator<> Secondary; + typedef MapAllocator<MapAllocatorCache<>> Secondary; template <class A> using TSDRegistryT = TSDRegistryExT<A>; // Exclusive }; struct AndroidConfig { using SizeClassMap = AndroidSizeClassMap; #if SCUDO_CAN_USE_PRIMARY64 - // 1GB regions - typedef SizeClassAllocator64<SizeClassMap, 30U> Primary; + // 256MB regions + typedef SizeClassAllocator64<SizeClassMap, 28U, 1000, 1000, + /*MaySupportMemoryTagging=*/true> + Primary; #else - // 512KB regions - typedef SizeClassAllocator32<SizeClassMap, 19U> Primary; + // 256KB regions + typedef SizeClassAllocator32<SizeClassMap, 18U, 1000, 1000> Primary; #endif - typedef MapAllocator<> Secondary; + // Cache blocks up to 2MB + typedef MapAllocator<MapAllocatorCache<32U, 2UL << 20, 0, 1000>> Secondary; template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 2U>; // Shared, max 2 TSDs. }; @@ -53,13 +56,13 @@ struct AndroidConfig { struct AndroidSvelteConfig { using SizeClassMap = SvelteSizeClassMap; #if SCUDO_CAN_USE_PRIMARY64 - // 512MB regions - typedef SizeClassAllocator64<SizeClassMap, 29U> Primary; + // 128MB regions + typedef SizeClassAllocator64<SizeClassMap, 27U, 1000, 1000> Primary; #else // 64KB regions - typedef SizeClassAllocator32<SizeClassMap, 16U> Primary; + typedef SizeClassAllocator32<SizeClassMap, 16U, 1000, 1000> Primary; #endif - typedef MapAllocator<0U> Secondary; + typedef MapAllocator<MapAllocatorCache<4U, 1UL << 18, 0, 0>> Secondary; template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 1U>; // Shared, only 1 TSD. }; @@ -68,7 +71,7 @@ struct AndroidSvelteConfig { struct FuchsiaConfig { // 1GB Regions typedef SizeClassAllocator64<DefaultSizeClassMap, 30U> Primary; - typedef MapAllocator<0U> Secondary; + typedef MapAllocator<MapAllocatorNoCache> Secondary; template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 8U>; // Shared, max 8 TSDs. }; diff --git a/compiler-rt/lib/scudo/standalone/atomic_helpers.h b/compiler-rt/lib/scudo/standalone/atomic_helpers.h index 6c84ba86ed32..1ea1a86ae506 100644 --- a/compiler-rt/lib/scudo/standalone/atomic_helpers.h +++ b/compiler-rt/lib/scudo/standalone/atomic_helpers.h @@ -51,7 +51,7 @@ struct atomic_u32 { struct atomic_u64 { typedef u64 Type; // On 32-bit platforms u64 is not necessarily aligned on 8 bytes. 
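The config change above turns the secondary's cache into a template parameter: each platform config picks `MapAllocatorCache<...>` with its own entry count and block-size limit, or `MapAllocatorNoCache` to disable caching entirely. Below is a minimal, self-contained sketch of that policy-based composition; every class name and constant in it is an illustrative stand-in, not the real scudo type.

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical stand-ins for MapAllocatorCache<> / MapAllocatorNoCache: a
// cache policy is just a set of compile-time limits.
template <std::size_t MaxEntries, std::size_t MaxBlockSize> struct CachePolicy {
  static constexpr bool Enabled = MaxEntries != 0;
  static constexpr std::size_t Entries = MaxEntries;
  static constexpr std::size_t BlockLimit = MaxBlockSize;
};
struct NoCachePolicy {
  static constexpr bool Enabled = false;
};

// The secondary is parameterized by its cache policy, mirroring
// MapAllocator<MapAllocatorCache<...>> vs MapAllocator<MapAllocatorNoCache>.
template <class CacheT> struct ToyMapAllocator {
  static constexpr bool CachesBlocks = CacheT::Enabled;
};

// Per-platform configs name their components, like DefaultConfig/FuchsiaConfig.
struct CachingConfig {
  using Secondary = ToyMapAllocator<CachePolicy<32, 2UL << 20>>; // cache up to 2 MB
};
struct NoCacheConfig {
  using Secondary = ToyMapAllocator<NoCachePolicy>;
};

// The combined allocator is instantiated from a config at compile time.
template <class Config> struct ToyAllocator {
  static constexpr bool SecondaryCaches = Config::Secondary::CachesBlocks;
};

int main() {
  std::cout << ToyAllocator<CachingConfig>::SecondaryCaches << '\n'; // 1
  std::cout << ToyAllocator<NoCacheConfig>::SecondaryCaches << '\n'; // 0
}
```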
- ALIGNED(8) volatile Type ValDoNotUse; + alignas(8) volatile Type ValDoNotUse; }; struct atomic_uptr { diff --git a/compiler-rt/lib/scudo/standalone/bytemap.h b/compiler-rt/lib/scudo/standalone/bytemap.h index a03a0c471062..e0d54f4e5971 100644 --- a/compiler-rt/lib/scudo/standalone/bytemap.h +++ b/compiler-rt/lib/scudo/standalone/bytemap.h @@ -17,12 +17,10 @@ namespace scudo { template <uptr Size> class FlatByteMap { public: - void initLinkerInitialized() { - Map = reinterpret_cast<u8 *>(map(nullptr, Size, "scudo:bytemap")); - } - void init() { initLinkerInitialized(); } + void initLinkerInitialized() {} + void init() { memset(Map, 0, sizeof(Map)); } - void unmapTestOnly() { unmap(reinterpret_cast<void *>(Map), Size); } + void unmapTestOnly() {} void set(uptr Index, u8 Value) { DCHECK_LT(Index, Size); @@ -38,78 +36,7 @@ public: void enable() {} private: - u8 *Map; -}; - -template <uptr Level1Size, uptr Level2Size> class TwoLevelByteMap { -public: - void initLinkerInitialized() { - Level1Map = reinterpret_cast<atomic_uptr *>( - map(nullptr, sizeof(atomic_uptr) * Level1Size, "scudo:bytemap")); - } - void init() { - Mutex.init(); - initLinkerInitialized(); - } - - void reset() { - for (uptr I = 0; I < Level1Size; I++) { - u8 *P = get(I); - if (!P) - continue; - unmap(P, Level2Size); - } - memset(Level1Map, 0, sizeof(atomic_uptr) * Level1Size); - } - - void unmapTestOnly() { - reset(); - unmap(reinterpret_cast<void *>(Level1Map), - sizeof(atomic_uptr) * Level1Size); - } - - uptr size() const { return Level1Size * Level2Size; } - - void set(uptr Index, u8 Value) { - DCHECK_LT(Index, Level1Size * Level2Size); - u8 *Level2Map = getOrCreate(Index / Level2Size); - DCHECK_EQ(0U, Level2Map[Index % Level2Size]); - Level2Map[Index % Level2Size] = Value; - } - - u8 operator[](uptr Index) const { - DCHECK_LT(Index, Level1Size * Level2Size); - u8 *Level2Map = get(Index / Level2Size); - if (!Level2Map) - return 0; - return Level2Map[Index % Level2Size]; - } - - void disable() { Mutex.lock(); } - void enable() { Mutex.unlock(); } - -private: - u8 *get(uptr Index) const { - DCHECK_LT(Index, Level1Size); - return reinterpret_cast<u8 *>( - atomic_load(&Level1Map[Index], memory_order_acquire)); - } - - u8 *getOrCreate(uptr Index) { - u8 *Res = get(Index); - if (!Res) { - ScopedLock L(Mutex); - if (!(Res = get(Index))) { - Res = reinterpret_cast<u8 *>(map(nullptr, Level2Size, "scudo:bytemap")); - atomic_store(&Level1Map[Index], reinterpret_cast<uptr>(Res), - memory_order_release); - } - } - return Res; - } - - atomic_uptr *Level1Map; - HybridMutex Mutex; + u8 Map[Size]; }; } // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/checksum.cpp b/compiler-rt/lib/scudo/standalone/checksum.cpp index 5de049a0931b..05d4ba54bfc8 100644 --- a/compiler-rt/lib/scudo/standalone/checksum.cpp +++ b/compiler-rt/lib/scudo/standalone/checksum.cpp @@ -31,6 +31,13 @@ Checksum HashAlgorithm = {Checksum::BSD}; #define bit_SSE4_2 bit_SSE42 // clang and gcc have different defines. #endif +#ifndef signature_HYGON_ebx // They are not defined in gcc. +// HYGON: "HygonGenuine". 
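The three HYGON constants added below are simply the CPU vendor string "HygonGenuine" packed into the EBX/EDX/ECX registers returned by CPUID leaf 0, four little-endian bytes per register. A small stand-alone sketch that decodes them back into the string:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // CPUID leaf 0 returns the 12-byte vendor string in the order EBX, EDX, ECX.
  const uint32_t Ebx = 0x6f677948; // "Hygo"
  const uint32_t Edx = 0x6e65476e; // "nGen"
  const uint32_t Ecx = 0x656e6975; // "uine"
  char Vendor[13] = {};
  std::memcpy(Vendor + 0, &Ebx, 4);
  std::memcpy(Vendor + 4, &Edx, 4);
  std::memcpy(Vendor + 8, &Ecx, 4);
  std::printf("%s\n", Vendor); // prints "HygonGenuine" on little-endian hosts
  return 0;
}
```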
+#define signature_HYGON_ebx 0x6f677948 +#define signature_HYGON_edx 0x6e65476e +#define signature_HYGON_ecx 0x656e6975 +#endif + bool hasHardwareCRC32() { u32 Eax, Ebx = 0, Ecx = 0, Edx = 0; __get_cpuid(0, &Eax, &Ebx, &Ecx, &Edx); @@ -39,7 +46,10 @@ bool hasHardwareCRC32() { (Ecx == signature_INTEL_ecx); const bool IsAMD = (Ebx == signature_AMD_ebx) && (Edx == signature_AMD_edx) && (Ecx == signature_AMD_ecx); - if (!IsIntel && !IsAMD) + const bool IsHygon = (Ebx == signature_HYGON_ebx) && + (Edx == signature_HYGON_edx) && + (Ecx == signature_HYGON_ecx); + if (!IsIntel && !IsAMD && !IsHygon) return false; __get_cpuid(1, &Eax, &Ebx, &Ecx, &Edx); return !!(Ecx & bit_SSE4_2); diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index a0b4b2973e96..3bb41eca88f7 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -13,28 +13,36 @@ #include "common.h" #include "flags.h" #include "flags_parser.h" -#include "interface.h" #include "local_cache.h" +#include "memtag.h" #include "quarantine.h" #include "report.h" #include "secondary.h" +#include "stack_depot.h" #include "string_utils.h" #include "tsd.h" +#include "scudo/interface.h" + #ifdef GWP_ASAN_HOOKS #include "gwp_asan/guarded_pool_allocator.h" -// GWP-ASan is declared here in order to avoid indirect call overhead. It's also -// instantiated outside of the Allocator class, as the allocator is only -// zero-initialised. GWP-ASan requires constant initialisation, and the Scudo -// allocator doesn't have a constexpr constructor (see discussion here: -// https://reviews.llvm.org/D69265#inline-624315). -static gwp_asan::GuardedPoolAllocator GuardedAlloc; +#include "gwp_asan/optional/backtrace.h" +#include "gwp_asan/optional/segv_handler.h" #endif // GWP_ASAN_HOOKS extern "C" inline void EmptyCallback() {} +#ifdef HAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE +// This function is not part of the NDK so it does not appear in any public +// header files. We only declare/use it when targeting the platform. +extern "C" size_t android_unsafe_frame_pointer_chase(scudo::uptr *buf, + size_t num_entries); +#endif + namespace scudo { +enum class Option { ReleaseInterval }; + template <class Params, void (*PostInitCallback)(void) = EmptyCallback> class Allocator { public: @@ -139,20 +147,29 @@ public: // Store some flags locally. Options.MayReturnNull = getFlags()->may_return_null; - Options.ZeroContents = getFlags()->zero_contents; + Options.FillContents = + getFlags()->zero_contents + ? ZeroFill + : (getFlags()->pattern_fill_contents ? PatternOrZeroFill : NoFill); Options.DeallocTypeMismatch = getFlags()->dealloc_type_mismatch; Options.DeleteSizeMismatch = getFlags()->delete_size_mismatch; + Options.TrackAllocationStacks = false; Options.QuarantineMaxChunkSize = static_cast<u32>(getFlags()->quarantine_max_chunk_size); Stats.initLinkerInitialized(); - Primary.initLinkerInitialized(getFlags()->release_to_os_interval_ms); - Secondary.initLinkerInitialized(&Stats); + const s32 ReleaseToOsIntervalMs = getFlags()->release_to_os_interval_ms; + Primary.initLinkerInitialized(ReleaseToOsIntervalMs); + Secondary.initLinkerInitialized(&Stats, ReleaseToOsIntervalMs); Quarantine.init( static_cast<uptr>(getFlags()->quarantine_size_kb << 10), static_cast<uptr>(getFlags()->thread_local_quarantine_size_kb << 10)); + } + // Initialize the embedded GWP-ASan instance. Requires the main allocator to + // be functional, best called from PostInitCallback. 
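initGwpAsan() is split out of init() so that the heavier, allocation-capable setup runs from the PostInitCallback template parameter once the core allocator is usable. A rough sketch of that deferred one-time callback pattern follows; ToyAllocator and initOptionalFeatures are made-up names used only for illustration.

```cpp
#include <cstddef>
#include <iostream>
#include <mutex>
#include <new>

extern "C" inline void EmptyCallback() {}

template <void (*PostInitCallback)() = EmptyCallback> class ToyAllocator {
public:
  void *allocate(std::size_t Size) {
    initThreadMaybe(); // make sure the one-time post-init has run
    return ::operator new(Size);
  }

private:
  void initThreadMaybe() {
    std::call_once(PostInitFlag, [] { PostInitCallback(); });
  }
  std::once_flag PostInitFlag;
};

void initOptionalFeatures() {
  std::cout << "post-init: set up optional features (e.g. GWP-ASan) here\n";
}

int main() {
  ToyAllocator<initOptionalFeatures> A;
  void *P = A.allocate(16); // first use triggers the callback exactly once
  ::operator delete(P);
}
```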
+ void initGwpAsan() { #ifdef GWP_ASAN_HOOKS gwp_asan::options::Options Opt; Opt.Enabled = getFlags()->GWP_ASAN_Enabled; @@ -165,8 +182,17 @@ public: getFlags()->GWP_ASAN_MaxSimultaneousAllocations; Opt.SampleRate = getFlags()->GWP_ASAN_SampleRate; Opt.InstallSignalHandlers = getFlags()->GWP_ASAN_InstallSignalHandlers; - Opt.Printf = Printf; + // Embedded GWP-ASan is locked through the Scudo atfork handler (via + // Allocator::disable calling GWPASan.disable). Disable GWP-ASan's atfork + // handler. + Opt.InstallForkHandlers = false; + Opt.Backtrace = gwp_asan::options::getBacktraceFunction(); GuardedAlloc.init(Opt); + + if (Opt.InstallSignalHandlers) + gwp_asan::crash_handler::installSignalHandlers( + &GuardedAlloc, Printf, gwp_asan::options::getPrintBacktraceFunction(), + Opt.Backtrace); #endif // GWP_ASAN_HOOKS } @@ -175,6 +201,11 @@ public: void unmapTestOnly() { TSDRegistry.unmapTestOnly(); Primary.unmapTestOnly(); +#ifdef GWP_ASAN_HOOKS + if (getFlags()->GWP_ASAN_InstallSignalHandlers) + gwp_asan::crash_handler::uninstallSignalHandlers(); + GuardedAlloc.uninitTestOnly(); +#endif // GWP_ASAN_HOOKS } TSDRegistryT *getTSDRegistry() { return &TSDRegistry; } @@ -195,6 +226,27 @@ public: TSD->Cache.destroy(&Stats); } + ALWAYS_INLINE void *untagPointerMaybe(void *Ptr) { + if (Primary.SupportsMemoryTagging) + return reinterpret_cast<void *>( + untagPointer(reinterpret_cast<uptr>(Ptr))); + return Ptr; + } + + NOINLINE u32 collectStackTrace() { +#ifdef HAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE + // Discard collectStackTrace() frame and allocator function frame. + constexpr uptr DiscardFrames = 2; + uptr Stack[MaxTraceSize + DiscardFrames]; + uptr Size = + android_unsafe_frame_pointer_chase(Stack, MaxTraceSize + DiscardFrames); + Size = Min<uptr>(Size, MaxTraceSize + DiscardFrames); + return Depot.insert(Stack + Min<uptr>(DiscardFrames, Size), Stack + Size); +#else + return 0; +#endif + } + NOINLINE void *allocate(uptr Size, Chunk::Origin Origin, uptr Alignment = MinAlignment, bool ZeroContents = false) { @@ -207,7 +259,8 @@ public: } #endif // GWP_ASAN_HOOKS - ZeroContents |= static_cast<bool>(Options.ZeroContents); + FillContentsMode FillContents = + ZeroContents ? ZeroFill : Options.FillContents; if (UNLIKELY(Alignment > MaxAlignment)) { if (Options.MayReturnNull) @@ -235,22 +288,36 @@ public: } DCHECK_LE(Size, NeededSize); - void *Block; - uptr ClassId; - uptr BlockEnd; + void *Block = nullptr; + uptr ClassId = 0; + uptr SecondaryBlockEnd; if (LIKELY(PrimaryT::canAllocate(NeededSize))) { ClassId = SizeClassMap::getClassIdBySize(NeededSize); DCHECK_NE(ClassId, 0U); bool UnlockRequired; auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired); Block = TSD->Cache.allocate(ClassId); + // If the allocation failed, the most likely reason with a 32-bit primary + // is the region being full. In that event, retry in each successively + // larger class until it fits. If it fails to fit in the largest class, + // fallback to the Secondary. 
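The comment above describes the new primary fallback path: when the chosen size class cannot serve the request (typically a full region with the 32-bit primary), the allocator retries each larger class and only then falls back to the secondary. A simplified, self-contained sketch of that loop, with toy primary/secondary types standing in for the real ones:

```cpp
#include <array>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

constexpr std::size_t NumClasses = 4;
constexpr std::array<std::size_t, NumClasses> ClassSizes = {32, 64, 128, 256};

struct ToyPrimary {
  // Pretend the two smallest classes are exhausted (e.g. their regions are full).
  void *allocate(std::size_t ClassId) {
    return ClassId >= 2 ? std::malloc(ClassSizes[ClassId]) : nullptr;
  }
};
struct ToySecondary {
  void *allocate(std::size_t Size) { return std::malloc(Size); }
};

void *allocateWithFallback(ToyPrimary &Primary, ToySecondary &Secondary,
                           std::size_t Size, std::size_t ClassId) {
  void *Block = Primary.allocate(ClassId);
  if (!Block) {
    // Retry in each successively larger class, as the allocator above does.
    while (ClassId + 1 < NumClasses) {
      Block = Primary.allocate(++ClassId);
      if (Block)
        break;
    }
  }
  if (!Block) // even the largest class failed: fall back to the secondary
    Block = Secondary.allocate(Size);
  return Block;
}

int main() {
  ToyPrimary P;
  ToySecondary S;
  void *Ptr = allocateWithFallback(P, S, /*Size=*/48, /*ClassId=*/1);
  std::printf("%p\n", Ptr);
  std::free(Ptr);
}
```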
+ if (UNLIKELY(!Block)) { + while (ClassId < SizeClassMap::LargestClassId) { + Block = TSD->Cache.allocate(++ClassId); + if (LIKELY(Block)) { + break; + } + } + if (UNLIKELY(!Block)) { + ClassId = 0; + } + } if (UnlockRequired) TSD->unlock(); - } else { - ClassId = 0; - Block = - Secondary.allocate(NeededSize, Alignment, &BlockEnd, ZeroContents); } + if (UNLIKELY(ClassId == 0)) + Block = Secondary.allocate(NeededSize, Alignment, &SecondaryBlockEnd, + FillContents); if (UNLIKELY(!Block)) { if (Options.MayReturnNull) @@ -258,16 +325,88 @@ public: reportOutOfMemory(NeededSize); } - // We only need to zero the contents for Primary backed allocations. This - // condition is not necessarily unlikely, but since memset is costly, we - // might as well mark it as such. - if (UNLIKELY(ZeroContents && ClassId)) - memset(Block, 0, PrimaryT::getSizeByClassId(ClassId)); - - const uptr UnalignedUserPtr = - reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize(); + const uptr BlockUptr = reinterpret_cast<uptr>(Block); + const uptr UnalignedUserPtr = BlockUptr + Chunk::getHeaderSize(); const uptr UserPtr = roundUpTo(UnalignedUserPtr, Alignment); + void *Ptr = reinterpret_cast<void *>(UserPtr); + void *TaggedPtr = Ptr; + if (ClassId) { + // We only need to zero or tag the contents for Primary backed + // allocations. We only set tags for primary allocations in order to avoid + // faulting potentially large numbers of pages for large secondary + // allocations. We assume that guard pages are enough to protect these + // allocations. + // + // FIXME: When the kernel provides a way to set the background tag of a + // mapping, we should be able to tag secondary allocations as well. + // + // When memory tagging is enabled, zeroing the contents is done as part of + // setting the tag. + if (UNLIKELY(useMemoryTagging())) { + uptr PrevUserPtr; + Chunk::UnpackedHeader Header; + const uptr BlockEnd = BlockUptr + PrimaryT::getSizeByClassId(ClassId); + // If possible, try to reuse the UAF tag that was set by deallocate(). + // For simplicity, only reuse tags if we have the same start address as + // the previous allocation. This handles the majority of cases since + // most allocations will not be more aligned than the minimum alignment. + // + // We need to handle situations involving reclaimed chunks, and retag + // the reclaimed portions if necessary. In the case where the chunk is + // fully reclaimed, the chunk's header will be zero, which will trigger + // the code path for new mappings and invalid chunks that prepares the + // chunk from scratch. There are three possibilities for partial + // reclaiming: + // + // (1) Header was reclaimed, data was partially reclaimed. + // (2) Header was not reclaimed, all data was reclaimed (e.g. because + // data started on a page boundary). + // (3) Header was not reclaimed, data was partially reclaimed. + // + // Case (1) will be handled in the same way as for full reclaiming, + // since the header will be zero. + // + // We can detect case (2) by loading the tag from the start + // of the chunk. If it is zero, it means that either all data was + // reclaimed (since we never use zero as the chunk tag), or that the + // previous allocation was of size zero. Either way, we need to prepare + // a new chunk from scratch. + // + // We can detect case (3) by moving to the next page (if covered by the + // chunk) and loading the tag of its first granule. If it is zero, it + // means that all following pages may need to be retagged. 
On the other + // hand, if it is nonzero, we can assume that all following pages are + // still tagged, according to the logic that if any of the pages + // following the next page were reclaimed, the next page would have been + // reclaimed as well. + uptr TaggedUserPtr; + if (getChunkFromBlock(BlockUptr, &PrevUserPtr, &Header) && + PrevUserPtr == UserPtr && + (TaggedUserPtr = loadTag(UserPtr)) != UserPtr) { + uptr PrevEnd = TaggedUserPtr + Header.SizeOrUnusedBytes; + const uptr NextPage = roundUpTo(TaggedUserPtr, getPageSizeCached()); + if (NextPage < PrevEnd && loadTag(NextPage) != NextPage) + PrevEnd = NextPage; + TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr); + resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, BlockEnd); + if (Size) { + // Clear any stack metadata that may have previously been stored in + // the chunk data. + memset(TaggedPtr, 0, archMemoryTagGranuleSize()); + } + } else { + TaggedPtr = prepareTaggedChunk(Ptr, Size, BlockEnd); + } + storeAllocationStackMaybe(Ptr); + } else if (UNLIKELY(FillContents != NoFill)) { + // This condition is not necessarily unlikely, but since memset is + // costly, we might as well mark it as such. + memset(Block, FillContents == ZeroFill ? 0 : PatternFillByte, + PrimaryT::getSizeByClassId(ClassId)); + } + } + Chunk::UnpackedHeader Header = {}; if (UNLIKELY(UnalignedUserPtr != UserPtr)) { const uptr Offset = UserPtr - UnalignedUserPtr; @@ -283,15 +422,15 @@ public: Header.ClassId = ClassId & Chunk::ClassIdMask; Header.State = Chunk::State::Allocated; Header.Origin = Origin & Chunk::OriginMask; - Header.SizeOrUnusedBytes = (ClassId ? Size : BlockEnd - (UserPtr + Size)) & - Chunk::SizeOrUnusedBytesMask; - void *Ptr = reinterpret_cast<void *>(UserPtr); + Header.SizeOrUnusedBytes = + (ClassId ? Size : SecondaryBlockEnd - (UserPtr + Size)) & + Chunk::SizeOrUnusedBytesMask; Chunk::storeHeader(Cookie, Ptr, &Header); if (&__scudo_allocate_hook) - __scudo_allocate_hook(Ptr, Size); + __scudo_allocate_hook(TaggedPtr, Size); - return Ptr; + return TaggedPtr; } NOINLINE void deallocate(void *Ptr, Chunk::Origin Origin, uptr DeleteSize = 0, @@ -319,6 +458,8 @@ public: if (UNLIKELY(!isAligned(reinterpret_cast<uptr>(Ptr), MinAlignment))) reportMisalignedPointer(AllocatorAction::Deallocating, Ptr); + Ptr = untagPointerMaybe(Ptr); + Chunk::UnpackedHeader Header; Chunk::loadHeader(Cookie, Ptr, &Header); @@ -346,6 +487,15 @@ public: void *reallocate(void *OldPtr, uptr NewSize, uptr Alignment = MinAlignment) { initThreadMaybe(); + if (UNLIKELY(NewSize >= MaxAllowedMallocSize)) { + if (Options.MayReturnNull) + return nullptr; + reportAllocationSizeTooBig(NewSize, 0, MaxAllowedMallocSize); + } + + void *OldTaggedPtr = OldPtr; + OldPtr = untagPointerMaybe(OldPtr); + // The following cases are handled by the C wrappers. DCHECK_NE(OldPtr, nullptr); DCHECK_NE(NewSize, 0); @@ -396,16 +546,20 @@ public: // reasonable delta), we just keep the old block, and update the chunk // header to reflect the size change. if (reinterpret_cast<uptr>(OldPtr) + NewSize <= BlockEnd) { - const uptr Delta = - OldSize < NewSize ? NewSize - OldSize : OldSize - NewSize; - if (Delta <= SizeClassMap::MaxSize / 2) { + if (NewSize > OldSize || (OldSize - NewSize) < getPageSizeCached()) { Chunk::UnpackedHeader NewHeader = OldHeader; NewHeader.SizeOrUnusedBytes = (ClassId ? 
NewSize : BlockEnd - (reinterpret_cast<uptr>(OldPtr) + NewSize)) & Chunk::SizeOrUnusedBytesMask; Chunk::compareExchangeHeader(Cookie, OldPtr, &NewHeader, &OldHeader); - return OldPtr; + if (UNLIKELY(ClassId && useMemoryTagging())) { + resizeTaggedChunk(reinterpret_cast<uptr>(OldTaggedPtr) + OldSize, + reinterpret_cast<uptr>(OldTaggedPtr) + NewSize, + BlockEnd); + storeAllocationStackMaybe(OldPtr); + } + return OldTaggedPtr; } } @@ -416,7 +570,7 @@ public: void *NewPtr = allocate(NewSize, Chunk::Origin::Malloc, Alignment); if (NewPtr) { const uptr OldSize = getSize(OldPtr, &OldHeader); - memcpy(NewPtr, OldPtr, Min(NewSize, OldSize)); + memcpy(NewPtr, OldTaggedPtr, Min(NewSize, OldSize)); quarantineOrDeallocateChunk(OldPtr, &OldHeader, OldSize); } return NewPtr; @@ -427,6 +581,9 @@ public: // this function finishes. We will revisit that later. void disable() { initThreadMaybe(); +#ifdef GWP_ASAN_HOOKS + GuardedAlloc.disable(); +#endif TSDRegistry.disable(); Stats.disable(); Quarantine.disable(); @@ -441,6 +598,9 @@ public: Quarantine.enable(); Stats.enable(); TSDRegistry.enable(); +#ifdef GWP_ASAN_HOOKS + GuardedAlloc.enable(); +#endif } // The function returns the amount of bytes required to store the statistics, @@ -473,6 +633,7 @@ public: void releaseToOS() { initThreadMaybe(); Primary.releaseToOS(); + Secondary.releaseToOS(); } // Iterate over all chunks and call a callback for all busy chunks located @@ -489,11 +650,19 @@ public: uptr Chunk; Chunk::UnpackedHeader Header; if (getChunkFromBlock(Block, &Chunk, &Header) && - Header.State == Chunk::State::Allocated) - Callback(Chunk, getSize(reinterpret_cast<void *>(Chunk), &Header), Arg); + Header.State == Chunk::State::Allocated) { + uptr TaggedChunk = Chunk; + if (useMemoryTagging()) + TaggedChunk = loadTag(Chunk); + Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header), + Arg); + } }; Primary.iterateOverBlocks(Lambda); Secondary.iterateOverBlocks(Lambda); +#ifdef GWP_ASAN_HOOKS + GuardedAlloc.iterate(reinterpret_cast<void *>(Base), Size, Callback, Arg); +#endif } bool canReturnNull() { @@ -501,8 +670,14 @@ public: return Options.MayReturnNull; } - // TODO(kostyak): implement this as a "backend" to mallopt. - bool setOption(UNUSED uptr Option, UNUSED uptr Value) { return false; } + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + Primary.setReleaseToOsIntervalMs(static_cast<s32>(Value)); + Secondary.setReleaseToOsIntervalMs(static_cast<s32>(Value)); + return true; + } + return false; + } // Return the usable size for a given chunk. Technically we lie, as we just // report the actual size of a chunk. This is done to counteract code actively @@ -519,6 +694,7 @@ public: return GuardedAlloc.getSize(Ptr); #endif // GWP_ASAN_HOOKS + Ptr = untagPointerMaybe(const_cast<void *>(Ptr)); Chunk::UnpackedHeader Header; Chunk::loadHeader(Cookie, Ptr, &Header); // Getting the usable size of a chunk only makes sense if it's allocated. 
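setOption() above is the new mallopt-style backend: Option::ReleaseInterval is forwarded to both the primary and the secondary, and the backends clamp the value to their compile-time MinReleaseToOsIntervalMs/MaxReleaseToOsIntervalMs bounds. A hedged sketch of that plumbing, with placeholder bounds and simplified clamping rather than the exact scudo logic:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

enum class Option { ReleaseInterval };

struct ToyBackend {
  void setReleaseToOsIntervalMs(int32_t Interval) {
    // Clamp to the compile-time bounds, as the real backends do.
    IntervalMs = std::clamp(Interval, MinIntervalMs, MaxIntervalMs);
  }
  int32_t IntervalMs = 5000;
  static constexpr int32_t MinIntervalMs = 0;     // placeholder bound
  static constexpr int32_t MaxIntervalMs = 60000; // placeholder bound
};

struct ToyCombined {
  bool setOption(Option O, intptr_t Value) {
    if (O == Option::ReleaseInterval) {
      Primary.setReleaseToOsIntervalMs(static_cast<int32_t>(Value));
      Secondary.setReleaseToOsIntervalMs(static_cast<int32_t>(Value));
      return true;
    }
    return false; // unknown option
  }
  ToyBackend Primary, Secondary;
};

int main() {
  ToyCombined A;
  A.setOption(Option::ReleaseInterval, 120000); // clamped to 60000 here
  std::cout << A.Primary.IntervalMs << '\n';
}
```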
@@ -543,11 +719,151 @@ public: #endif // GWP_ASAN_HOOKS if (!Ptr || !isAligned(reinterpret_cast<uptr>(Ptr), MinAlignment)) return false; + Ptr = untagPointerMaybe(const_cast<void *>(Ptr)); Chunk::UnpackedHeader Header; return Chunk::isValid(Cookie, Ptr, &Header) && Header.State == Chunk::State::Allocated; } + bool useMemoryTagging() { return Primary.useMemoryTagging(); } + + void disableMemoryTagging() { Primary.disableMemoryTagging(); } + + void setTrackAllocationStacks(bool Track) { + initThreadMaybe(); + Options.TrackAllocationStacks = Track; + } + + void setFillContents(FillContentsMode FillContents) { + initThreadMaybe(); + Options.FillContents = FillContents; + } + + const char *getStackDepotAddress() const { + return reinterpret_cast<const char *>(&Depot); + } + + const char *getRegionInfoArrayAddress() const { + return Primary.getRegionInfoArrayAddress(); + } + + static uptr getRegionInfoArraySize() { + return PrimaryT::getRegionInfoArraySize(); + } + + static void getErrorInfo(struct scudo_error_info *ErrorInfo, + uintptr_t FaultAddr, const char *DepotPtr, + const char *RegionInfoPtr, const char *Memory, + const char *MemoryTags, uintptr_t MemoryAddr, + size_t MemorySize) { + *ErrorInfo = {}; + if (!PrimaryT::SupportsMemoryTagging || + MemoryAddr + MemorySize < MemoryAddr) + return; + + uptr UntaggedFaultAddr = untagPointer(FaultAddr); + u8 FaultAddrTag = extractTag(FaultAddr); + BlockInfo Info = + PrimaryT::findNearestBlock(RegionInfoPtr, UntaggedFaultAddr); + + auto GetGranule = [&](uptr Addr, const char **Data, uint8_t *Tag) -> bool { + if (Addr < MemoryAddr || + Addr + archMemoryTagGranuleSize() < Addr || + Addr + archMemoryTagGranuleSize() > MemoryAddr + MemorySize) + return false; + *Data = &Memory[Addr - MemoryAddr]; + *Tag = static_cast<u8>( + MemoryTags[(Addr - MemoryAddr) / archMemoryTagGranuleSize()]); + return true; + }; + + auto ReadBlock = [&](uptr Addr, uptr *ChunkAddr, + Chunk::UnpackedHeader *Header, const u32 **Data, + u8 *Tag) { + const char *BlockBegin; + u8 BlockBeginTag; + if (!GetGranule(Addr, &BlockBegin, &BlockBeginTag)) + return false; + uptr ChunkOffset = getChunkOffsetFromBlock(BlockBegin); + *ChunkAddr = Addr + ChunkOffset; + + const char *ChunkBegin; + if (!GetGranule(*ChunkAddr, &ChunkBegin, Tag)) + return false; + *Header = *reinterpret_cast<const Chunk::UnpackedHeader *>( + ChunkBegin - Chunk::getHeaderSize()); + *Data = reinterpret_cast<const u32 *>(ChunkBegin); + return true; + }; + + auto *Depot = reinterpret_cast<const StackDepot *>(DepotPtr); + + auto MaybeCollectTrace = [&](uintptr_t(&Trace)[MaxTraceSize], u32 Hash) { + uptr RingPos, Size; + if (!Depot->find(Hash, &RingPos, &Size)) + return; + for (unsigned I = 0; I != Size && I != MaxTraceSize; ++I) + Trace[I] = (*Depot)[RingPos + I]; + }; + + size_t NextErrorReport = 0; + + // First, check for UAF. 
+ { + uptr ChunkAddr; + Chunk::UnpackedHeader Header; + const u32 *Data; + uint8_t Tag; + if (ReadBlock(Info.BlockBegin, &ChunkAddr, &Header, &Data, &Tag) && + Header.State != Chunk::State::Allocated && + Data[MemTagPrevTagIndex] == FaultAddrTag) { + auto *R = &ErrorInfo->reports[NextErrorReport++]; + R->error_type = USE_AFTER_FREE; + R->allocation_address = ChunkAddr; + R->allocation_size = Header.SizeOrUnusedBytes; + MaybeCollectTrace(R->allocation_trace, + Data[MemTagAllocationTraceIndex]); + R->allocation_tid = Data[MemTagAllocationTidIndex]; + MaybeCollectTrace(R->deallocation_trace, + Data[MemTagDeallocationTraceIndex]); + R->deallocation_tid = Data[MemTagDeallocationTidIndex]; + } + } + + auto CheckOOB = [&](uptr BlockAddr) { + if (BlockAddr < Info.RegionBegin || BlockAddr >= Info.RegionEnd) + return false; + + uptr ChunkAddr; + Chunk::UnpackedHeader Header; + const u32 *Data; + uint8_t Tag; + if (!ReadBlock(BlockAddr, &ChunkAddr, &Header, &Data, &Tag) || + Header.State != Chunk::State::Allocated || Tag != FaultAddrTag) + return false; + + auto *R = &ErrorInfo->reports[NextErrorReport++]; + R->error_type = + UntaggedFaultAddr < ChunkAddr ? BUFFER_UNDERFLOW : BUFFER_OVERFLOW; + R->allocation_address = ChunkAddr; + R->allocation_size = Header.SizeOrUnusedBytes; + MaybeCollectTrace(R->allocation_trace, Data[MemTagAllocationTraceIndex]); + R->allocation_tid = Data[MemTagAllocationTidIndex]; + return NextErrorReport == + sizeof(ErrorInfo->reports) / sizeof(ErrorInfo->reports[0]); + }; + + if (CheckOOB(Info.BlockBegin)) + return; + + // Check for OOB in the 30 surrounding blocks. Beyond that we are likely to + // hit false positives. + for (int I = 1; I != 16; ++I) + if (CheckOOB(Info.BlockBegin + I * Info.BlockSize) || + CheckOOB(Info.BlockBegin - I * Info.BlockSize)) + return; + } + private: using SecondaryT = typename Params::Secondary; typedef typename PrimaryT::SizeClassMap SizeClassMap; @@ -561,9 +877,32 @@ private: static_assert(MinAlignment >= sizeof(Chunk::PackedHeader), "Minimal alignment must at least cover a chunk header."); + static_assert(!PrimaryT::SupportsMemoryTagging || + MinAlignment >= archMemoryTagGranuleSize(), + ""); static const u32 BlockMarker = 0x44554353U; + // These are indexes into an "array" of 32-bit values that store information + // inline with a chunk that is relevant to diagnosing memory tag faults, where + // 0 corresponds to the address of the user memory. This means that negative + // indexes may be used to store information about allocations, while positive + // indexes may only be used to store information about deallocations, because + // the user memory is in use until it has been deallocated. The smallest index + // that may be used is -2, which corresponds to 8 bytes before the user + // memory, because the chunk header size is 8 bytes and in allocators that + // support memory tagging the minimum alignment is at least the tag granule + // size (16 on aarch64), and the largest index that may be used is 3 because + // we are only guaranteed to have at least a granule's worth of space in the + // user memory. 
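To make the index scheme described above concrete, here is a small stand-alone illustration of the 32-bit metadata slots addressed relative to the user pointer: negative indexes live in the 16-byte header granule, non-negative indexes reuse the first granule of the freed user memory. The buffer and the stored values are invented for the example.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // One 16-byte "header granule" followed by one 16-byte "user" granule.
  alignas(16) unsigned char Block[32] = {};
  unsigned char *UserPtr = Block + 16;
  auto *Ptr32 = reinterpret_cast<uint32_t *>(UserPtr);

  // While the chunk is live: allocation info goes just before the user memory.
  Ptr32[-2] = 0x11111111; // allocation stack trace hash
  Ptr32[-1] = 1234;       // allocation thread id

  // After deallocation: the first user granule is free to hold dealloc info.
  Ptr32[0] = 0x22222222;  // deallocation stack trace hash
  Ptr32[1] = 5678;        // deallocation thread id
  Ptr32[2] = 7;           // previous memory tag

  std::printf("alloc trace sits at byte offset %td from the user pointer\n",
              reinterpret_cast<unsigned char *>(&Ptr32[-2]) - UserPtr); // -8
  return 0;
}
```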
+ static const sptr MemTagAllocationTraceIndex = -2; + static const sptr MemTagAllocationTidIndex = -1; + static const sptr MemTagDeallocationTraceIndex = 0; + static const sptr MemTagDeallocationTidIndex = 1; + static const sptr MemTagPrevTagIndex = 2; + + static const uptr MaxTraceSize = 64; + GlobalStats Stats; TSDRegistryT TSDRegistry; PrimaryT Primary; @@ -574,12 +913,19 @@ private: struct { u8 MayReturnNull : 1; // may_return_null - u8 ZeroContents : 1; // zero_contents + FillContentsMode FillContents : 2; // zero_contents, pattern_fill_contents u8 DeallocTypeMismatch : 1; // dealloc_type_mismatch u8 DeleteSizeMismatch : 1; // delete_size_mismatch + u8 TrackAllocationStacks : 1; u32 QuarantineMaxChunkSize; // quarantine_max_chunk_size } Options; +#ifdef GWP_ASAN_HOOKS + gwp_asan::GuardedPoolAllocator GuardedAlloc; +#endif // GWP_ASAN_HOOKS + + StackDepot Depot; + // The following might get optimized out by the compiler. NOINLINE void performSanityChecks() { // Verify that the header offset field can hold the maximum offset. In the @@ -638,6 +984,14 @@ private: void quarantineOrDeallocateChunk(void *Ptr, Chunk::UnpackedHeader *Header, uptr Size) { Chunk::UnpackedHeader NewHeader = *Header; + if (UNLIKELY(NewHeader.ClassId && useMemoryTagging())) { + u8 PrevTag = extractTag(loadTag(reinterpret_cast<uptr>(Ptr))); + uptr TaggedBegin, TaggedEnd; + // Exclude the previous tag so that immediate use after free is detected + // 100% of the time. + setRandomTag(Ptr, Size, 1UL << PrevTag, &TaggedBegin, &TaggedEnd); + storeDeallocationStackMaybe(Ptr, PrevTag); + } // If the quarantine is disabled, the actual size of a chunk is 0 or larger // than the maximum allowed, we return a chunk directly to the backend. // Logical Or can be short-circuited, which introduces unnecessary @@ -672,13 +1026,39 @@ private: bool getChunkFromBlock(uptr Block, uptr *Chunk, Chunk::UnpackedHeader *Header) { - u32 Offset = 0; - if (reinterpret_cast<u32 *>(Block)[0] == BlockMarker) - Offset = reinterpret_cast<u32 *>(Block)[1]; - *Chunk = Block + Offset + Chunk::getHeaderSize(); + *Chunk = + Block + getChunkOffsetFromBlock(reinterpret_cast<const char *>(Block)); return Chunk::isValid(Cookie, reinterpret_cast<void *>(*Chunk), Header); } + static uptr getChunkOffsetFromBlock(const char *Block) { + u32 Offset = 0; + if (reinterpret_cast<const u32 *>(Block)[0] == BlockMarker) + Offset = reinterpret_cast<const u32 *>(Block)[1]; + return Offset + Chunk::getHeaderSize(); + } + + void storeAllocationStackMaybe(void *Ptr) { + if (!UNLIKELY(Options.TrackAllocationStacks)) + return; + auto *Ptr32 = reinterpret_cast<u32 *>(Ptr); + Ptr32[MemTagAllocationTraceIndex] = collectStackTrace(); + Ptr32[MemTagAllocationTidIndex] = getThreadID(); + } + + void storeDeallocationStackMaybe(void *Ptr, uint8_t PrevTag) { + if (!UNLIKELY(Options.TrackAllocationStacks)) + return; + + // Disable tag checks here so that we don't need to worry about zero sized + // allocations. 
+ ScopedDisableMemoryTagChecks x; + auto *Ptr32 = reinterpret_cast<u32 *>(Ptr); + Ptr32[MemTagDeallocationTraceIndex] = collectStackTrace(); + Ptr32[MemTagDeallocationTidIndex] = getThreadID(); + Ptr32[MemTagPrevTagIndex] = PrevTag; + } + uptr getStats(ScopedString *Str) { Primary.getStats(Str); Secondary.getStats(Str); diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h index a76eb6bbc164..9037f92b4976 100644 --- a/compiler-rt/lib/scudo/standalone/common.h +++ b/compiler-rt/lib/scudo/standalone/common.h @@ -126,12 +126,15 @@ inline uptr getPageSizeCached() { return getPageSizeSlow(); } +// Returns 0 if the number of CPUs could not be determined. u32 getNumberOfCPUs(); const char *getEnv(const char *Name); u64 getMonotonicTime(); +u32 getThreadID(); + // Our randomness gathering function is limited to 256 bytes to ensure we get // as many bytes as requested, and avoid interruptions (on Linux). constexpr uptr MaxRandomLength = 256U; @@ -142,6 +145,7 @@ bool getRandom(void *Buffer, uptr Length, bool Blocking = false); #define MAP_ALLOWNOMEM (1U << 0) #define MAP_NOACCESS (1U << 1) #define MAP_RESIZABLE (1U << 2) +#define MAP_MEMTAG (1U << 3) // Our platform memory mapping use is restricted to 3 scenarios: // - reserve memory at a random address (MAP_NOACCESS); @@ -171,6 +175,22 @@ void NORETURN dieOnMapUnmapError(bool OutOfMemory = false); void setAbortMessage(const char *Message); +struct BlockInfo { + uptr BlockBegin; + uptr BlockSize; + uptr RegionBegin; + uptr RegionEnd; +}; + +constexpr unsigned char PatternFillByte = 0xAB; + +enum FillContentsMode { + NoFill = 0, + ZeroFill = 1, + PatternOrZeroFill = 2 // Pattern fill unless the memory is known to be + // zero-initialized already. +}; + } // namespace scudo #endif // SCUDO_COMMON_H_ diff --git a/compiler-rt/lib/scudo/standalone/flags.cpp b/compiler-rt/lib/scudo/standalone/flags.cpp index dd9f050a2d20..de5153b288b1 100644 --- a/compiler-rt/lib/scudo/standalone/flags.cpp +++ b/compiler-rt/lib/scudo/standalone/flags.cpp @@ -9,7 +9,8 @@ #include "flags.h" #include "common.h" #include "flags_parser.h" -#include "interface.h" + +#include "scudo/interface.h" namespace scudo { diff --git a/compiler-rt/lib/scudo/standalone/flags.inc b/compiler-rt/lib/scudo/standalone/flags.inc index 25b86e14fa94..b5cab4734166 100644 --- a/compiler-rt/lib/scudo/standalone/flags.inc +++ b/compiler-rt/lib/scudo/standalone/flags.inc @@ -34,6 +34,9 @@ SCUDO_FLAG(bool, delete_size_mismatch, true, SCUDO_FLAG(bool, zero_contents, false, "Zero chunk contents on allocation.") +SCUDO_FLAG(bool, pattern_fill_contents, false, + "Pattern fill chunk contents on allocation.") + SCUDO_FLAG(int, rss_limit_mb, -1, "Enforce an upper limit (in megabytes) to the process RSS. The " "allocator will terminate or return NULL when allocations are " @@ -45,6 +48,6 @@ SCUDO_FLAG(bool, may_return_null, true, "returning NULL in otherwise non-fatal error scenarios, eg: OOM, " "invalid allocation alignments, etc.") -SCUDO_FLAG(int, release_to_os_interval_ms, 5000, +SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000, "Interval (in milliseconds) at which to attempt release of unused " "memory to the OS. 
Negative values disable the feature.") diff --git a/compiler-rt/lib/scudo/standalone/fuchsia.cpp b/compiler-rt/lib/scudo/standalone/fuchsia.cpp index b3d72de158cf..d4ea33277941 100644 --- a/compiler-rt/lib/scudo/standalone/fuchsia.cpp +++ b/compiler-rt/lib/scudo/standalone/fuchsia.cpp @@ -170,6 +170,8 @@ u64 getMonotonicTime() { return _zx_clock_get_monotonic(); } u32 getNumberOfCPUs() { return _zx_system_get_num_cpus(); } +u32 getThreadID() { return 0; } + bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) { static_assert(MaxRandomLength <= ZX_CPRNG_DRAW_MAX_LEN, ""); if (UNLIKELY(!Buffer || !Length || Length > MaxRandomLength)) diff --git a/compiler-rt/lib/scudo/standalone/fuzz/get_error_info_fuzzer.cpp b/compiler-rt/lib/scudo/standalone/fuzz/get_error_info_fuzzer.cpp new file mode 100644 index 000000000000..d29f515215e6 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/fuzz/get_error_info_fuzzer.cpp @@ -0,0 +1,48 @@ +//===-- get_error_info_fuzzer.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define SCUDO_FUZZ +#include "allocator_config.h" +#include "combined.h" + +#include <fuzzer/FuzzedDataProvider.h> + +#include <string> +#include <vector> + +extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) { + using AllocatorT = scudo::Allocator<scudo::AndroidConfig>; + FuzzedDataProvider FDP(Data, Size); + + uintptr_t FaultAddr = FDP.ConsumeIntegral<uintptr_t>(); + uintptr_t MemoryAddr = FDP.ConsumeIntegral<uintptr_t>(); + + std::string MemoryAndTags = FDP.ConsumeRandomLengthString(FDP.remaining_bytes()); + const char *Memory = MemoryAndTags.c_str(); + // Assume 16-byte alignment. + size_t MemorySize = (MemoryAndTags.length() / 17) * 16; + const char *MemoryTags = Memory + MemorySize; + + std::string StackDepotBytes = FDP.ConsumeRandomLengthString(FDP.remaining_bytes()); + std::vector<char> StackDepot(sizeof(scudo::StackDepot), 0); + for (size_t i = 0; i < StackDepotBytes.length() && i < StackDepot.size(); ++i) { + StackDepot[i] = StackDepotBytes[i]; + } + + std::string RegionInfoBytes = FDP.ConsumeRemainingBytesAsString(); + std::vector<char> RegionInfo(AllocatorT::getRegionInfoArraySize(), 0); + for (size_t i = 0; i < RegionInfoBytes.length() && i < RegionInfo.size(); ++i) { + RegionInfo[i] = RegionInfoBytes[i]; + } + + scudo_error_info ErrorInfo; + AllocatorT::getErrorInfo(&ErrorInfo, FaultAddr, StackDepot.data(), + RegionInfo.data(), Memory, MemoryTags, MemoryAddr, + MemorySize); + return 0; +} diff --git a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h new file mode 100644 index 000000000000..d30fb6514a14 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h @@ -0,0 +1,110 @@ +//===-- scudo/interface.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_INTERFACE_H_ +#define SCUDO_INTERFACE_H_ + +#include <stddef.h> + +extern "C" { + +__attribute__((weak)) const char *__scudo_default_options(); + +// Post-allocation & pre-deallocation hooks. +// They must be thread-safe and not use heap related functions. +__attribute__((weak)) void __scudo_allocate_hook(void *ptr, size_t size); +__attribute__((weak)) void __scudo_deallocate_hook(void *ptr); + +void __scudo_print_stats(void); + +typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg); + +// Determine the likely cause of a tag check fault or other memory protection +// error on a system with memory tagging support. The results are returned via +// the error_info data structure. Up to three possible causes are returned in +// the reports array, in decreasing order of probability. The remaining elements +// of reports are zero-initialized. +// +// This function may be called from a different process from the one that +// crashed. In this case, various data structures must be copied from the +// crashing process to the process that analyzes the crash. +// +// This interface is not guaranteed to be stable and may change at any time. +// Furthermore, the version of scudo in the crashing process must be the same as +// the version in the process that analyzes the crash. +// +// fault_addr is the fault address. On aarch64 this is available in the system +// register FAR_ELx, or far_context.far in an upcoming release of the Linux +// kernel. This address must include the pointer tag; note that the kernel +// strips the tag from the fields siginfo.si_addr and sigcontext.fault_address, +// so these addresses are not suitable to be passed as fault_addr. +// +// stack_depot is a pointer to the stack depot data structure, which may be +// obtained by calling the function __scudo_get_stack_depot_addr() in the +// crashing process. The size of the stack depot is available by calling the +// function __scudo_get_stack_depot_size(). +// +// region_info is a pointer to the region info data structure, which may be +// obtained by calling the function __scudo_get_region_info_addr() in the +// crashing process. The size of the region info is available by calling the +// function __scudo_get_region_info_size(). +// +// memory is a pointer to a region of memory surrounding the fault address. +// The more memory available via this pointer, the more likely it is that the +// function will be able to analyze a crash correctly. It is recommended to +// provide an amount of memory equal to 16 * the primary allocator's largest +// size class either side of the fault address. +// +// memory_tags is a pointer to an array of memory tags for the memory accessed +// via memory. Each byte of this array corresponds to a region of memory of size +// equal to the architecturally defined memory tag granule size (16 on aarch64). +// +// memory_addr is the start address of memory in the crashing process's address +// space. +// +// memory_size is the size of the memory region referred to by the memory +// pointer. 
+void __scudo_get_error_info(struct scudo_error_info *error_info, + uintptr_t fault_addr, const char *stack_depot, + const char *region_info, const char *memory, + const char *memory_tags, uintptr_t memory_addr, + size_t memory_size); + +enum scudo_error_type { + UNKNOWN, + USE_AFTER_FREE, + BUFFER_OVERFLOW, + BUFFER_UNDERFLOW, +}; + +struct scudo_error_report { + enum scudo_error_type error_type; + + uintptr_t allocation_address; + uintptr_t allocation_size; + + uint32_t allocation_tid; + uintptr_t allocation_trace[64]; + + uint32_t deallocation_tid; + uintptr_t deallocation_trace[64]; +}; + +struct scudo_error_info { + struct scudo_error_report reports[3]; +}; + +const char *__scudo_get_stack_depot_addr(); +size_t __scudo_get_stack_depot_size(); + +const char *__scudo_get_region_info_addr(); +size_t __scudo_get_region_info_size(); + +} // extern "C" + +#endif // SCUDO_INTERFACE_H_ diff --git a/compiler-rt/lib/scudo/standalone/interface.h b/compiler-rt/lib/scudo/standalone/interface.h deleted file mode 100644 index e2639823f426..000000000000 --- a/compiler-rt/lib/scudo/standalone/interface.h +++ /dev/null @@ -1,29 +0,0 @@ -//===-- interface.h ---------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef SCUDO_INTERFACE_H_ -#define SCUDO_INTERFACE_H_ - -#include "internal_defs.h" - -extern "C" { - -WEAK INTERFACE const char *__scudo_default_options(); - -// Post-allocation & pre-deallocation hooks. -// They must be thread-safe and not use heap related functions. -WEAK INTERFACE void __scudo_allocate_hook(void *ptr, size_t size); -WEAK INTERFACE void __scudo_deallocate_hook(void *ptr); - -WEAK INTERFACE void __scudo_print_stats(void); - -typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg); - -} // extern "C" - -#endif // SCUDO_INTERFACE_H_ diff --git a/compiler-rt/lib/scudo/standalone/internal_defs.h b/compiler-rt/lib/scudo/standalone/internal_defs.h index 8f6a89ecba73..a884f1f3a40e 100644 --- a/compiler-rt/lib/scudo/standalone/internal_defs.h +++ b/compiler-rt/lib/scudo/standalone/internal_defs.h @@ -29,12 +29,10 @@ // Attributes & builtins related macros. #define INTERFACE __attribute__((visibility("default"))) +#define HIDDEN __attribute__((visibility("hidden"))) #define WEAK __attribute__((weak)) #define ALWAYS_INLINE inline __attribute__((always_inline)) #define ALIAS(X) __attribute__((alias(X))) -// Please only use the ALIGNED macro before the type. Using ALIGNED after the -// variable declaration is not portable. 
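The removed ALIGNED macro wrapped the GCC-specific `__attribute__((aligned(X)))`, which is position-sensitive; the diff replaces its uses with standard `alignas`, as in the atomic_u64 and SizeClassInfo changes. A short demonstration of the portable form:

```cpp
#include <cstdint>
#include <iostream>

struct AtomicU64Like {
  // Guarantees 8-byte alignment even on 32-bit targets where uint64_t may
  // otherwise only be 4-byte aligned.
  alignas(8) volatile uint64_t ValDoNotUse;
};

struct alignas(64) CacheLinePadded { // 64 stands in for SCUDO_CACHE_LINE_SIZE
  char Data[64];
};

int main() {
  std::cout << alignof(AtomicU64Like) << ' ' << alignof(CacheLinePadded)
            << '\n'; // prints: 8 64
}
```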
-#define ALIGNED(X) __attribute__((aligned(X))) #define FORMAT(F, A) __attribute__((format(printf, F, A))) #define NOINLINE __attribute__((noinline)) #define NORETURN __attribute__((noreturn)) diff --git a/compiler-rt/lib/scudo/standalone/linux.cpp b/compiler-rt/lib/scudo/standalone/linux.cpp index 8266a528f42c..69ffdd9a165b 100644 --- a/compiler-rt/lib/scudo/standalone/linux.cpp +++ b/compiler-rt/lib/scudo/standalone/linux.cpp @@ -35,6 +35,10 @@ #define ANDROID_PR_SET_VMA_ANON_NAME 0 #endif +#ifdef ANDROID_EXPERIMENTAL_MTE +#include <bionic/mte_kernel.h> +#endif + namespace scudo { uptr getPageSize() { return static_cast<uptr>(sysconf(_SC_PAGESIZE)); } @@ -50,6 +54,10 @@ void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags, MmapProt = PROT_NONE; } else { MmapProt = PROT_READ | PROT_WRITE; +#if defined(__aarch64__) && defined(ANDROID_EXPERIMENTAL_MTE) + if (Flags & MAP_MEMTAG) + MmapProt |= PROT_MTE; +#endif } if (Addr) { // Currently no scenario for a noaccess mapping with a fixed address. @@ -124,10 +132,21 @@ u64 getMonotonicTime() { u32 getNumberOfCPUs() { cpu_set_t CPUs; - CHECK_EQ(sched_getaffinity(0, sizeof(cpu_set_t), &CPUs), 0); + // sched_getaffinity can fail for a variety of legitimate reasons (lack of + // CAP_SYS_NICE, syscall filtering, etc), in which case we shall return 0. + if (sched_getaffinity(0, sizeof(cpu_set_t), &CPUs) != 0) + return 0; return static_cast<u32>(CPU_COUNT(&CPUs)); } +u32 getThreadID() { +#if SCUDO_ANDROID + return static_cast<u32>(gettid()); +#else + return static_cast<u32>(syscall(SYS_gettid)); +#endif +} + // Blocking is possibly unused if the getrandom block is not compiled in. bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) { if (!Buffer || !Length || Length > MaxRandomLength) @@ -153,10 +172,34 @@ bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) { return (ReadBytes == static_cast<ssize_t>(Length)); } +// Allocation free syslog-like API. +extern "C" WEAK int async_safe_write_log(int pri, const char *tag, + const char *msg); + void outputRaw(const char *Buffer) { - static HybridMutex Mutex; - ScopedLock L(Mutex); - write(2, Buffer, strlen(Buffer)); + if (&async_safe_write_log) { + constexpr s32 AndroidLogInfo = 4; + constexpr uptr MaxLength = 1024U; + char LocalBuffer[MaxLength]; + while (strlen(Buffer) > MaxLength) { + uptr P; + for (P = MaxLength - 1; P > 0; P--) { + if (Buffer[P] == '\n') { + memcpy(LocalBuffer, Buffer, P); + LocalBuffer[P] = '\0'; + async_safe_write_log(AndroidLogInfo, "scudo", LocalBuffer); + Buffer = &Buffer[P + 1]; + break; + } + } + // If no newline was found, just log the buffer. 
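The new outputRaw path above splits long messages at newline boundaries so each piece fits the Android logger's per-call buffer. A self-contained sketch of that splitting loop, with printf and a 32-byte limit standing in for async_safe_write_log and its real limit:

```cpp
#include <cstdio>
#include <cstring>

constexpr size_t MaxLength = 32; // stand-in for the 1024-byte logger limit

void logChunk(const char *Msg) { std::printf("[log] %s\n", Msg); }

void outputRawSketch(const char *Buffer) {
  char Local[MaxLength];
  while (std::strlen(Buffer) > MaxLength) {
    size_t P;
    // Find the last newline that fits in the window and emit up to it.
    for (P = MaxLength - 1; P > 0; P--) {
      if (Buffer[P] == '\n') {
        std::memcpy(Local, Buffer, P);
        Local[P] = '\0';
        logChunk(Local);
        Buffer = &Buffer[P + 1]; // continue after the newline
        break;
      }
    }
    if (P == 0) // no newline found in the window: give up splitting
      break;
  }
  logChunk(Buffer); // the remainder (or the whole unsplittable buffer)
}

int main() {
  outputRawSketch("first line of a long report\nsecond line\nthird line\n");
}
```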
+ if (P == 0) + break; + } + async_safe_write_log(AndroidLogInfo, "scudo", Buffer); + } else { + write(2, Buffer, strlen(Buffer)); + } } extern "C" WEAK void android_set_abort_message(const char *); diff --git a/compiler-rt/lib/scudo/standalone/local_cache.h b/compiler-rt/lib/scudo/standalone/local_cache.h index b08abd3e5d9b..a6425fc6d1ea 100644 --- a/compiler-rt/lib/scudo/standalone/local_cache.h +++ b/compiler-rt/lib/scudo/standalone/local_cache.h @@ -165,13 +165,14 @@ private: NOINLINE void drain(PerClass *C, uptr ClassId) { const u32 Count = Min(C->MaxCount / 2, C->Count); - const uptr FirstIndexToDrain = C->Count - Count; - TransferBatch *B = createBatch(ClassId, C->Chunks[FirstIndexToDrain]); + TransferBatch *B = createBatch(ClassId, C->Chunks[0]); if (UNLIKELY(!B)) reportOutOfMemory( SizeClassAllocator::getSizeByClassId(SizeClassMap::BatchClassId)); - B->setFromArray(&C->Chunks[FirstIndexToDrain], Count); + B->setFromArray(&C->Chunks[0], Count); C->Count -= Count; + for (uptr I = 0; I < C->Count; I++) + C->Chunks[I] = C->Chunks[I + Count]; Allocator->pushBatch(ClassId, B); } }; diff --git a/compiler-rt/lib/scudo/standalone/memtag.h b/compiler-rt/lib/scudo/standalone/memtag.h new file mode 100644 index 000000000000..6f347f4694e8 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/memtag.h @@ -0,0 +1,261 @@ +//===-- memtag.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_MEMTAG_H_ +#define SCUDO_MEMTAG_H_ + +#include "internal_defs.h" + +#if SCUDO_LINUX +#include <sys/auxv.h> +#include <sys/prctl.h> +#if defined(ANDROID_EXPERIMENTAL_MTE) +#include <bionic/mte_kernel.h> +#endif +#endif + +namespace scudo { + +#if defined(__aarch64__) || defined(SCUDO_FUZZ) + +inline constexpr bool archSupportsMemoryTagging() { return true; } +inline constexpr uptr archMemoryTagGranuleSize() { return 16; } + +inline uptr untagPointer(uptr Ptr) { return Ptr & ((1ULL << 56) - 1); } + +inline uint8_t extractTag(uptr Ptr) { + return (Ptr >> 56) & 0xf; +} + +#else + +inline constexpr bool archSupportsMemoryTagging() { return false; } + +inline uptr archMemoryTagGranuleSize() { + UNREACHABLE("memory tagging not supported"); +} + +inline uptr untagPointer(uptr Ptr) { + (void)Ptr; + UNREACHABLE("memory tagging not supported"); +} + +inline uint8_t extractTag(uptr Ptr) { + (void)Ptr; + UNREACHABLE("memory tagging not supported"); +} + +#endif + +#if defined(__aarch64__) + +inline bool systemSupportsMemoryTagging() { +#if defined(ANDROID_EXPERIMENTAL_MTE) + return getauxval(AT_HWCAP2) & HWCAP2_MTE; +#else + return false; +#endif +} + +inline bool systemDetectsMemoryTagFaultsTestOnly() { +#if defined(ANDROID_EXPERIMENTAL_MTE) + return (prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0) & PR_MTE_TCF_MASK) != + PR_MTE_TCF_NONE; +#else + return false; +#endif +} + +inline void disableMemoryTagChecksTestOnly() { + __asm__ __volatile__(".arch_extension mte; msr tco, #1"); +} + +inline void enableMemoryTagChecksTestOnly() { + __asm__ __volatile__(".arch_extension mte; msr tco, #0"); +} + +class ScopedDisableMemoryTagChecks { + size_t PrevTCO; + + public: + ScopedDisableMemoryTagChecks() { + __asm__ __volatile__(".arch_extension mte; mrs %0, tco; msr tco, #1" + : "=r"(PrevTCO)); + } + + 
~ScopedDisableMemoryTagChecks() { + __asm__ __volatile__(".arch_extension mte; msr tco, %0" : : "r"(PrevTCO)); + } +}; + +inline void setRandomTag(void *Ptr, uptr Size, uptr ExcludeMask, + uptr *TaggedBegin, uptr *TaggedEnd) { + void *End; + __asm__ __volatile__( + R"( + .arch_extension mte + + // Set a random tag for Ptr in TaggedPtr. This needs to happen even if + // Size = 0 so that TaggedPtr ends up pointing at a valid address. + irg %[TaggedPtr], %[Ptr], %[ExcludeMask] + mov %[Cur], %[TaggedPtr] + + // Skip the loop if Size = 0. We don't want to do any tagging in this case. + cbz %[Size], 2f + + // Set the memory tag of the region + // [TaggedPtr, TaggedPtr + roundUpTo(Size, 16)) + // to the pointer tag stored in TaggedPtr. + add %[End], %[TaggedPtr], %[Size] + + 1: + stzg %[Cur], [%[Cur]], #16 + cmp %[Cur], %[End] + b.lt 1b + + 2: + )" + : + [TaggedPtr] "=&r"(*TaggedBegin), [Cur] "=&r"(*TaggedEnd), [End] "=&r"(End) + : [Ptr] "r"(Ptr), [Size] "r"(Size), [ExcludeMask] "r"(ExcludeMask) + : "memory"); +} + +inline void *prepareTaggedChunk(void *Ptr, uptr Size, uptr BlockEnd) { + // Prepare the granule before the chunk to store the chunk header by setting + // its tag to 0. Normally its tag will already be 0, but in the case where a + // chunk holding a low alignment allocation is reused for a higher alignment + // allocation, the chunk may already have a non-zero tag from the previous + // allocation. + __asm__ __volatile__(".arch_extension mte; stg %0, [%0, #-16]" + : + : "r"(Ptr) + : "memory"); + + uptr TaggedBegin, TaggedEnd; + setRandomTag(Ptr, Size, 0, &TaggedBegin, &TaggedEnd); + + // Finally, set the tag of the granule past the end of the allocation to 0, + // to catch linear overflows even if a previous larger allocation used the + // same block and tag. Only do this if the granule past the end is in our + // block, because this would otherwise lead to a SEGV if the allocation + // covers the entire block and our block is at the end of a mapping. The tag + // of the next block's header granule will be set to 0, so it will serve the + // purpose of catching linear overflows in this case. + uptr UntaggedEnd = untagPointer(TaggedEnd); + if (UntaggedEnd != BlockEnd) + __asm__ __volatile__(".arch_extension mte; stg %0, [%0]" + : + : "r"(UntaggedEnd) + : "memory"); + return reinterpret_cast<void *>(TaggedBegin); +} + +inline void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr BlockEnd) { + uptr RoundOldPtr = roundUpTo(OldPtr, 16); + if (RoundOldPtr >= NewPtr) { + // If the allocation is shrinking we just need to set the tag past the end + // of the allocation to 0. See explanation in prepareTaggedChunk above. + uptr RoundNewPtr = untagPointer(roundUpTo(NewPtr, 16)); + if (RoundNewPtr != BlockEnd) + __asm__ __volatile__(".arch_extension mte; stg %0, [%0]" + : + : "r"(RoundNewPtr) + : "memory"); + return; + } + + __asm__ __volatile__(R"( + .arch_extension mte + + // Set the memory tag of the region + // [roundUpTo(OldPtr, 16), roundUpTo(NewPtr, 16)) + // to the pointer tag stored in OldPtr. + 1: + stzg %[Cur], [%[Cur]], #16 + cmp %[Cur], %[End] + b.lt 1b + + // Finally, set the tag of the granule past the end of the allocation to 0. 
+ and %[Cur], %[Cur], #(1 << 56) - 1 + cmp %[Cur], %[BlockEnd] + b.eq 2f + stg %[Cur], [%[Cur]] + + 2: + )" + : [ Cur ] "+&r"(RoundOldPtr), [ End ] "+&r"(NewPtr) + : [ BlockEnd ] "r"(BlockEnd) + : "memory"); +} + +inline uptr loadTag(uptr Ptr) { + uptr TaggedPtr = Ptr; + __asm__ __volatile__(".arch_extension mte; ldg %0, [%0]" + : "+r"(TaggedPtr) + : + : "memory"); + return TaggedPtr; +} + +#else + +inline bool systemSupportsMemoryTagging() { + UNREACHABLE("memory tagging not supported"); +} + +inline bool systemDetectsMemoryTagFaultsTestOnly() { + UNREACHABLE("memory tagging not supported"); +} + +inline void disableMemoryTagChecksTestOnly() { + UNREACHABLE("memory tagging not supported"); +} + +inline void enableMemoryTagChecksTestOnly() { + UNREACHABLE("memory tagging not supported"); +} + +struct ScopedDisableMemoryTagChecks { + ScopedDisableMemoryTagChecks() {} +}; + +inline void setRandomTag(void *Ptr, uptr Size, uptr ExcludeMask, + uptr *TaggedBegin, uptr *TaggedEnd) { + (void)Ptr; + (void)Size; + (void)ExcludeMask; + (void)TaggedBegin; + (void)TaggedEnd; + UNREACHABLE("memory tagging not supported"); +} + +inline void *prepareTaggedChunk(void *Ptr, uptr Size, uptr BlockEnd) { + (void)Ptr; + (void)Size; + (void)BlockEnd; + UNREACHABLE("memory tagging not supported"); +} + +inline void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr BlockEnd) { + (void)OldPtr; + (void)NewPtr; + (void)BlockEnd; + UNREACHABLE("memory tagging not supported"); +} + +inline uptr loadTag(uptr Ptr) { + (void)Ptr; + UNREACHABLE("memory tagging not supported"); +} + +#endif + +} // namespace scudo + +#endif diff --git a/compiler-rt/lib/scudo/standalone/mutex.h b/compiler-rt/lib/scudo/standalone/mutex.h index b26b2df06627..d6e6a5b33aae 100644 --- a/compiler-rt/lib/scudo/standalone/mutex.h +++ b/compiler-rt/lib/scudo/standalone/mutex.h @@ -22,7 +22,7 @@ namespace scudo { class HybridMutex { public: - void init() { memset(this, 0, sizeof(*this)); } + void init() { M = {}; } bool tryLock(); NOINLINE void lock() { if (LIKELY(tryLock())) diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index e296a78778e0..29a268098185 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -38,14 +38,23 @@ namespace scudo { // Memory used by this allocator is never unmapped but can be partially // reclaimed if the platform allows for it. -template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 { +template <class SizeClassMapT, uptr RegionSizeLog, + s32 MinReleaseToOsIntervalMs = INT32_MIN, + s32 MaxReleaseToOsIntervalMs = INT32_MAX> +class SizeClassAllocator32 { public: typedef SizeClassMapT SizeClassMap; + // The bytemap can only track UINT8_MAX - 1 classes. + static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), ""); // Regions should be large enough to hold the largest Block. static_assert((1UL << RegionSizeLog) >= SizeClassMap::MaxSize, ""); - typedef SizeClassAllocator32<SizeClassMapT, RegionSizeLog> ThisT; + typedef SizeClassAllocator32<SizeClassMapT, RegionSizeLog, + MinReleaseToOsIntervalMs, + MaxReleaseToOsIntervalMs> + ThisT; typedef SizeClassAllocatorLocalCache<ThisT> CacheT; typedef typename CacheT::TransferBatch TransferBatch; + static const bool SupportsMemoryTagging = false; static uptr getSizeByClassId(uptr ClassId) { return (ClassId == SizeClassMap::BatchClassId) @@ -63,20 +72,21 @@ public: MinRegionIndex = NumRegions; // MaxRegionIndex is already initialized to 0. 
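Stepping back to the memtag.h helpers added above: untagPointer and extractTag treat bits 56-59 of an AArch64 pointer as the memory tag. A portable illustration of that bit layout, using uint64_t instead of uptr so it also builds on 32-bit hosts; the address value is arbitrary.

```cpp
#include <cstdint>
#include <cstdio>

using uptr = uint64_t; // stand-in for scudo::uptr, fixed at 64 bits here

constexpr uptr untagPointerSketch(uptr Ptr) { return Ptr & ((1ULL << 56) - 1); }
constexpr uint8_t extractTagSketch(uptr Ptr) { return (Ptr >> 56) & 0xf; }
constexpr uptr addTagSketch(uptr Ptr, uint8_t Tag) {
  return untagPointerSketch(Ptr) | (static_cast<uptr>(Tag) << 56);
}

int main() {
  const uptr Untagged = 0x0000007fdeadbee0ULL;
  const uptr Tagged = addTagSketch(Untagged, 0xA);
  std::printf("tag=%x untagged=%llx\n", extractTagSketch(Tagged),
              static_cast<unsigned long long>(untagPointerSketch(Tagged)));
  // prints: tag=a untagged=7fdeadbee0
}
```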
u32 Seed; + const u64 Time = getMonotonicTime(); if (UNLIKELY(!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))) - Seed = - static_cast<u32>(getMonotonicTime() ^ - (reinterpret_cast<uptr>(SizeClassInfoArray) >> 6)); + Seed = static_cast<u32>( + Time ^ (reinterpret_cast<uptr>(SizeClassInfoArray) >> 6)); const uptr PageSize = getPageSizeCached(); for (uptr I = 0; I < NumClasses; I++) { SizeClassInfo *Sci = getSizeClassInfo(I); Sci->RandState = getRandomU32(&Seed); // See comment in the 64-bit primary about releasing smaller size classes. - Sci->CanRelease = (ReleaseToOsInterval >= 0) && - (I != SizeClassMap::BatchClassId) && + Sci->CanRelease = (I != SizeClassMap::BatchClassId) && (getSizeByClassId(I) >= (PageSize / 32)); + if (Sci->CanRelease) + Sci->ReleaseInfo.LastReleaseAtNs = Time; } - ReleaseToOsIntervalMs = ReleaseToOsInterval; + setReleaseToOsIntervalMs(ReleaseToOsInterval); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -87,8 +97,7 @@ public: while (NumberOfStashedRegions > 0) unmap(reinterpret_cast<void *>(RegionsStash[--NumberOfStashedRegions]), RegionSize); - // TODO(kostyak): unmap the TransferBatch regions as well. - for (uptr I = 0; I < NumRegions; I++) + for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) if (PossibleRegions[I]) unmap(reinterpret_cast<void *>(I * RegionSize), RegionSize); PossibleRegions.unmapTestOnly(); @@ -147,8 +156,9 @@ public: template <typename F> void iterateOverBlocks(F Callback) { for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) - if (PossibleRegions[I]) { - const uptr BlockSize = getSizeByClassId(PossibleRegions[I]); + if (PossibleRegions[I] && + (PossibleRegions[I] - 1U) != SizeClassMap::BatchClassId) { + const uptr BlockSize = getSizeByClassId(PossibleRegions[I] - 1U); const uptr From = I * RegionSize; const uptr To = From + (RegionSize / BlockSize) * BlockSize; for (uptr Block = From; Block < To; Block += BlockSize) @@ -174,11 +184,18 @@ public: getStats(Str, I, 0); } + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + uptr releaseToOS() { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { - if (I == SizeClassMap::BatchClassId) - continue; SizeClassInfo *Sci = getSizeClassInfo(I); ScopedLock L(Sci->Mutex); TotalReleasedBytes += releaseToOSMaybe(Sci, I, /*Force=*/true); @@ -186,15 +203,24 @@ public: return TotalReleasedBytes; } + bool useMemoryTagging() { return false; } + void disableMemoryTagging() {} + + const char *getRegionInfoArrayAddress() const { return nullptr; } + static uptr getRegionInfoArraySize() { return 0; } + + static BlockInfo findNearestBlock(const char *RegionInfoData, uptr Ptr) { + (void)RegionInfoData; + (void)Ptr; + return {}; + } + private: static const uptr NumClasses = SizeClassMap::NumClasses; static const uptr RegionSize = 1UL << RegionSizeLog; static const uptr NumRegions = SCUDO_MMAP_RANGE_SIZE >> RegionSizeLog; -#if SCUDO_WORDSIZE == 32U + static const u32 MaxNumBatches = SCUDO_ANDROID ? 
4U : 8U; typedef FlatByteMap<NumRegions> ByteMap; -#else - typedef TwoLevelByteMap<(NumRegions >> 12), 1UL << 12> ByteMap; -#endif struct SizeClassStats { uptr PoppedBlocks; @@ -208,9 +234,11 @@ private: u64 LastReleaseAtNs; }; - struct ALIGNED(SCUDO_CACHE_LINE_SIZE) SizeClassInfo { + struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo { HybridMutex Mutex; SinglyLinkedList<TransferBatch> FreeList; + uptr CurrentRegion; + uptr CurrentRegionAllocated; SizeClassStats Stats; bool CanRelease; u32 RandState; @@ -261,14 +289,12 @@ private: if (!Region) Region = allocateRegionSlow(); if (LIKELY(Region)) { - if (ClassId) { - const uptr RegionIndex = computeRegionId(Region); - if (RegionIndex < MinRegionIndex) - MinRegionIndex = RegionIndex; - if (RegionIndex > MaxRegionIndex) - MaxRegionIndex = RegionIndex; - PossibleRegions.set(RegionIndex, static_cast<u8>(ClassId)); - } + const uptr RegionIndex = computeRegionId(Region); + if (RegionIndex < MinRegionIndex) + MinRegionIndex = RegionIndex; + if (RegionIndex > MaxRegionIndex) + MaxRegionIndex = RegionIndex; + PossibleRegions.set(RegionIndex, static_cast<u8>(ClassId + 1U)); } return Region; } @@ -303,21 +329,50 @@ private: NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) { - const uptr Region = allocateRegion(ClassId); - if (UNLIKELY(!Region)) - return nullptr; - C->getStats().add(StatMapped, RegionSize); + uptr Region; + uptr Offset; + // If the size-class currently has a region associated to it, use it. The + // newly created blocks will be located after the currently allocated memory + // for that region (up to RegionSize). Otherwise, create a new region, where + // the new blocks will be carved from the beginning. + if (Sci->CurrentRegion) { + Region = Sci->CurrentRegion; + DCHECK_GT(Sci->CurrentRegionAllocated, 0U); + Offset = Sci->CurrentRegionAllocated; + } else { + DCHECK_EQ(Sci->CurrentRegionAllocated, 0U); + Region = allocateRegion(ClassId); + if (UNLIKELY(!Region)) + return nullptr; + C->getStats().add(StatMapped, RegionSize); + Sci->CurrentRegion = Region; + Offset = 0; + } + const uptr Size = getSizeByClassId(ClassId); const u32 MaxCount = TransferBatch::getMaxCached(Size); - DCHECK_GT(MaxCount, 0); - const uptr NumberOfBlocks = RegionSize / Size; - DCHECK_GT(NumberOfBlocks, 0); + DCHECK_GT(MaxCount, 0U); + // The maximum number of blocks we should carve in the region is dictated + // by the maximum number of batches we want to fill, and the amount of + // memory left in the current region (we use the lowest of the two). This + // will not be 0 as we ensure that a region can at least hold one block (via + // static_assert and at the end of this function). + const u32 NumberOfBlocks = + Min(MaxNumBatches * MaxCount, + static_cast<u32>((RegionSize - Offset) / Size)); + DCHECK_GT(NumberOfBlocks, 0U); + TransferBatch *B = nullptr; - constexpr u32 ShuffleArraySize = 8U * TransferBatch::MaxNumCached; + constexpr u32 ShuffleArraySize = + MaxNumBatches * TransferBatch::MaxNumCached; + // Fill the transfer batches and put them in the size-class freelist. We + // need to randomize the blocks for security purposes, so we first fill a + // local array that we then shuffle before populating the batches. 
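For illustration, a rough standalone sketch of the fill/shuffle/batch scheme described in the comment above; std::shuffle stands in for scudo's own shuffle() and all addresses and sizes are made up:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
      // Hypothetical region: 16 blocks of 64 bytes carved from a base address.
      const uintptr_t Region = 0x100000;
      const uintptr_t BlockSize = 64;
      const uint32_t NumberOfBlocks = 16;
      const uint32_t MaxCachedPerBatch = 5;

      std::vector<uintptr_t> ShuffleArray;
      for (uint32_t I = 0; I < NumberOfBlocks; I++)
        ShuffleArray.push_back(Region + I * BlockSize);

      // Shuffle so that blocks handed out back-to-back are not adjacent in
      // memory (scudo seeds its own shuffle() from getRandomU32()).
      std::mt19937 Rng(12345);
      std::shuffle(ShuffleArray.begin(), ShuffleArray.end(), Rng);

      // Split the shuffled pointers into transfer batches of bounded size.
      for (size_t I = 0; I < ShuffleArray.size(); I += MaxCachedPerBatch) {
        const size_t End =
            std::min<size_t>(I + MaxCachedPerBatch, ShuffleArray.size());
        std::printf("batch:");
        for (size_t J = I; J < End; J++)
          std::printf(" %#zx", static_cast<size_t>(ShuffleArray[J]));
        std::printf("\n");
      }
      return 0;
    }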
void *ShuffleArray[ShuffleArraySize]; u32 Count = 0; const uptr AllocatedUser = Size * NumberOfBlocks; - for (uptr I = Region; I < Region + AllocatedUser; I += Size) { + for (uptr I = Region + Offset; I < Region + Offset + AllocatedUser; + I += Size) { ShuffleArray[Count++] = reinterpret_cast<void *>(I); if (Count == ShuffleArraySize) { if (UNLIKELY(!populateBatches(C, Sci, ClassId, &B, MaxCount, @@ -340,9 +395,18 @@ private: DCHECK_GT(B->getCount(), 0); C->getStats().add(StatFree, AllocatedUser); + DCHECK_LE(Sci->CurrentRegionAllocated + AllocatedUser, RegionSize); + // If there is not enough room in the region currently associated to fit + // more blocks, we deassociate the region by resetting CurrentRegion and + // CurrentRegionAllocated. Otherwise, update the allocated amount. + if (RegionSize - (Sci->CurrentRegionAllocated + AllocatedUser) < Size) { + Sci->CurrentRegion = 0; + Sci->CurrentRegionAllocated = 0; + } else { + Sci->CurrentRegionAllocated += AllocatedUser; + } Sci->AllocatedUser += AllocatedUser; - if (Sci->CanRelease) - Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime(); + return B; } @@ -353,10 +417,14 @@ private: const uptr InUse = Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks; const uptr AvailableChunks = Sci->AllocatedUser / getSizeByClassId(ClassId); Str->append(" %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu " - "inuse: %6zu avail: %6zu rss: %6zuK\n", + "inuse: %6zu avail: %6zu rss: %6zuK releases: %6zu\n", ClassId, getSizeByClassId(ClassId), Sci->AllocatedUser >> 10, Sci->Stats.PoppedBlocks, Sci->Stats.PushedBlocks, InUse, - AvailableChunks, Rss >> 10); + AvailableChunks, Rss >> 10, Sci->ReleaseInfo.RangesReleased); + } + + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); } NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, @@ -370,18 +438,18 @@ private: (Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks) * BlockSize; if (BytesInFreeList < PageSize) return 0; // No chance to release anything. - if ((Sci->Stats.PushedBlocks - Sci->ReleaseInfo.PushedBlocksAtLastRelease) * - BlockSize < - PageSize) { + const uptr BytesPushed = + (Sci->Stats.PushedBlocks - Sci->ReleaseInfo.PushedBlocksAtLastRelease) * + BlockSize; + if (BytesPushed < PageSize) return 0; // Nothing new to release. - } if (!Force) { - const s32 IntervalMs = ReleaseToOsIntervalMs; + const s32 IntervalMs = getReleaseToOsIntervalMs(); if (IntervalMs < 0) return 0; if (Sci->ReleaseInfo.LastReleaseAtNs + - static_cast<uptr>(IntervalMs) * 1000000ULL > + static_cast<u64>(IntervalMs) * 1000000 > getMonotonicTime()) { return 0; // Memory was returned recently. } @@ -391,11 +459,18 @@ private: // iterate multiple times over the same freelist if a ClassId spans multiple // regions. But it will have to do for now. uptr TotalReleasedBytes = 0; + const uptr MaxSize = (RegionSize / BlockSize) * BlockSize; for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) { - if (PossibleRegions[I] == ClassId) { - ReleaseRecorder Recorder(I * RegionSize); - releaseFreeMemoryToOS(Sci->FreeList, I * RegionSize, - RegionSize / PageSize, BlockSize, &Recorder); + if (PossibleRegions[I] - 1U == ClassId) { + const uptr Region = I * RegionSize; + // If the region is the one currently associated to the size-class, we + // only need to release up to CurrentRegionAllocated, MaxSize otherwise. + const uptr Size = (Region == Sci->CurrentRegion) + ? 
Sci->CurrentRegionAllocated + : MaxSize; + ReleaseRecorder Recorder(Region); + releaseFreeMemoryToOS(Sci->FreeList, Region, Size, BlockSize, + &Recorder); if (Recorder.getReleasedRangesCount() > 0) { Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); @@ -410,12 +485,13 @@ private: SizeClassInfo SizeClassInfoArray[NumClasses]; + // Track the regions in use, 0 is unused, otherwise store ClassId + 1. ByteMap PossibleRegions; // Keep track of the lowest & highest regions allocated to avoid looping // through the whole NumRegions. uptr MinRegionIndex; uptr MaxRegionIndex; - s32 ReleaseToOsIntervalMs; + atomic_s32 ReleaseToOsIntervalMs; // Unless several threads request regions simultaneously from different size // classes, the stash rarely contains more than 1 entry. static constexpr uptr MaxStashedRegions = 4; diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index ef02f0b772d6..d4767882ba2c 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -13,6 +13,7 @@ #include "common.h" #include "list.h" #include "local_cache.h" +#include "memtag.h" #include "release.h" #include "stats.h" #include "string_utils.h" @@ -38,12 +39,21 @@ namespace scudo { // The memory used by this allocator is never unmapped, but can be partially // released if the platform allows for it. -template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator64 { +template <class SizeClassMapT, uptr RegionSizeLog, + s32 MinReleaseToOsIntervalMs = INT32_MIN, + s32 MaxReleaseToOsIntervalMs = INT32_MAX, + bool MaySupportMemoryTagging = false> +class SizeClassAllocator64 { public: typedef SizeClassMapT SizeClassMap; - typedef SizeClassAllocator64<SizeClassMap, RegionSizeLog> ThisT; + typedef SizeClassAllocator64< + SizeClassMap, RegionSizeLog, MinReleaseToOsIntervalMs, + MaxReleaseToOsIntervalMs, MaySupportMemoryTagging> + ThisT; typedef SizeClassAllocatorLocalCache<ThisT> CacheT; typedef typename CacheT::TransferBatch TransferBatch; + static const bool SupportsMemoryTagging = + MaySupportMemoryTagging && archSupportsMemoryTagging(); static uptr getSizeByClassId(uptr ClassId) { return (ClassId == SizeClassMap::BatchClassId) @@ -58,20 +68,17 @@ public: PrimaryBase = reinterpret_cast<uptr>( map(nullptr, PrimarySize, "scudo:primary", MAP_NOACCESS, &Data)); - RegionInfoArray = reinterpret_cast<RegionInfo *>( - map(nullptr, sizeof(RegionInfo) * NumClasses, "scudo:regioninfo")); - DCHECK_EQ(reinterpret_cast<uptr>(RegionInfoArray) % SCUDO_CACHE_LINE_SIZE, - 0); - u32 Seed; + const u64 Time = getMonotonicTime(); if (UNLIKELY(!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))) - Seed = static_cast<u32>(getMonotonicTime() ^ (PrimaryBase >> 12)); + Seed = static_cast<u32>(Time ^ (PrimaryBase >> 12)); const uptr PageSize = getPageSizeCached(); for (uptr I = 0; I < NumClasses; I++) { RegionInfo *Region = getRegionInfo(I); // The actual start of a region is offseted by a random number of pages. Region->RegionBeg = getRegionBaseByClassId(I) + (getRandomModN(&Seed, 16) + 1) * PageSize; + Region->RandState = getRandomU32(&Seed); // Releasing smaller size classes doesn't necessarily yield to a // meaningful RSS impact: there are more blocks per page, they are // randomized around, and thus pages are less likely to be entirely empty. @@ -79,12 +86,15 @@ public: // memory accesses which ends up being fairly costly. 
The current lower // limit is mostly arbitrary and based on empirical observations. // TODO(kostyak): make the lower limit a runtime option - Region->CanRelease = (ReleaseToOsInterval >= 0) && - (I != SizeClassMap::BatchClassId) && + Region->CanRelease = (I != SizeClassMap::BatchClassId) && (getSizeByClassId(I) >= (PageSize / 32)); - Region->RandState = getRandomU32(&Seed); + if (Region->CanRelease) + Region->ReleaseInfo.LastReleaseAtNs = Time; } - ReleaseToOsIntervalMs = ReleaseToOsInterval; + setReleaseToOsIntervalMs(ReleaseToOsInterval); + + if (SupportsMemoryTagging) + UseMemoryTagging = systemSupportsMemoryTagging(); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -93,8 +103,6 @@ public: void unmapTestOnly() { unmap(reinterpret_cast<void *>(PrimaryBase), PrimarySize, UNMAP_ALL, &Data); - unmap(reinterpret_cast<void *>(RegionInfoArray), - sizeof(RegionInfo) * NumClasses); } TransferBatch *popBatch(CacheT *C, uptr ClassId) { @@ -143,7 +151,7 @@ public: } } - template <typename F> void iterateOverBlocks(F Callback) const { + template <typename F> void iterateOverBlocks(F Callback) { for (uptr I = 0; I < NumClasses; I++) { if (I == SizeClassMap::BatchClassId) continue; @@ -156,7 +164,7 @@ public: } } - void getStats(ScopedString *Str) const { + void getStats(ScopedString *Str) { // TODO(kostyak): get the RSS per region. uptr TotalMapped = 0; uptr PoppedBlocks = 0; @@ -177,11 +185,18 @@ public: getStats(Str, I, 0); } + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + uptr releaseToOS() { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { - if (I == SizeClassMap::BatchClassId) - continue; RegionInfo *Region = getRegionInfo(I); ScopedLock L(Region->Mutex); TotalReleasedBytes += releaseToOSMaybe(Region, I, /*Force=*/true); @@ -189,15 +204,72 @@ public: return TotalReleasedBytes; } + bool useMemoryTagging() const { + return SupportsMemoryTagging && UseMemoryTagging; + } + void disableMemoryTagging() { UseMemoryTagging = false; } + + const char *getRegionInfoArrayAddress() const { + return reinterpret_cast<const char *>(RegionInfoArray); + } + + static uptr getRegionInfoArraySize() { + return sizeof(RegionInfoArray); + } + + static BlockInfo findNearestBlock(const char *RegionInfoData, uptr Ptr) { + const RegionInfo *RegionInfoArray = + reinterpret_cast<const RegionInfo *>(RegionInfoData); + uptr ClassId; + uptr MinDistance = -1UL; + for (uptr I = 0; I != NumClasses; ++I) { + if (I == SizeClassMap::BatchClassId) + continue; + uptr Begin = RegionInfoArray[I].RegionBeg; + uptr End = Begin + RegionInfoArray[I].AllocatedUser; + if (Begin > End || End - Begin < SizeClassMap::getSizeByClassId(I)) + continue; + uptr RegionDistance; + if (Begin <= Ptr) { + if (Ptr < End) + RegionDistance = 0; + else + RegionDistance = Ptr - End; + } else { + RegionDistance = Begin - Ptr; + } + + if (RegionDistance < MinDistance) { + MinDistance = RegionDistance; + ClassId = I; + } + } + + BlockInfo B = {}; + if (MinDistance <= 8192) { + B.RegionBegin = RegionInfoArray[ClassId].RegionBeg; + B.RegionEnd = B.RegionBegin + RegionInfoArray[ClassId].AllocatedUser; + B.BlockSize = SizeClassMap::getSizeByClassId(ClassId); + B.BlockBegin = + B.RegionBegin + uptr(sptr(Ptr - B.RegionBegin) / sptr(B.BlockSize) * + sptr(B.BlockSize)); + 
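    // Worked example of the rounding just above, with made-up numbers:
    //   RegionBegin = 0x1000, BlockSize = 48, Ptr = 0x10a0
    //   (Ptr - RegionBegin) / BlockSize = 0xa0 / 48 = 3  (signed, truncates
    //   toward zero)
    //   BlockBegin = 0x1000 + 3 * 48 = 0x1090
    // The two loops that follow only adjust BlockBegin when Ptr fell slightly
    // before the region, or landed in the truncated block at the region's end.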
while (B.BlockBegin < B.RegionBegin) + B.BlockBegin += B.BlockSize; + while (B.RegionEnd < B.BlockBegin + B.BlockSize) + B.BlockBegin -= B.BlockSize; + } + return B; + } + private: static const uptr RegionSize = 1UL << RegionSizeLog; static const uptr NumClasses = SizeClassMap::NumClasses; static const uptr PrimarySize = RegionSize * NumClasses; // Call map for user memory with at least this size. - static const uptr MapSizeIncrement = 1UL << 17; + static const uptr MapSizeIncrement = 1UL << 18; // Fill at most this number of batches from the newly map'd memory. - static const u32 MaxNumBatches = 8U; + static const u32 MaxNumBatches = SCUDO_ANDROID ? 4U : 8U; struct RegionStats { uptr PoppedBlocks; @@ -211,7 +283,7 @@ private: u64 LastReleaseAtNs; }; - struct ALIGNED(SCUDO_CACHE_LINE_SIZE) RegionInfo { + struct UnpaddedRegionInfo { HybridMutex Mutex; SinglyLinkedList<TransferBatch> FreeList; RegionStats Stats; @@ -224,14 +296,19 @@ private: MapPlatformData Data; ReleaseToOsInfo ReleaseInfo; }; + struct RegionInfo : UnpaddedRegionInfo { + char Padding[SCUDO_CACHE_LINE_SIZE - + (sizeof(UnpaddedRegionInfo) % SCUDO_CACHE_LINE_SIZE)]; + }; static_assert(sizeof(RegionInfo) % SCUDO_CACHE_LINE_SIZE == 0, ""); uptr PrimaryBase; - RegionInfo *RegionInfoArray; MapPlatformData Data; - s32 ReleaseToOsIntervalMs; + atomic_s32 ReleaseToOsIntervalMs; + bool UseMemoryTagging; + alignas(SCUDO_CACHE_LINE_SIZE) RegionInfo RegionInfoArray[NumClasses]; - RegionInfo *getRegionInfo(uptr ClassId) const { + RegionInfo *getRegionInfo(uptr ClassId) { DCHECK_LT(ClassId, NumClasses); return &RegionInfoArray[ClassId]; } @@ -294,7 +371,9 @@ private: Region->Data = Data; if (UNLIKELY(!map(reinterpret_cast<void *>(RegionBeg + MappedUser), UserMapSize, "scudo:primary", - MAP_ALLOWNOMEM | MAP_RESIZABLE, &Region->Data))) + MAP_ALLOWNOMEM | MAP_RESIZABLE | + (useMemoryTagging() ? MAP_MEMTAG : 0), + &Region->Data))) return nullptr; Region->MappedUser += UserMapSize; C->getStats().add(StatMapped, UserMapSize); @@ -337,13 +416,11 @@ private: C->getStats().add(StatFree, AllocatedUser); Region->AllocatedUser += AllocatedUser; Region->Exhausted = false; - if (Region->CanRelease) - Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTime(); return B; } - void getStats(ScopedString *Str, uptr ClassId, uptr Rss) const { + void getStats(ScopedString *Str, uptr ClassId, uptr Rss) { RegionInfo *Region = getRegionInfo(ClassId); if (Region->MappedUser == 0) return; @@ -360,6 +437,10 @@ private: getRegionBaseByClassId(ClassId)); } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + } + NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -371,19 +452,18 @@ private: (Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks) * BlockSize; if (BytesInFreeList < PageSize) return 0; // No chance to release anything. - if ((Region->Stats.PushedBlocks - - Region->ReleaseInfo.PushedBlocksAtLastRelease) * - BlockSize < - PageSize) { + const uptr BytesPushed = (Region->Stats.PushedBlocks - + Region->ReleaseInfo.PushedBlocksAtLastRelease) * + BlockSize; + if (BytesPushed < PageSize) return 0; // Nothing new to release. 
- } if (!Force) { - const s32 IntervalMs = ReleaseToOsIntervalMs; + const s32 IntervalMs = getReleaseToOsIntervalMs(); if (IntervalMs < 0) return 0; if (Region->ReleaseInfo.LastReleaseAtNs + - static_cast<uptr>(IntervalMs) * 1000000ULL > + static_cast<u64>(IntervalMs) * 1000000 > getMonotonicTime()) { return 0; // Memory was returned recently. } @@ -391,8 +471,7 @@ private: ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data); releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg, - roundUpTo(Region->AllocatedUser, PageSize) / PageSize, - BlockSize, &Recorder); + Region->AllocatedUser, BlockSize, &Recorder); if (Recorder.getReleasedRangesCount() > 0) { Region->ReleaseInfo.PushedBlocksAtLastRelease = diff --git a/compiler-rt/lib/scudo/standalone/quarantine.h b/compiler-rt/lib/scudo/standalone/quarantine.h index 406a0e23804d..27aa4bfec91a 100644 --- a/compiler-rt/lib/scudo/standalone/quarantine.h +++ b/compiler-rt/lib/scudo/standalone/quarantine.h @@ -187,7 +187,12 @@ public: Cache.initLinkerInitialized(); } void init(uptr Size, uptr CacheSize) { - memset(this, 0, sizeof(*this)); + CacheMutex.init(); + Cache.init(); + RecycleMutex.init(); + MinSize = {}; + MaxSize = {}; + MaxCacheSize = {}; initLinkerInitialized(Size, CacheSize); } diff --git a/compiler-rt/lib/scudo/standalone/release.cpp b/compiler-rt/lib/scudo/standalone/release.cpp new file mode 100644 index 000000000000..e144b354b258 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/release.cpp @@ -0,0 +1,16 @@ +//===-- release.cpp ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "release.h" + +namespace scudo { + +HybridMutex PackedCounterArray::Mutex = {}; +uptr PackedCounterArray::StaticBuffer[1024]; + +} // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h index 4b5c56ce7c19..323bf9db6dca 100644 --- a/compiler-rt/lib/scudo/standalone/release.h +++ b/compiler-rt/lib/scudo/standalone/release.h @@ -11,6 +11,7 @@ #include "common.h" #include "list.h" +#include "mutex.h" namespace scudo { @@ -39,11 +40,13 @@ private: }; // A packed array of Counters. Each counter occupies 2^N bits, enough to store -// counter's MaxValue. Ctor will try to allocate the required Buffer via map() -// and the caller is expected to check whether the initialization was successful -// by checking isAllocated() result. For the performance sake, none of the -// accessors check the validity of the arguments, It is assumed that Index is -// always in [0, N) range and the value is not incremented past MaxValue. +// counter's MaxValue. Ctor will try to use a static buffer first, and if that +// fails (the buffer is too small or already locked), will allocate the +// required Buffer via map(). The caller is expected to check whether the +// initialization was successful by checking isAllocated() result. For +// performance sake, none of the accessors check the validity of the arguments, +// It is assumed that Index is always in [0, N) range and the value is not +// incremented past MaxValue. 
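The packing described in the comment above can be illustrated with a small self-contained class. This is only a sketch of the arithmetic (power-of-two counter width, several counters packed per 64-bit word), not scudo's actual implementation:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Each counter gets the smallest power-of-two number of bits able to hold
    // MaxValue, so counters never straddle a 64-bit word boundary.
    class PackedCounters {
    public:
      PackedCounters(size_t NumCounters, uint64_t MaxValue) {
        size_t BitsNeeded = 1;
        while (BitsNeeded < 64 && (MaxValue >> BitsNeeded) != 0)
          BitsNeeded++;
        CounterBits = 1;
        while (CounterBits < BitsNeeded)
          CounterBits <<= 1;
        CountersPerWord = 64 / CounterBits;
        Mask = (CounterBits == 64) ? ~0ULL : ((1ULL << CounterBits) - 1);
        Words.resize((NumCounters + CountersPerWord - 1) / CountersPerWord, 0);
      }
      uint64_t get(size_t I) const {
        return (Words[I / CountersPerWord] >> shift(I)) & Mask;
      }
      // The caller guarantees a counter never exceeds MaxValue, so a plain add
      // cannot carry into the neighbouring counter.
      void inc(size_t I) { Words[I / CountersPerWord] += 1ULL << shift(I); }

    private:
      size_t shift(size_t I) const { return (I % CountersPerWord) * CounterBits; }
      size_t CounterBits, CountersPerWord;
      uint64_t Mask;
      std::vector<uint64_t> Words;
    };

    int main() {
      PackedCounters C(/*NumCounters=*/100, /*MaxValue=*/5); // 4 bits each
      C.inc(7);
      C.inc(7);
      C.inc(42);
      assert(C.get(7) == 2 && C.get(42) == 1 && C.get(0) == 0);
    }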
class PackedCounterArray { public: PackedCounterArray(uptr NumCounters, uptr MaxValue) : N(NumCounters) { @@ -66,11 +69,20 @@ public: BufferSize = (roundUpTo(N, static_cast<uptr>(1U) << PackingRatioLog) >> PackingRatioLog) * sizeof(*Buffer); - Buffer = reinterpret_cast<uptr *>( - map(nullptr, BufferSize, "scudo:counters", MAP_ALLOWNOMEM)); + if (BufferSize <= StaticBufferSize && Mutex.tryLock()) { + Buffer = &StaticBuffer[0]; + memset(Buffer, 0, BufferSize); + } else { + Buffer = reinterpret_cast<uptr *>( + map(nullptr, BufferSize, "scudo:counters", MAP_ALLOWNOMEM)); + } } ~PackedCounterArray() { - if (isAllocated()) + if (!isAllocated()) + return; + if (Buffer == &StaticBuffer[0]) + Mutex.unlock(); + else unmap(reinterpret_cast<void *>(Buffer), BufferSize); } @@ -95,7 +107,8 @@ public: void incRange(uptr From, uptr To) const { DCHECK_LE(From, To); - for (uptr I = From; I <= To; I++) + const uptr Top = Min(To + 1, N); + for (uptr I = From; I < Top; I++) inc(I); } @@ -110,6 +123,10 @@ private: uptr BufferSize; uptr *Buffer; + + static HybridMutex Mutex; + static const uptr StaticBufferSize = 1024U; + static uptr StaticBuffer[StaticBufferSize]; }; template <class ReleaseRecorderT> class FreePagesRangeTracker { @@ -150,8 +167,7 @@ private: template <class TransferBatchT, class ReleaseRecorderT> NOINLINE void releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base, - uptr AllocatedPagesCount, uptr BlockSize, - ReleaseRecorderT *Recorder) { + uptr Size, uptr BlockSize, ReleaseRecorderT *Recorder) { const uptr PageSize = getPageSizeCached(); // Figure out the number of chunks per page and whether we can take a fast @@ -188,34 +204,51 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base, } } - PackedCounterArray Counters(AllocatedPagesCount, FullPagesBlockCountMax); + const uptr PagesCount = roundUpTo(Size, PageSize) / PageSize; + PackedCounterArray Counters(PagesCount, FullPagesBlockCountMax); if (!Counters.isAllocated()) return; const uptr PageSizeLog = getLog2(PageSize); - const uptr End = Base + AllocatedPagesCount * PageSize; + const uptr RoundedSize = PagesCount << PageSizeLog; // Iterate over free chunks and count how many free chunks affect each // allocated page. if (BlockSize <= PageSize && PageSize % BlockSize == 0) { // Each chunk affects one page only. for (const auto &It : FreeList) { - for (u32 I = 0; I < It.getCount(); I++) { - const uptr P = reinterpret_cast<uptr>(It.get(I)); - if (P >= Base && P < End) - Counters.inc((P - Base) >> PageSizeLog); + // If dealing with a TransferBatch, the first pointer of the batch will + // point to the batch itself, we do not want to mark this for release as + // the batch is in use, so skip the first entry. + const bool IsTransferBatch = + (It.getCount() != 0) && + (reinterpret_cast<uptr>(It.get(0)) == reinterpret_cast<uptr>(&It)); + for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) { + const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base; + // This takes care of P < Base and P >= Base + RoundedSize. + if (P < RoundedSize) + Counters.inc(P >> PageSizeLog); } } + for (uptr P = Size; P < RoundedSize; P += BlockSize) + Counters.inc(P >> PageSizeLog); } else { // In all other cases chunks might affect more than one page. 
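    // Worked example of the incRange() arithmetic below, with made-up numbers:
    //   PageSize = 4096, BlockSize = 6000, so a block can straddle pages.
    //   For a free block at offset P = 20000 from Base:
    //     first page = 20000 >> 12                = 4
    //     last page  = (20000 + 6000 - 1) >> 12   = 25999 >> 12 = 6
    //   so the counters for pages 4, 5 and 6 are each incremented, and a page
    //   can only be considered for release once every block overlapping it has
    //   been counted as free.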
for (const auto &It : FreeList) { - for (u32 I = 0; I < It.getCount(); I++) { - const uptr P = reinterpret_cast<uptr>(It.get(I)); - if (P >= Base && P < End) - Counters.incRange((P - Base) >> PageSizeLog, - (P - Base + BlockSize - 1) >> PageSizeLog); + // See TransferBatch comment above. + const bool IsTransferBatch = + (It.getCount() != 0) && + (reinterpret_cast<uptr>(It.get(0)) == reinterpret_cast<uptr>(&It)); + for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) { + const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base; + // This takes care of P < Base and P >= Base + RoundedSize. + if (P < RoundedSize) + Counters.incRange(P >> PageSizeLog, + (P + BlockSize - 1) >> PageSizeLog); } } + for (uptr P = Size; P < RoundedSize; P += BlockSize) + Counters.incRange(P >> PageSizeLog, (P + BlockSize - 1) >> PageSizeLog); } // Iterate over pages detecting ranges of pages with chunk Counters equal diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index ab68e5a1d38d..84eaa5091b43 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -48,24 +48,195 @@ static Header *getHeader(const void *Ptr) { } // namespace LargeBlock -template <uptr MaxFreeListSize = 32U> class MapAllocator { +class MapAllocatorNoCache { public: - // Ensure the freelist is disabled on Fuchsia, since it doesn't support - // releasing Secondary blocks yet. - static_assert(!SCUDO_FUCHSIA || MaxFreeListSize == 0U, ""); + void initLinkerInitialized(UNUSED s32 ReleaseToOsInterval) {} + void init(UNUSED s32 ReleaseToOsInterval) {} + bool retrieve(UNUSED uptr Size, UNUSED LargeBlock::Header **H) { + return false; + } + bool store(UNUSED LargeBlock::Header *H) { return false; } + static bool canCache(UNUSED uptr Size) { return false; } + void disable() {} + void enable() {} + void releaseToOS() {} + void setReleaseToOsIntervalMs(UNUSED s32 Interval) {} +}; + +template <uptr MaxEntriesCount = 32U, uptr MaxEntrySize = 1UL << 19, + s32 MinReleaseToOsIntervalMs = INT32_MIN, + s32 MaxReleaseToOsIntervalMs = INT32_MAX> +class MapAllocatorCache { +public: + // Fuchsia doesn't allow releasing Secondary blocks yet. Note that 0 length + // arrays are an extension for some compilers. + // FIXME(kostyak): support (partially) the cache on Fuchsia. 
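The retrieve() method further down only hands back a cached mapping when the request both fits in it and wastes at most four pages of it. A minimal sketch of that fit check, mirroring the two "continue" conditions (the sizes used in main are arbitrary):

    #include <cassert>
    #include <cstdint>

    using uptr = uintptr_t;

    // A cached block is reused only if the request fits in it and the block is
    // not more than four pages larger than the request.
    bool fitsCachedBlock(uptr RequestedSize, uptr CachedBlockSize, uptr PageSize) {
      if (RequestedSize > CachedBlockSize)
        return false; // too small to satisfy the request
      if (RequestedSize < CachedBlockSize - PageSize * 4U)
        return false; // reusing it would waste more than four pages
      return true;
    }

    int main() {
      const uptr PageSize = 4096;
      assert(fitsCachedBlock(64 * 1024, 64 * 1024, PageSize));            // exact fit
      assert(fitsCachedBlock(64 * 1024, 64 * 1024 + 3 * 4096, PageSize)); // 3 pages slack
      assert(!fitsCachedBlock(64 * 1024, 128 * 1024, PageSize));          // too wasteful
      assert(!fitsCachedBlock(64 * 1024, 60 * 1024, PageSize));           // too small
    }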
+ static_assert(!SCUDO_FUCHSIA || MaxEntriesCount == 0U, ""); + + void initLinkerInitialized(s32 ReleaseToOsInterval) { + setReleaseToOsIntervalMs(ReleaseToOsInterval); + } + void init(s32 ReleaseToOsInterval) { + memset(this, 0, sizeof(*this)); + initLinkerInitialized(ReleaseToOsInterval); + } + + bool store(LargeBlock::Header *H) { + bool EntryCached = false; + bool EmptyCache = false; + const u64 Time = getMonotonicTime(); + { + ScopedLock L(Mutex); + if (EntriesCount == MaxEntriesCount) { + if (IsFullEvents++ == 4U) + EmptyCache = true; + } else { + for (uptr I = 0; I < MaxEntriesCount; I++) { + if (Entries[I].Block) + continue; + if (I != 0) + Entries[I] = Entries[0]; + Entries[0].Block = reinterpret_cast<uptr>(H); + Entries[0].BlockEnd = H->BlockEnd; + Entries[0].MapBase = H->MapBase; + Entries[0].MapSize = H->MapSize; + Entries[0].Data = H->Data; + Entries[0].Time = Time; + EntriesCount++; + EntryCached = true; + break; + } + } + } + s32 Interval; + if (EmptyCache) + empty(); + else if ((Interval = getReleaseToOsIntervalMs()) >= 0) + releaseOlderThan(Time - static_cast<u64>(Interval) * 1000000); + return EntryCached; + } + + bool retrieve(uptr Size, LargeBlock::Header **H) { + const uptr PageSize = getPageSizeCached(); + ScopedLock L(Mutex); + if (EntriesCount == 0) + return false; + for (uptr I = 0; I < MaxEntriesCount; I++) { + if (!Entries[I].Block) + continue; + const uptr BlockSize = Entries[I].BlockEnd - Entries[I].Block; + if (Size > BlockSize) + continue; + if (Size < BlockSize - PageSize * 4U) + continue; + *H = reinterpret_cast<LargeBlock::Header *>(Entries[I].Block); + Entries[I].Block = 0; + (*H)->BlockEnd = Entries[I].BlockEnd; + (*H)->MapBase = Entries[I].MapBase; + (*H)->MapSize = Entries[I].MapSize; + (*H)->Data = Entries[I].Data; + EntriesCount--; + return true; + } + return false; + } + + static bool canCache(uptr Size) { + return MaxEntriesCount != 0U && Size <= MaxEntrySize; + } + + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + + void releaseToOS() { releaseOlderThan(UINT64_MAX); } + + void disable() { Mutex.lock(); } + + void enable() { Mutex.unlock(); } + +private: + void empty() { + struct { + void *MapBase; + uptr MapSize; + MapPlatformData Data; + } MapInfo[MaxEntriesCount]; + uptr N = 0; + { + ScopedLock L(Mutex); + for (uptr I = 0; I < MaxEntriesCount; I++) { + if (!Entries[I].Block) + continue; + MapInfo[N].MapBase = reinterpret_cast<void *>(Entries[I].MapBase); + MapInfo[N].MapSize = Entries[I].MapSize; + MapInfo[N].Data = Entries[I].Data; + Entries[I].Block = 0; + N++; + } + EntriesCount = 0; + IsFullEvents = 0; + } + for (uptr I = 0; I < N; I++) + unmap(MapInfo[I].MapBase, MapInfo[I].MapSize, UNMAP_ALL, + &MapInfo[I].Data); + } + + void releaseOlderThan(u64 Time) { + ScopedLock L(Mutex); + if (!EntriesCount) + return; + for (uptr I = 0; I < MaxEntriesCount; I++) { + if (!Entries[I].Block || !Entries[I].Time || Entries[I].Time > Time) + continue; + releasePagesToOS(Entries[I].Block, 0, + Entries[I].BlockEnd - Entries[I].Block, + &Entries[I].Data); + Entries[I].Time = 0; + } + } + + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + } - void initLinkerInitialized(GlobalStats *S) { + struct CachedBlock { + uptr Block; + uptr BlockEnd; + uptr MapBase; 
+ uptr MapSize; + MapPlatformData Data; + u64 Time; + }; + + HybridMutex Mutex; + CachedBlock Entries[MaxEntriesCount]; + u32 EntriesCount; + uptr LargestSize; + u32 IsFullEvents; + atomic_s32 ReleaseToOsIntervalMs; +}; + +template <class CacheT> class MapAllocator { +public: + void initLinkerInitialized(GlobalStats *S, s32 ReleaseToOsInterval = -1) { + Cache.initLinkerInitialized(ReleaseToOsInterval); Stats.initLinkerInitialized(); if (LIKELY(S)) S->link(&Stats); } - void init(GlobalStats *S) { + void init(GlobalStats *S, s32 ReleaseToOsInterval = -1) { memset(this, 0, sizeof(*this)); - initLinkerInitialized(S); + initLinkerInitialized(S, ReleaseToOsInterval); } void *allocate(uptr Size, uptr AlignmentHint = 0, uptr *BlockEnd = nullptr, - bool ZeroContents = false); + FillContentsMode FillContents = NoFill); void deallocate(void *Ptr); @@ -79,22 +250,34 @@ public: void getStats(ScopedString *Str) const; - void disable() { Mutex.lock(); } + void disable() { + Mutex.lock(); + Cache.disable(); + } - void enable() { Mutex.unlock(); } + void enable() { + Cache.enable(); + Mutex.unlock(); + } template <typename F> void iterateOverBlocks(F Callback) const { for (const auto &H : InUseBlocks) Callback(reinterpret_cast<uptr>(&H) + LargeBlock::getHeaderSize()); } - static uptr getMaxFreeListSize(void) { return MaxFreeListSize; } + static uptr canCache(uptr Size) { return CacheT::canCache(Size); } + + void setReleaseToOsIntervalMs(s32 Interval) { + Cache.setReleaseToOsIntervalMs(Interval); + } + + void releaseToOS() { Cache.releaseToOS(); } private: + CacheT Cache; + HybridMutex Mutex; DoublyLinkedList<LargeBlock::Header> InUseBlocks; - // The free list is sorted based on the committed size of blocks. - DoublyLinkedList<LargeBlock::Header> FreeBlocks; uptr AllocatedBytes; uptr FreedBytes; uptr LargestSize; @@ -114,35 +297,34 @@ private: // For allocations requested with an alignment greater than or equal to a page, // the committed memory will amount to something close to Size - AlignmentHint // (pending rounding and headers). -template <uptr MaxFreeListSize> -void *MapAllocator<MaxFreeListSize>::allocate(uptr Size, uptr AlignmentHint, - uptr *BlockEnd, - bool ZeroContents) { +template <class CacheT> +void *MapAllocator<CacheT>::allocate(uptr Size, uptr AlignmentHint, + uptr *BlockEnd, + FillContentsMode FillContents) { DCHECK_GE(Size, AlignmentHint); const uptr PageSize = getPageSizeCached(); const uptr RoundedSize = roundUpTo(Size + LargeBlock::getHeaderSize(), PageSize); - if (MaxFreeListSize && AlignmentHint < PageSize) { - ScopedLock L(Mutex); - for (auto &H : FreeBlocks) { - const uptr FreeBlockSize = H.BlockEnd - reinterpret_cast<uptr>(&H); - if (FreeBlockSize < RoundedSize) - continue; - // Candidate free block should only be at most 4 pages larger. - if (FreeBlockSize > RoundedSize + 4 * PageSize) - break; - FreeBlocks.remove(&H); - InUseBlocks.push_back(&H); - AllocatedBytes += FreeBlockSize; - NumberOfAllocs++; - Stats.add(StatAllocated, FreeBlockSize); + if (AlignmentHint < PageSize && CacheT::canCache(RoundedSize)) { + LargeBlock::Header *H; + if (Cache.retrieve(RoundedSize, &H)) { if (BlockEnd) - *BlockEnd = H.BlockEnd; - void *Ptr = reinterpret_cast<void *>(reinterpret_cast<uptr>(&H) + + *BlockEnd = H->BlockEnd; + void *Ptr = reinterpret_cast<void *>(reinterpret_cast<uptr>(H) + LargeBlock::getHeaderSize()); - if (ZeroContents) - memset(Ptr, 0, H.BlockEnd - reinterpret_cast<uptr>(Ptr)); + if (FillContents) + memset(Ptr, FillContents == ZeroFill ? 
0 : PatternFillByte, + H->BlockEnd - reinterpret_cast<uptr>(Ptr)); + const uptr BlockSize = H->BlockEnd - reinterpret_cast<uptr>(H); + { + ScopedLock L(Mutex); + InUseBlocks.push_back(H); + AllocatedBytes += BlockSize; + NumberOfAllocs++; + Stats.add(StatAllocated, BlockSize); + Stats.add(StatMapped, H->MapSize); + } return Ptr; } } @@ -191,6 +373,8 @@ void *MapAllocator<MaxFreeListSize>::allocate(uptr Size, uptr AlignmentHint, H->MapSize = MapEnd - MapBase; H->BlockEnd = CommitBase + CommitSize; H->Data = Data; + if (BlockEnd) + *BlockEnd = CommitBase + CommitSize; { ScopedLock L(Mutex); InUseBlocks.push_back(H); @@ -201,52 +385,31 @@ void *MapAllocator<MaxFreeListSize>::allocate(uptr Size, uptr AlignmentHint, Stats.add(StatAllocated, CommitSize); Stats.add(StatMapped, H->MapSize); } - if (BlockEnd) - *BlockEnd = CommitBase + CommitSize; return reinterpret_cast<void *>(Ptr + LargeBlock::getHeaderSize()); } -template <uptr MaxFreeListSize> -void MapAllocator<MaxFreeListSize>::deallocate(void *Ptr) { +template <class CacheT> void MapAllocator<CacheT>::deallocate(void *Ptr) { LargeBlock::Header *H = LargeBlock::getHeader(Ptr); const uptr Block = reinterpret_cast<uptr>(H); + const uptr CommitSize = H->BlockEnd - Block; { ScopedLock L(Mutex); InUseBlocks.remove(H); - const uptr CommitSize = H->BlockEnd - Block; FreedBytes += CommitSize; NumberOfFrees++; Stats.sub(StatAllocated, CommitSize); - if (MaxFreeListSize && FreeBlocks.size() < MaxFreeListSize) { - bool Inserted = false; - for (auto &F : FreeBlocks) { - const uptr FreeBlockSize = F.BlockEnd - reinterpret_cast<uptr>(&F); - if (FreeBlockSize >= CommitSize) { - FreeBlocks.insert(H, &F); - Inserted = true; - break; - } - } - if (!Inserted) - FreeBlocks.push_back(H); - const uptr RoundedAllocationStart = - roundUpTo(Block + LargeBlock::getHeaderSize(), getPageSizeCached()); - MapPlatformData Data = H->Data; - // TODO(kostyak): use release_to_os_interval_ms - releasePagesToOS(Block, RoundedAllocationStart - Block, - H->BlockEnd - RoundedAllocationStart, &Data); - return; - } Stats.sub(StatMapped, H->MapSize); } + if (CacheT::canCache(CommitSize) && Cache.store(H)) + return; void *Addr = reinterpret_cast<void *>(H->MapBase); const uptr Size = H->MapSize; MapPlatformData Data = H->Data; unmap(Addr, Size, UNMAP_ALL, &Data); } -template <uptr MaxFreeListSize> -void MapAllocator<MaxFreeListSize>::getStats(ScopedString *Str) const { +template <class CacheT> +void MapAllocator<CacheT>::getStats(ScopedString *Str) const { Str->append( "Stats: MapAllocator: allocated %zu times (%zuK), freed %zu times " "(%zuK), remains %zu (%zuK) max %zuM\n", diff --git a/compiler-rt/lib/scudo/standalone/size_class_map.h b/compiler-rt/lib/scudo/standalone/size_class_map.h index 947526e8aea1..5ed8e2845b38 100644 --- a/compiler-rt/lib/scudo/standalone/size_class_map.h +++ b/compiler-rt/lib/scudo/standalone/size_class_map.h @@ -9,11 +9,32 @@ #ifndef SCUDO_SIZE_CLASS_MAP_H_ #define SCUDO_SIZE_CLASS_MAP_H_ +#include "chunk.h" #include "common.h" #include "string_utils.h" namespace scudo { +inline uptr scaledLog2(uptr Size, uptr ZeroLog, uptr LogBits) { + const uptr L = getMostSignificantSetBitIndex(Size); + const uptr LBits = (Size >> (L - LogBits)) - (1 << LogBits); + const uptr HBits = (L - ZeroLog) << LogBits; + return LBits + HBits; +} + +template <typename Config> struct SizeClassMapBase { + static u32 getMaxCachedHint(uptr Size) { + DCHECK_NE(Size, 0); + u32 N; + // Force a 32-bit division if the template parameters allow for it. 
+ if (Config::MaxBytesCachedLog > 31 || Config::MaxSizeLog > 31) + N = static_cast<u32>((1UL << Config::MaxBytesCachedLog) / Size); + else + N = (1U << Config::MaxBytesCachedLog) / static_cast<u32>(Size); + return Max(1U, Min(Config::MaxNumCachedHint, N)); + } +}; + // SizeClassMap maps allocation sizes into size classes and back, in an // efficient table-free manner. // @@ -33,22 +54,24 @@ namespace scudo { // of chunks that can be cached per-thread: // - MaxNumCachedHint is a hint for the max number of chunks cached per class. // - 2^MaxBytesCachedLog is the max number of bytes cached per class. +template <typename Config> +class FixedSizeClassMap : public SizeClassMapBase<Config> { + typedef SizeClassMapBase<Config> Base; -template <u8 NumBits, u8 MinSizeLog, u8 MidSizeLog, u8 MaxSizeLog, - u32 MaxNumCachedHintT, u8 MaxBytesCachedLog> -class SizeClassMap { - static const uptr MinSize = 1UL << MinSizeLog; - static const uptr MidSize = 1UL << MidSizeLog; + static const uptr MinSize = 1UL << Config::MinSizeLog; + static const uptr MidSize = 1UL << Config::MidSizeLog; static const uptr MidClass = MidSize / MinSize; - static const u8 S = NumBits - 1; + static const u8 S = Config::NumBits - 1; static const uptr M = (1UL << S) - 1; + static const uptr SizeDelta = Chunk::getHeaderSize(); + public: - static const u32 MaxNumCachedHint = MaxNumCachedHintT; + static const u32 MaxNumCachedHint = Config::MaxNumCachedHint; - static const uptr MaxSize = 1UL << MaxSizeLog; + static const uptr MaxSize = (1UL << Config::MaxSizeLog) + SizeDelta; static const uptr NumClasses = - MidClass + ((MaxSizeLog - MidSizeLog) << S) + 1; + MidClass + ((Config::MaxSizeLog - Config::MidSizeLog) << S) + 1; static_assert(NumClasses <= 256, ""); static const uptr LargestClassId = NumClasses - 1; static const uptr BatchClassId = 0; @@ -56,97 +79,213 @@ public: static uptr getSizeByClassId(uptr ClassId) { DCHECK_NE(ClassId, BatchClassId); if (ClassId <= MidClass) - return ClassId << MinSizeLog; + return (ClassId << Config::MinSizeLog) + SizeDelta; ClassId -= MidClass; const uptr T = MidSize << (ClassId >> S); - return T + (T >> S) * (ClassId & M); + return T + (T >> S) * (ClassId & M) + SizeDelta; } static uptr getClassIdBySize(uptr Size) { + if (Size <= SizeDelta + (1 << Config::MinSizeLog)) + return 1; + Size -= SizeDelta; DCHECK_LE(Size, MaxSize); if (Size <= MidSize) - return (Size + MinSize - 1) >> MinSizeLog; - const uptr L = getMostSignificantSetBitIndex(Size); - const uptr HBits = (Size >> (L - S)) & M; - const uptr LBits = Size & ((1UL << (L - S)) - 1); - const uptr L1 = L - MidSizeLog; - return MidClass + (L1 << S) + HBits + (LBits > 0); + return (Size + MinSize - 1) >> Config::MinSizeLog; + return MidClass + 1 + scaledLog2(Size - 1, Config::MidSizeLog, S); } static u32 getMaxCachedHint(uptr Size) { DCHECK_LE(Size, MaxSize); - DCHECK_NE(Size, 0); - u32 N; - // Force a 32-bit division if the template parameters allow for it. 
- if (MaxBytesCachedLog > 31 || MaxSizeLog > 31) - N = static_cast<u32>((1UL << MaxBytesCachedLog) / Size); - else - N = (1U << MaxBytesCachedLog) / static_cast<u32>(Size); - return Max(1U, Min(MaxNumCachedHint, N)); + return Base::getMaxCachedHint(Size); } +}; + +template <typename Config> +class TableSizeClassMap : public SizeClassMapBase<Config> { + typedef SizeClassMapBase<Config> Base; + + static const u8 S = Config::NumBits - 1; + static const uptr M = (1UL << S) - 1; + static const uptr ClassesSize = + sizeof(Config::Classes) / sizeof(Config::Classes[0]); - static void print() { - ScopedString Buffer(1024); - uptr PrevS = 0; - uptr TotalCached = 0; - for (uptr I = 0; I < NumClasses; I++) { - if (I == BatchClassId) - continue; - const uptr S = getSizeByClassId(I); - if (S >= MidSize / 2 && (S & (S - 1)) == 0) - Buffer.append("\n"); - const uptr D = S - PrevS; - const uptr P = PrevS ? (D * 100 / PrevS) : 0; - const uptr L = S ? getMostSignificantSetBitIndex(S) : 0; - const uptr Cached = getMaxCachedHint(S) * S; - Buffer.append( - "C%02zu => S: %zu diff: +%zu %02zu%% L %zu Cached: %zu %zu; id %zu\n", - I, getSizeByClassId(I), D, P, L, getMaxCachedHint(S), Cached, - getClassIdBySize(S)); - TotalCached += Cached; - PrevS = S; + struct SizeTable { + constexpr SizeTable() { + uptr Pos = 1 << Config::MidSizeLog; + uptr Inc = 1 << (Config::MidSizeLog - S); + for (uptr i = 0; i != getTableSize(); ++i) { + Pos += Inc; + if ((Pos & (Pos - 1)) == 0) + Inc *= 2; + Tab[i] = computeClassId(Pos + Config::SizeDelta); + } } - Buffer.append("Total Cached: %zu\n", TotalCached); - Buffer.output(); - } - static void validate() { - for (uptr C = 0; C < NumClasses; C++) { - if (C == BatchClassId) - continue; - const uptr S = getSizeByClassId(C); - CHECK_NE(S, 0U); - CHECK_EQ(getClassIdBySize(S), C); - if (C < LargestClassId) - CHECK_EQ(getClassIdBySize(S + 1), C + 1); - CHECK_EQ(getClassIdBySize(S - 1), C); - if (C - 1 != BatchClassId) - CHECK_GT(getSizeByClassId(C), getSizeByClassId(C - 1)); + constexpr static u8 computeClassId(uptr Size) { + for (uptr i = 0; i != ClassesSize; ++i) { + if (Size <= Config::Classes[i]) + return static_cast<u8>(i + 1); + } + return static_cast<u8>(-1); } - // Do not perform the loop if the maximum size is too large. 
- if (MaxSizeLog > 19) - return; - for (uptr S = 1; S <= MaxSize; S++) { - const uptr C = getClassIdBySize(S); - CHECK_LT(C, NumClasses); - CHECK_GE(getSizeByClassId(C), S); - if (C - 1 != BatchClassId) - CHECK_LT(getSizeByClassId(C - 1), S); + + constexpr static uptr getTableSize() { + return (Config::MaxSizeLog - Config::MidSizeLog) << S; } + + u8 Tab[getTableSize()] = {}; + }; + + static constexpr SizeTable Table = {}; + +public: + static const u32 MaxNumCachedHint = Config::MaxNumCachedHint; + + static const uptr NumClasses = ClassesSize + 1; + static_assert(NumClasses < 256, ""); + static const uptr LargestClassId = NumClasses - 1; + static const uptr BatchClassId = 0; + static const uptr MaxSize = Config::Classes[LargestClassId - 1]; + + static uptr getSizeByClassId(uptr ClassId) { + return Config::Classes[ClassId - 1]; } + + static uptr getClassIdBySize(uptr Size) { + if (Size <= Config::Classes[0]) + return 1; + Size -= Config::SizeDelta; + DCHECK_LE(Size, MaxSize); + if (Size <= (1 << Config::MidSizeLog)) + return ((Size - 1) >> Config::MinSizeLog) + 1; + return Table.Tab[scaledLog2(Size - 1, Config::MidSizeLog, S)]; + } + + static u32 getMaxCachedHint(uptr Size) { + DCHECK_LE(Size, MaxSize); + return Base::getMaxCachedHint(Size); + } +}; + +struct AndroidSizeClassConfig { +#if SCUDO_WORDSIZE == 64U + static const uptr NumBits = 7; + static const uptr MinSizeLog = 4; + static const uptr MidSizeLog = 6; + static const uptr MaxSizeLog = 16; + static const u32 MaxNumCachedHint = 14; + static const uptr MaxBytesCachedLog = 13; + + static constexpr u32 Classes[] = { + 0x00020, 0x00030, 0x00040, 0x00050, 0x00060, 0x00070, 0x00090, 0x000b0, + 0x000c0, 0x000e0, 0x00120, 0x00160, 0x001c0, 0x00250, 0x00320, 0x00450, + 0x00670, 0x00830, 0x00a10, 0x00c30, 0x01010, 0x01210, 0x01bd0, 0x02210, + 0x02d90, 0x03790, 0x04010, 0x04810, 0x05a10, 0x07310, 0x08210, 0x10010, + }; + static const uptr SizeDelta = 16; +#else + static const uptr NumBits = 8; + static const uptr MinSizeLog = 4; + static const uptr MidSizeLog = 7; + static const uptr MaxSizeLog = 16; + static const u32 MaxNumCachedHint = 14; + static const uptr MaxBytesCachedLog = 13; + + static constexpr u32 Classes[] = { + 0x00020, 0x00030, 0x00040, 0x00050, 0x00060, 0x00070, 0x00080, 0x00090, + 0x000a0, 0x000b0, 0x000c0, 0x000e0, 0x000f0, 0x00110, 0x00120, 0x00130, + 0x00150, 0x00160, 0x00170, 0x00190, 0x001d0, 0x00210, 0x00240, 0x002a0, + 0x00330, 0x00370, 0x003a0, 0x00400, 0x00430, 0x004a0, 0x00530, 0x00610, + 0x00730, 0x00840, 0x00910, 0x009c0, 0x00a60, 0x00b10, 0x00ca0, 0x00e00, + 0x00fb0, 0x01030, 0x01130, 0x011f0, 0x01490, 0x01650, 0x01930, 0x02010, + 0x02190, 0x02490, 0x02850, 0x02d50, 0x03010, 0x03210, 0x03c90, 0x04090, + 0x04510, 0x04810, 0x05c10, 0x06f10, 0x07310, 0x08010, 0x0c010, 0x10010, + }; + static const uptr SizeDelta = 16; +#endif +}; + +typedef TableSizeClassMap<AndroidSizeClassConfig> AndroidSizeClassMap; + +struct DefaultSizeClassConfig { + static const uptr NumBits = 3; + static const uptr MinSizeLog = 5; + static const uptr MidSizeLog = 8; + static const uptr MaxSizeLog = 17; + static const u32 MaxNumCachedHint = 8; + static const uptr MaxBytesCachedLog = 10; }; -typedef SizeClassMap<3, 5, 8, 17, 8, 10> DefaultSizeClassMap; +typedef FixedSizeClassMap<DefaultSizeClassConfig> DefaultSizeClassMap; -// TODO(kostyak): further tune class maps for Android & Fuchsia. 
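To make the FixedSizeClassMap progression concrete, here is a small standalone program reproducing the getSizeByClassId() formula above for a DefaultSizeClassConfig-shaped config (NumBits = 3, MinSizeLog = 5, MidSizeLog = 8); the 16-byte SizeDelta is an assumption standing in for Chunk::getHeaderSize():

    #include <cstdint>
    #include <cstdio>

    using uptr = uintptr_t;

    int main() {
      const uptr NumBits = 3, MinSizeLog = 5, MidSizeLog = 8, SizeDelta = 16;
      const uptr MinSize = 1UL << MinSizeLog;  // 32
      const uptr MidSize = 1UL << MidSizeLog;  // 256
      const uptr MidClass = MidSize / MinSize; // 8 linearly spaced classes
      const uptr S = NumBits - 1;              // 4 sub-classes per power of two
      const uptr M = (1UL << S) - 1;

      // Class 0 is the batch class, so start at 1.
      for (uptr ClassId = 1; ClassId <= MidClass + 8; ClassId++) {
        uptr Size;
        if (ClassId <= MidClass) {
          Size = (ClassId << MinSizeLog) + SizeDelta; // linear: 48, 80, 112, ...
        } else {
          const uptr Id = ClassId - MidClass;
          const uptr T = MidSize << (Id >> S);
          Size = T + (T >> S) * (Id & M) + SizeDelta; // geometric, 4 steps/octave
        }
        std::printf("class %2zu -> %zu bytes\n", (size_t)ClassId, (size_t)Size);
      }
    }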
+struct SvelteSizeClassConfig { #if SCUDO_WORDSIZE == 64U -typedef SizeClassMap<4, 4, 8, 14, 4, 10> SvelteSizeClassMap; -typedef SizeClassMap<3, 5, 8, 17, 14, 14> AndroidSizeClassMap; + static const uptr NumBits = 4; + static const uptr MinSizeLog = 4; + static const uptr MidSizeLog = 8; + static const uptr MaxSizeLog = 14; + static const u32 MaxNumCachedHint = 4; + static const uptr MaxBytesCachedLog = 10; #else -typedef SizeClassMap<4, 3, 7, 14, 5, 10> SvelteSizeClassMap; -typedef SizeClassMap<3, 5, 8, 17, 14, 14> AndroidSizeClassMap; + static const uptr NumBits = 4; + static const uptr MinSizeLog = 3; + static const uptr MidSizeLog = 7; + static const uptr MaxSizeLog = 14; + static const u32 MaxNumCachedHint = 5; + static const uptr MaxBytesCachedLog = 10; #endif +}; + +typedef FixedSizeClassMap<SvelteSizeClassConfig> SvelteSizeClassMap; + +template <typename SCMap> inline void printMap() { + ScopedString Buffer(1024); + uptr PrevS = 0; + uptr TotalCached = 0; + for (uptr I = 0; I < SCMap::NumClasses; I++) { + if (I == SCMap::BatchClassId) + continue; + const uptr S = SCMap::getSizeByClassId(I); + const uptr D = S - PrevS; + const uptr P = PrevS ? (D * 100 / PrevS) : 0; + const uptr L = S ? getMostSignificantSetBitIndex(S) : 0; + const uptr Cached = SCMap::getMaxCachedHint(S) * S; + Buffer.append( + "C%02zu => S: %zu diff: +%zu %02zu%% L %zu Cached: %zu %zu; id %zu\n", + I, S, D, P, L, SCMap::getMaxCachedHint(S), Cached, + SCMap::getClassIdBySize(S)); + TotalCached += Cached; + PrevS = S; + } + Buffer.append("Total Cached: %zu\n", TotalCached); + Buffer.output(); +} +template <typename SCMap> static void validateMap() { + for (uptr C = 0; C < SCMap::NumClasses; C++) { + if (C == SCMap::BatchClassId) + continue; + const uptr S = SCMap::getSizeByClassId(C); + CHECK_NE(S, 0U); + CHECK_EQ(SCMap::getClassIdBySize(S), C); + if (C < SCMap::LargestClassId) + CHECK_EQ(SCMap::getClassIdBySize(S + 1), C + 1); + CHECK_EQ(SCMap::getClassIdBySize(S - 1), C); + if (C - 1 != SCMap::BatchClassId) + CHECK_GT(SCMap::getSizeByClassId(C), SCMap::getSizeByClassId(C - 1)); + } + // Do not perform the loop if the maximum size is too large. + if (SCMap::MaxSize > (1 << 19)) + return; + for (uptr S = 1; S <= SCMap::MaxSize; S++) { + const uptr C = SCMap::getClassIdBySize(S); + CHECK_LT(C, SCMap::NumClasses); + CHECK_GE(SCMap::getSizeByClassId(C), S); + if (C - 1 != SCMap::BatchClassId) + CHECK_LT(SCMap::getSizeByClassId(C - 1), S); + } +} } // namespace scudo #endif // SCUDO_SIZE_CLASS_MAP_H_ diff --git a/compiler-rt/lib/scudo/standalone/stack_depot.h b/compiler-rt/lib/scudo/standalone/stack_depot.h new file mode 100644 index 000000000000..f2f4d9597795 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/stack_depot.h @@ -0,0 +1,144 @@ +//===-- stack_depot.h -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_STACK_DEPOT_H_ +#define SCUDO_STACK_DEPOT_H_ + +#include "atomic_helpers.h" +#include "mutex.h" + +namespace scudo { + +class MurMur2HashBuilder { + static const u32 M = 0x5bd1e995; + static const u32 Seed = 0x9747b28c; + static const u32 R = 24; + u32 H; + + public: + explicit MurMur2HashBuilder(u32 Init = 0) { H = Seed ^ Init; } + void add(u32 K) { + K *= M; + K ^= K >> R; + K *= M; + H *= M; + H ^= K; + } + u32 get() { + u32 X = H; + X ^= X >> 13; + X *= M; + X ^= X >> 15; + return X; + } +}; + +class StackDepot { + HybridMutex RingEndMu; + u32 RingEnd; + + // This data structure stores a stack trace for each allocation and + // deallocation when stack trace recording is enabled, that may be looked up + // using a hash of the stack trace. The lower bits of the hash are an index + // into the Tab array, which stores an index into the Ring array where the + // stack traces are stored. As the name implies, Ring is a ring buffer, so a + // stack trace may wrap around to the start of the array. + // + // Each stack trace in Ring is prefixed by a stack trace marker consisting of + // a fixed 1 bit in bit 0 (this allows disambiguation between stack frames + // and stack trace markers in the case where instruction pointers are 4-byte + // aligned, as they are on arm64), the stack trace hash in bits 1-32, and the + // size of the stack trace in bits 33-63. + // + // The insert() function is potentially racy in its accesses to the Tab and + // Ring arrays, but find() is resilient to races in the sense that, barring + // hash collisions, it will either return the correct stack trace or no stack + // trace at all, even if two instances of insert() raced with one another. + // This is achieved by re-checking the hash of the stack trace before + // returning the trace. + +#ifdef SCUDO_FUZZ + // Use smaller table sizes for fuzzing in order to reduce input size. + static const uptr TabBits = 4; +#else + static const uptr TabBits = 16; +#endif + static const uptr TabSize = 1 << TabBits; + static const uptr TabMask = TabSize - 1; + atomic_u32 Tab[TabSize]; + +#ifdef SCUDO_FUZZ + static const uptr RingBits = 4; +#else + static const uptr RingBits = 19; +#endif + static const uptr RingSize = 1 << RingBits; + static const uptr RingMask = RingSize - 1; + atomic_u64 Ring[RingSize]; + +public: + // Insert hash of the stack trace [Begin, End) into the stack depot, and + // return the hash. + u32 insert(uptr *Begin, uptr *End) { + MurMur2HashBuilder B; + for (uptr *I = Begin; I != End; ++I) + B.add(u32(*I) >> 2); + u32 Hash = B.get(); + + u32 Pos = Hash & TabMask; + u32 RingPos = atomic_load_relaxed(&Tab[Pos]); + u64 Entry = atomic_load_relaxed(&Ring[RingPos]); + u64 Id = (u64(End - Begin) << 33) | (u64(Hash) << 1) | 1; + if (Entry == Id) + return Hash; + + ScopedLock Lock(RingEndMu); + RingPos = RingEnd; + atomic_store_relaxed(&Tab[Pos], RingPos); + atomic_store_relaxed(&Ring[RingPos], Id); + for (uptr *I = Begin; I != End; ++I) { + RingPos = (RingPos + 1) & RingMask; + atomic_store_relaxed(&Ring[RingPos], *I); + } + RingEnd = (RingPos + 1) & RingMask; + return Hash; + } + + // Look up a stack trace by hash. Returns true if successful. The trace may be + // accessed via operator[] passing indexes between *RingPosPtr and + // *RingPosPtr + *SizePtr. 
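A tiny check of the marker layout described above (a fixed 1 in bit 0, the 32-bit hash in bits 1-32, the trace size in bits 33-63); the hash and size values are arbitrary:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Hash = 0xdeadbeef;
      const uint64_t Size = 12; // number of frames in the trace
      const uint64_t Id = (Size << 33) | (uint64_t(Hash) << 1) | 1;

      assert((Id & 1) == 1);                    // marker bit distinguishes it
                                                // from 4-byte-aligned frames
      assert(((Id >> 1) & 0xffffffff) == Hash); // bits 1-32
      assert((Id >> 33) == Size);               // bits 33-63
    }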
+ bool find(u32 Hash, uptr *RingPosPtr, uptr *SizePtr) const { + u32 Pos = Hash & TabMask; + u32 RingPos = atomic_load_relaxed(&Tab[Pos]); + if (RingPos >= RingSize) + return false; + u64 Entry = atomic_load_relaxed(&Ring[RingPos]); + u64 HashWithTagBit = (u64(Hash) << 1) | 1; + if ((Entry & 0x1ffffffff) != HashWithTagBit) + return false; + u32 Size = u32(Entry >> 33); + if (Size >= RingSize) + return false; + *RingPosPtr = (RingPos + 1) & RingMask; + *SizePtr = Size; + MurMur2HashBuilder B; + for (uptr I = 0; I != Size; ++I) { + RingPos = (RingPos + 1) & RingMask; + B.add(u32(atomic_load_relaxed(&Ring[RingPos])) >> 2); + } + return B.get() == Hash; + } + + u64 operator[](uptr RingPos) const { + return atomic_load_relaxed(&Ring[RingPos & RingMask]); + } +}; + +} // namespace scudo + +#endif // SCUDO_STACK_DEPOT_H_ diff --git a/compiler-rt/lib/scudo/standalone/stats.h b/compiler-rt/lib/scudo/standalone/stats.h index 38481e98e48d..d76b904949ea 100644 --- a/compiler-rt/lib/scudo/standalone/stats.h +++ b/compiler-rt/lib/scudo/standalone/stats.h @@ -58,7 +58,9 @@ class GlobalStats : public LocalStats { public: void initLinkerInitialized() {} void init() { - memset(this, 0, sizeof(*this)); + LocalStats::init(); + Mutex.init(); + StatsList = {}; initLinkerInitialized(); } diff --git a/compiler-rt/lib/scudo/standalone/tsd.h b/compiler-rt/lib/scudo/standalone/tsd.h index 20f0d69cabfd..b3701c63f8a9 100644 --- a/compiler-rt/lib/scudo/standalone/tsd.h +++ b/compiler-rt/lib/scudo/standalone/tsd.h @@ -23,7 +23,7 @@ namespace scudo { -template <class Allocator> struct ALIGNED(SCUDO_CACHE_LINE_SIZE) TSD { +template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD { typename Allocator::CacheT Cache; typename Allocator::QuarantineCacheT QuarantineCache; u8 DestructorIterations; diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h index 69479ea7bdf4..3492509b5a8e 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h +++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h @@ -25,9 +25,7 @@ template <class Allocator> struct TSDRegistryExT { void initLinkerInitialized(Allocator *Instance) { Instance->initLinkerInitialized(); CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread<Allocator>), 0); - FallbackTSD = reinterpret_cast<TSD<Allocator> *>( - map(nullptr, sizeof(TSD<Allocator>), "scudo:tsd")); - FallbackTSD->initLinkerInitialized(Instance); + FallbackTSD.initLinkerInitialized(Instance); Initialized = true; } void init(Allocator *Instance) { @@ -35,9 +33,7 @@ template <class Allocator> struct TSDRegistryExT { initLinkerInitialized(Instance); } - void unmapTestOnly() { - unmap(reinterpret_cast<void *>(FallbackTSD), sizeof(TSD<Allocator>)); - } + void unmapTestOnly() {} ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, bool MinimalInit) { if (LIKELY(State != ThreadState::NotInitialized)) @@ -51,23 +47,22 @@ template <class Allocator> struct TSDRegistryExT { *UnlockRequired = false; return &ThreadTSD; } - DCHECK(FallbackTSD); - FallbackTSD->lock(); + FallbackTSD.lock(); *UnlockRequired = true; - return FallbackTSD; + return &FallbackTSD; } // To disable the exclusive TSD registry, we effectively lock the fallback TSD // and force all threads to attempt to use it instead of their local one. 
void disable() { Mutex.lock(); - FallbackTSD->lock(); + FallbackTSD.lock(); atomic_store(&Disabled, 1U, memory_order_release); } void enable() { atomic_store(&Disabled, 0U, memory_order_release); - FallbackTSD->unlock(); + FallbackTSD.unlock(); Mutex.unlock(); } @@ -96,7 +91,7 @@ private: pthread_key_t PThreadKey; bool Initialized; atomic_u8 Disabled; - TSD<Allocator> *FallbackTSD; + TSD<Allocator> FallbackTSD; HybridMutex Mutex; static THREADLOCAL ThreadState State; static THREADLOCAL TSD<Allocator> ThreadTSD; diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h index 5ab8269519a9..038a5905ff48 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_shared.h +++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h @@ -18,9 +18,10 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT { void initLinkerInitialized(Allocator *Instance) { Instance->initLinkerInitialized(); CHECK_EQ(pthread_key_create(&PThreadKey, nullptr), 0); // For non-TLS - NumberOfTSDs = Min(Max(1U, getNumberOfCPUs()), MaxTSDCount); - TSDs = reinterpret_cast<TSD<Allocator> *>( - map(nullptr, sizeof(TSD<Allocator>) * NumberOfTSDs, "scudo:tsd")); + const u32 NumberOfCPUs = getNumberOfCPUs(); + NumberOfTSDs = (SCUDO_ANDROID || NumberOfCPUs == 0) + ? MaxTSDCount + : Min(NumberOfCPUs, MaxTSDCount); for (u32 I = 0; I < NumberOfTSDs; I++) TSDs[I].initLinkerInitialized(Instance); // Compute all the coprimes of NumberOfTSDs. This will be used to walk the @@ -46,8 +47,6 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT { } void unmapTestOnly() { - unmap(reinterpret_cast<void *>(TSDs), - sizeof(TSD<Allocator>) * NumberOfTSDs); setCurrentTSD(nullptr); pthread_key_delete(PThreadKey); } @@ -77,7 +76,7 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT { } void enable() { - for (s32 I = NumberOfTSDs - 1; I >= 0; I--) + for (s32 I = static_cast<s32>(NumberOfTSDs - 1); I >= 0; I--) TSDs[I].unlock(); Mutex.unlock(); } @@ -160,11 +159,11 @@ private: pthread_key_t PThreadKey; atomic_u32 CurrentIndex; u32 NumberOfTSDs; - TSD<Allocator> *TSDs; u32 NumberOfCoPrimes; u32 CoPrimes[MaxTSDCount]; bool Initialized; HybridMutex Mutex; + TSD<Allocator> TSDs[MaxTSDCount]; #if SCUDO_LINUX && !_BIONIC static THREADLOCAL TSD<Allocator> *ThreadTSD; #endif diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.cpp b/compiler-rt/lib/scudo/standalone/wrappers_c.cpp index 93a666c4d61e..098cc089a1ca 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c.cpp +++ b/compiler-rt/lib/scudo/standalone/wrappers_c.cpp @@ -22,13 +22,11 @@ #define SCUDO_ALLOCATOR Allocator extern "C" void SCUDO_PREFIX(malloc_postinit)(); -static scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)> - SCUDO_ALLOCATOR; -// Pointer to the static allocator so that the C++ wrappers can access it. + +// Export the static allocator so that the C++ wrappers can access it. // Technically we could have a completely separated heap for C & C++ but in // reality the amount of cross pollination between the two is staggering. 
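The comment about computing "all the coprimes of NumberOfTSDs" in the shared-registry hunk above refers to probing TSD slots with a stride that is co-prime to NumberOfTSDs, so a contended thread visits every slot once before repeating. A rough sketch of that idea, illustrative rather than the patch's code:

static scudo::u32 computeGCD(scudo::u32 A, scudo::u32 B) {
  while (B != 0) {
    const scudo::u32 T = A % B;
    A = B;
    B = T;
  }
  return A;
}

// Striding by any value co-prime to N cycles through all N slots exactly once:
//   Index = (Index + Stride) % N, with computeGCD(Stride, N) == 1.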
-scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)> * - CONCATENATE(SCUDO_ALLOCATOR, Ptr) = &SCUDO_ALLOCATOR; +scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)> SCUDO_ALLOCATOR; #include "wrappers_c.inc" diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc index 2fd709eaa1f6..4396dfc50d1d 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc +++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc @@ -150,13 +150,25 @@ INTERFACE WEAK void SCUDO_PREFIX(malloc_disable)() { } void SCUDO_PREFIX(malloc_postinit)() { + SCUDO_ALLOCATOR.initGwpAsan(); pthread_atfork(SCUDO_PREFIX(malloc_disable), SCUDO_PREFIX(malloc_enable), SCUDO_PREFIX(malloc_enable)); } INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, UNUSED int value) { if (param == M_DECAY_TIME) { - // TODO(kostyak): set release_to_os_interval_ms accordingly. + if (SCUDO_ANDROID) { + if (value == 0) { + // Will set the release values to their minimum values. + value = INT32_MIN; + } else { + // Will set the release values to their maximum values. + value = INT32_MAX; + } + } + + SCUDO_ALLOCATOR.setOption(scudo::Option::ReleaseInterval, + static_cast<scudo::sptr>(value)); return 1; } else if (param == M_PURGE) { SCUDO_ALLOCATOR.releaseToOS(); @@ -179,9 +191,56 @@ INTERFACE WEAK void *SCUDO_PREFIX(aligned_alloc)(size_t alignment, } INTERFACE WEAK int SCUDO_PREFIX(malloc_info)(UNUSED int options, FILE *stream) { - fputs("<malloc version=\"scudo-1\">", stream); - fputs("</malloc>", stream); + const scudo::uptr max_size = + decltype(SCUDO_ALLOCATOR)::PrimaryT::SizeClassMap::MaxSize; + auto *sizes = static_cast<scudo::uptr *>( + SCUDO_PREFIX(calloc)(max_size, sizeof(scudo::uptr))); + auto callback = [](uintptr_t, size_t size, void *arg) { + auto *sizes = reinterpret_cast<scudo::uptr *>(arg); + if (size < max_size) + sizes[size]++; + }; + SCUDO_ALLOCATOR.iterateOverChunks(0, -1ul, callback, sizes); + + fputs("<malloc version=\"scudo-1\">\n", stream); + for (scudo::uptr i = 0; i != max_size; ++i) + if (sizes[i]) + fprintf(stream, "<alloc size=\"%lu\" count=\"%lu\"/>\n", i, sizes[i]); + fputs("</malloc>\n", stream); + SCUDO_PREFIX(free)(sizes); return 0; } +// Disable memory tagging for the heap. The caller must disable memory tag +// checks globally (e.g. by clearing TCF0 on aarch64) before calling this +// function, and may not re-enable them after calling the function. The program +// must be single threaded at the point when the function is called. +INTERFACE WEAK void SCUDO_PREFIX(malloc_disable_memory_tagging)() { + SCUDO_ALLOCATOR.disableMemoryTagging(); +} + +// Sets whether scudo records stack traces and other metadata for allocations +// and deallocations. This function only has an effect if the allocator and +// hardware support memory tagging. The program must be single threaded at the +// point when the function is called. +INTERFACE WEAK void +SCUDO_PREFIX(malloc_set_track_allocation_stacks)(int track) { + SCUDO_ALLOCATOR.setTrackAllocationStacks(track); +} + +// Sets whether scudo zero-initializes all allocated memory. The program must +// be single threaded at the point when the function is called. +INTERFACE WEAK void SCUDO_PREFIX(malloc_set_zero_contents)(int zero_contents) { + SCUDO_ALLOCATOR.setFillContents(zero_contents ? scudo::ZeroFill + : scudo::NoFill); +} + +// Sets whether scudo pattern-initializes all allocated memory. The program must +// be single threaded at the point when the function is called. 
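The mallopt() and malloc_info() changes above are visible to ordinary client code. A hedged usage sketch, assuming an Android-style <malloc.h> that defines M_DECAY_TIME and M_PURGE; the exact XML output depends on what is currently allocated:

#include <malloc.h>
#include <stdio.h>

int main() {
  // Non-zero decay time: release intervals are pushed to their maximum.
  mallopt(M_DECAY_TIME, 1);
  // Zero decay time: release intervals are pushed to their minimum.
  mallopt(M_DECAY_TIME, 0);
  // Release free memory back to the OS right away.
  mallopt(M_PURGE, 0);
  // Emits <malloc version="scudo-1"> with one <alloc size=... count=.../> entry
  // per in-use chunk size below the primary's MaxSize.
  malloc_info(0, stdout);
  return 0;
}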
+INTERFACE WEAK void +SCUDO_PREFIX(malloc_set_pattern_fill_contents)(int pattern_fill_contents) { + SCUDO_ALLOCATOR.setFillContents( + pattern_fill_contents ? scudo::PatternOrZeroFill : scudo::NoFill); +} + } // extern "C" diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp b/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp index f004369d96cb..4298e69b5774 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp +++ b/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp @@ -25,11 +25,6 @@ extern "C" void SCUDO_PREFIX(malloc_postinit)(); static scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)> SCUDO_ALLOCATOR; -// Pointer to the static allocator so that the C++ wrappers can access it. -// Technically we could have a completely separated heap for C & C++ but in -// reality the amount of cross pollination between the two is staggering. -scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)> * - CONCATENATE(SCUDO_ALLOCATOR, Ptr) = &SCUDO_ALLOCATOR; #include "wrappers_c.inc" @@ -44,22 +39,37 @@ extern "C" void SCUDO_PREFIX(malloc_postinit)(); static scudo::Allocator<scudo::AndroidSvelteConfig, SCUDO_PREFIX(malloc_postinit)> SCUDO_ALLOCATOR; -// Pointer to the static allocator so that the C++ wrappers can access it. -// Technically we could have a completely separated heap for C & C++ but in -// reality the amount of cross pollination between the two is staggering. -scudo::Allocator<scudo::AndroidSvelteConfig, SCUDO_PREFIX(malloc_postinit)> * - CONCATENATE(SCUDO_ALLOCATOR, Ptr) = &SCUDO_ALLOCATOR; #include "wrappers_c.inc" #undef SCUDO_ALLOCATOR #undef SCUDO_PREFIX -// The following is the only function that will end up initializing both -// allocators, which will result in a slight increase in memory footprint. -INTERFACE void __scudo_print_stats(void) { - Allocator.printStats(); - SvelteAllocator.printStats(); +// TODO(kostyak): support both allocators. 
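The memory-tagging and fill-content knobs defined above are meant to be flipped while the process is still single-threaded, typically during early startup. A hedged sketch of such a sequence; the un-prefixed names assume the default SCUDO_PREFIX expansion, and the declarations stand in for the scudo interface header:

extern "C" void malloc_set_track_allocation_stacks(int track);
extern "C" void malloc_set_zero_contents(int zero_contents);
extern "C" void malloc_set_pattern_fill_contents(int pattern_fill_contents);
extern "C" void malloc_disable_memory_tagging(void);

int main() {
  // Still single-threaded here, as the comments above require.
  malloc_set_track_allocation_stacks(1); // record alloc/dealloc stacks on MTE-capable builds
  malloc_set_zero_contents(1);           // zero-initialize every allocation
  // Alternatively: malloc_set_pattern_fill_contents(1); or, once tag checks are
  // disabled globally, malloc_disable_memory_tagging();
  // ... create threads and run the application ...
  return 0;
}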
+INTERFACE void __scudo_print_stats(void) { Allocator.printStats(); } + +INTERFACE void __scudo_get_error_info( + struct scudo_error_info *error_info, uintptr_t fault_addr, + const char *stack_depot, const char *region_info, const char *memory, + const char *memory_tags, uintptr_t memory_addr, size_t memory_size) { + Allocator.getErrorInfo(error_info, fault_addr, stack_depot, region_info, + memory, memory_tags, memory_addr, memory_size); +} + +INTERFACE const char *__scudo_get_stack_depot_addr() { + return Allocator.getStackDepotAddress(); +} + +INTERFACE size_t __scudo_get_stack_depot_size() { + return sizeof(scudo::StackDepot); +} + +INTERFACE const char *__scudo_get_region_info_addr() { + return Allocator.getRegionInfoArrayAddress(); +} + +INTERFACE size_t __scudo_get_region_info_size() { + return Allocator.getRegionInfoArraySize(); } #endif // SCUDO_ANDROID && _BIONIC diff --git a/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp b/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp index 1da5385c7789..adb104118123 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp +++ b/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp @@ -16,7 +16,7 @@ #include <stdint.h> extern "C" void malloc_postinit(); -extern scudo::Allocator<scudo::Config, malloc_postinit> *AllocatorPtr; +extern HIDDEN scudo::Allocator<scudo::Config, malloc_postinit> Allocator; namespace std { struct nothrow_t {}; @@ -24,85 +24,85 @@ enum class align_val_t : size_t {}; } // namespace std INTERFACE WEAK void *operator new(size_t size) { - return AllocatorPtr->allocate(size, scudo::Chunk::Origin::New); + return Allocator.allocate(size, scudo::Chunk::Origin::New); } INTERFACE WEAK void *operator new[](size_t size) { - return AllocatorPtr->allocate(size, scudo::Chunk::Origin::NewArray); + return Allocator.allocate(size, scudo::Chunk::Origin::NewArray); } INTERFACE WEAK void *operator new(size_t size, std::nothrow_t const &) NOEXCEPT { - return AllocatorPtr->allocate(size, scudo::Chunk::Origin::New); + return Allocator.allocate(size, scudo::Chunk::Origin::New); } INTERFACE WEAK void *operator new[](size_t size, std::nothrow_t const &) NOEXCEPT { - return AllocatorPtr->allocate(size, scudo::Chunk::Origin::NewArray); + return Allocator.allocate(size, scudo::Chunk::Origin::NewArray); } INTERFACE WEAK void *operator new(size_t size, std::align_val_t align) { - return AllocatorPtr->allocate(size, scudo::Chunk::Origin::New, - static_cast<scudo::uptr>(align)); + return Allocator.allocate(size, scudo::Chunk::Origin::New, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void *operator new[](size_t size, std::align_val_t align) { - return AllocatorPtr->allocate(size, scudo::Chunk::Origin::NewArray, - static_cast<scudo::uptr>(align)); + return Allocator.allocate(size, scudo::Chunk::Origin::NewArray, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void *operator new(size_t size, std::align_val_t align, std::nothrow_t const &) NOEXCEPT { - return AllocatorPtr->allocate(size, scudo::Chunk::Origin::New, - static_cast<scudo::uptr>(align)); + return Allocator.allocate(size, scudo::Chunk::Origin::New, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void *operator new[](size_t size, std::align_val_t align, std::nothrow_t const &) NOEXCEPT { - return AllocatorPtr->allocate(size, scudo::Chunk::Origin::NewArray, - static_cast<scudo::uptr>(align)); + return Allocator.allocate(size, scudo::Chunk::Origin::NewArray, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void operator delete(void *ptr)NOEXCEPT { - 
AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::New); + Allocator.deallocate(ptr, scudo::Chunk::Origin::New); } INTERFACE WEAK void operator delete[](void *ptr) NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::NewArray); + Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray); } INTERFACE WEAK void operator delete(void *ptr, std::nothrow_t const &)NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::New); + Allocator.deallocate(ptr, scudo::Chunk::Origin::New); } INTERFACE WEAK void operator delete[](void *ptr, std::nothrow_t const &) NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::NewArray); + Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray); } INTERFACE WEAK void operator delete(void *ptr, size_t size)NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::New, size); + Allocator.deallocate(ptr, scudo::Chunk::Origin::New, size); } INTERFACE WEAK void operator delete[](void *ptr, size_t size) NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::NewArray, size); + Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, size); } INTERFACE WEAK void operator delete(void *ptr, std::align_val_t align)NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::New, 0, - static_cast<scudo::uptr>(align)); + Allocator.deallocate(ptr, scudo::Chunk::Origin::New, 0, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void operator delete[](void *ptr, std::align_val_t align) NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::NewArray, 0, - static_cast<scudo::uptr>(align)); + Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, 0, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void operator delete(void *ptr, std::align_val_t align, std::nothrow_t const &)NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::New, 0, - static_cast<scudo::uptr>(align)); + Allocator.deallocate(ptr, scudo::Chunk::Origin::New, 0, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void operator delete[](void *ptr, std::align_val_t align, std::nothrow_t const &) NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::NewArray, 0, - static_cast<scudo::uptr>(align)); + Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, 0, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void operator delete(void *ptr, size_t size, std::align_val_t align)NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::New, size, - static_cast<scudo::uptr>(align)); + Allocator.deallocate(ptr, scudo::Chunk::Origin::New, size, + static_cast<scudo::uptr>(align)); } INTERFACE WEAK void operator delete[](void *ptr, size_t size, std::align_val_t align) NOEXCEPT { - AllocatorPtr->deallocate(ptr, scudo::Chunk::Origin::NewArray, size, - static_cast<scudo::uptr>(align)); + Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, size, + static_cast<scudo::uptr>(align)); } #endif // !SCUDO_ANDROID || !_BIONIC |
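Finally, the Bionic-only accessors added above let an out-of-process crash handler replay a tagging fault through scudo's diagnostics: it snapshots the ranges reported by __scudo_get_stack_depot_addr()/__scudo_get_stack_depot_size() and __scudo_get_region_info_addr()/__scudo_get_region_info_size() in the crashed process and feeds them to __scudo_get_error_info(). A hedged sketch of that last step; the *Copy buffers, the fault details, and the "scudo/interface.h" include path are assumptions:

#include "scudo/interface.h" // assumed to declare scudo_error_info and the accessors
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Hypothetical snapshots captured by the crash handler from the crashed process.
extern const char *DepotCopy, *RegionInfoCopy, *MemoryCopy, *MemoryTagsCopy;
extern uintptr_t FaultAddr, MemoryAddr;
extern size_t MemorySize;

void diagnoseTagFault() {
  scudo_error_info Info;
  memset(&Info, 0, sizeof(Info));
  __scudo_get_error_info(&Info, FaultAddr, DepotCopy, RegionInfoCopy, MemoryCopy,
                         MemoryTagsCopy, MemoryAddr, MemorySize);
  // Info now carries scudo's best-effort diagnosis for the faulting access,
  // including recorded allocation/deallocation stacks when tracking is enabled.
}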