path: root/compiler-rt/lib/scudo
Diffstat (limited to 'compiler-rt/lib/scudo')
-rw-r--r--  compiler-rt/lib/scudo/standalone/allocator_common.h | 85
-rw-r--r--  compiler-rt/lib/scudo/standalone/allocator_config.h | 53
-rw-r--r--  compiler-rt/lib/scudo/standalone/atomic_helpers.h | 8
-rw-r--r--  compiler-rt/lib/scudo/standalone/chunk.h | 13
-rw-r--r--  compiler-rt/lib/scudo/standalone/combined.h | 217
-rw-r--r--  compiler-rt/lib/scudo/standalone/common.cpp | 18
-rw-r--r--  compiler-rt/lib/scudo/standalone/common.h | 29
-rw-r--r--  compiler-rt/lib/scudo/standalone/condition_variable.h | 60
-rw-r--r--  compiler-rt/lib/scudo/standalone/condition_variable_base.h | 56
-rw-r--r--  compiler-rt/lib/scudo/standalone/condition_variable_linux.cpp | 52
-rw-r--r--  compiler-rt/lib/scudo/standalone/condition_variable_linux.h | 38
-rw-r--r--  compiler-rt/lib/scudo/standalone/flags.cpp | 3
-rw-r--r--  compiler-rt/lib/scudo/standalone/flags.inc | 12
-rw-r--r--  compiler-rt/lib/scudo/standalone/flags_parser.cpp | 24
-rw-r--r--  compiler-rt/lib/scudo/standalone/flags_parser.h | 3
-rw-r--r--  compiler-rt/lib/scudo/standalone/fuzz/get_error_info_fuzzer.cpp | 9
-rw-r--r--  compiler-rt/lib/scudo/standalone/include/scudo/interface.h | 17
-rw-r--r--  compiler-rt/lib/scudo/standalone/linux.cpp | 50
-rw-r--r--  compiler-rt/lib/scudo/standalone/local_cache.h | 125
-rw-r--r--  compiler-rt/lib/scudo/standalone/mem_map.h | 5
-rw-r--r--  compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp | 12
-rw-r--r--  compiler-rt/lib/scudo/standalone/mem_map_linux.cpp | 153
-rw-r--r--  compiler-rt/lib/scudo/standalone/mem_map_linux.h | 67
-rw-r--r--  compiler-rt/lib/scudo/standalone/mutex.h | 19
-rw-r--r--  compiler-rt/lib/scudo/standalone/options.h | 2
-rw-r--r--  compiler-rt/lib/scudo/standalone/platform.h | 14
-rw-r--r--  compiler-rt/lib/scudo/standalone/primary32.h | 294
-rw-r--r--  compiler-rt/lib/scudo/standalone/primary64.h | 541
-rw-r--r--  compiler-rt/lib/scudo/standalone/release.cpp | 3
-rw-r--r--  compiler-rt/lib/scudo/standalone/release.h | 171
-rw-r--r--  compiler-rt/lib/scudo/standalone/report.cpp | 33
-rw-r--r--  compiler-rt/lib/scudo/standalone/report.h | 8
-rw-r--r--  compiler-rt/lib/scudo/standalone/report_linux.cpp | 58
-rw-r--r--  compiler-rt/lib/scudo/standalone/report_linux.h | 34
-rw-r--r--  compiler-rt/lib/scudo/standalone/rss_limit_checker.cpp | 37
-rw-r--r--  compiler-rt/lib/scudo/standalone/rss_limit_checker.h | 63
-rw-r--r--  compiler-rt/lib/scudo/standalone/secondary.h | 190
-rw-r--r--  compiler-rt/lib/scudo/standalone/size_class_map.h | 26
-rw-r--r--  compiler-rt/lib/scudo/standalone/stack_depot.h | 5
-rw-r--r--  compiler-rt/lib/scudo/standalone/trusty.cpp | 6
-rw-r--r--  compiler-rt/lib/scudo/standalone/tsd.h | 17
-rw-r--r--  compiler-rt/lib/scudo/standalone/tsd_shared.h | 5
-rw-r--r--  compiler-rt/lib/scudo/standalone/vector.h | 51
-rw-r--r--  compiler-rt/lib/scudo/standalone/wrappers_c.cpp | 3
-rw-r--r--  compiler-rt/lib/scudo/standalone/wrappers_c.inc | 108
-rw-r--r--  compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp | 21
-rw-r--r--  compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp | 66
47 files changed, 1840 insertions(+), 1044 deletions(-)
diff --git a/compiler-rt/lib/scudo/standalone/allocator_common.h b/compiler-rt/lib/scudo/standalone/allocator_common.h
new file mode 100644
index 000000000000..95f4776ac596
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/allocator_common.h
@@ -0,0 +1,85 @@
+//===-- allocator_common.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_ALLOCATOR_COMMON_H_
+#define SCUDO_ALLOCATOR_COMMON_H_
+
+#include "common.h"
+#include "list.h"
+
+namespace scudo {
+
+template <class SizeClassAllocator> struct TransferBatch {
+ typedef typename SizeClassAllocator::SizeClassMap SizeClassMap;
+ typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;
+
+ static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint;
+ void setFromArray(CompactPtrT *Array, u16 N) {
+ DCHECK_LE(N, MaxNumCached);
+ Count = N;
+ memcpy(Batch, Array, sizeof(Batch[0]) * Count);
+ }
+ void appendFromArray(CompactPtrT *Array, u16 N) {
+ DCHECK_LE(N, MaxNumCached - Count);
+ memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
+ // u16 will be promoted to int by arithmetic type conversion.
+ Count = static_cast<u16>(Count + N);
+ }
+ void appendFromTransferBatch(TransferBatch *B, u16 N) {
+ DCHECK_LE(N, MaxNumCached - Count);
+ DCHECK_GE(B->Count, N);
+ // Append from the back of `B`.
+ memcpy(Batch + Count, B->Batch + (B->Count - N), sizeof(Batch[0]) * N);
+ // u16 will be promoted to int by arithmetic type conversion.
+ Count = static_cast<u16>(Count + N);
+ B->Count = static_cast<u16>(B->Count - N);
+ }
+ void clear() { Count = 0; }
+ void add(CompactPtrT P) {
+ DCHECK_LT(Count, MaxNumCached);
+ Batch[Count++] = P;
+ }
+ void moveToArray(CompactPtrT *Array) {
+ memcpy(Array, Batch, sizeof(Batch[0]) * Count);
+ clear();
+ }
+ u16 getCount() const { return Count; }
+ bool isEmpty() const { return Count == 0U; }
+ CompactPtrT get(u16 I) const {
+ DCHECK_LE(I, Count);
+ return Batch[I];
+ }
+ TransferBatch *Next;
+
+private:
+ CompactPtrT Batch[MaxNumCached];
+ u16 Count;
+};
+
+// A BatchGroup is used to collect blocks. Each group has a group id to
+// identify which group the contained blocks belong to.
+template <class SizeClassAllocator> struct BatchGroup {
+ // `Next` is used by IntrusiveList.
+ BatchGroup *Next;
+ // The compact base address of each group
+ uptr CompactPtrGroupBase;
+ // Cache value of SizeClassAllocatorLocalCache::getMaxCached()
+ u16 MaxCachedPerBatch;
+ // Number of blocks pushed into this group. This is an increment-only
+ // counter.
+ uptr PushedBlocks;
+ // This is used to track how many bytes are not in-use since last time we
+ // tried to release pages.
+ uptr BytesInBGAtLastCheckpoint;
+ // Blocks are managed by TransferBatch in a list.
+ SinglyLinkedList<TransferBatch<SizeClassAllocator>> Batches;
+};
+
+} // namespace scudo
+
+#endif // SCUDO_ALLOCATOR_COMMON_H_
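
The `TransferBatch` moved into this header is a plain fixed-capacity container. A minimal standalone sketch of its fill/drain round trip, with `uint32_t` and a capacity of 8 standing in for scudo's `CompactPtrT` and `MaxNumCachedHint` (both are assumptions here):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Simplified stand-in for scudo's TransferBatch: a fixed-capacity array of
// compact pointers plus a count, filled from and drained back into flat
// arrays.
struct MiniBatch {
  static constexpr uint16_t MaxNumCached = 8;
  void setFromArray(const uint32_t *Array, uint16_t N) {
    assert(N <= MaxNumCached);
    Count = N;
    memcpy(Batch, Array, sizeof(Batch[0]) * Count);
  }
  void moveToArray(uint32_t *Array) {
    memcpy(Array, Batch, sizeof(Batch[0]) * Count);
    Count = 0; // clear(), so the batch itself can be recycled.
  }
  uint16_t getCount() const { return Count; }

private:
  uint32_t Batch[MaxNumCached];
  uint16_t Count = 0;
};

int main() {
  uint32_t Blocks[4] = {0x10, 0x20, 0x30, 0x40};
  MiniBatch B;
  B.setFromArray(Blocks, 4); // fill the batch from a freelist array
  uint32_t Out[4];
  B.moveToArray(Out); // drain it into a per-thread cache array
  assert(B.getCount() == 0 && Out[3] == 0x40);
  printf("round-tripped %u blocks\n", 4u);
  return 0;
}
```
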
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h
index 315a04f7635d..3c6aa3acb0e4 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_config.h
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.h
@@ -11,6 +11,7 @@
#include "combined.h"
#include "common.h"
+#include "condition_variable.h"
#include "flags.h"
#include "primary32.h"
#include "primary64.h"
@@ -82,6 +83,14 @@ namespace scudo {
// // Defines the minimal & maximal release interval that can be set.
// static const s32 MinReleaseToOsIntervalMs = INT32_MIN;
// static const s32 MaxReleaseToOsIntervalMs = INT32_MAX;
+//
+// // Use a condition variable to shorten the wait when the freelist needs
+// // to be refilled. Note that this depends on each platform's
+// // condition-variable implementation; performance may vary, so a benefit
+// // is not guaranteed.
+// // Note that both variables have to be defined to enable it.
+// static const bool UseConditionVariable = true;
+// using ConditionVariableT = ConditionVariableLinux;
// };
// // Defines the type of Primary allocator to use.
// template <typename Config> using PrimaryT = SizeClassAllocator64<Config>;
@@ -195,50 +204,6 @@ struct AndroidConfig {
template <typename Config> using SecondaryT = MapAllocator<Config>;
};
-struct AndroidSvelteConfig {
- static const bool MaySupportMemoryTagging = false;
- template <class A>
- using TSDRegistryT = TSDRegistrySharedT<A, 2U, 1U>; // Shared, max 2 TSDs.
-
- struct Primary {
- using SizeClassMap = SvelteSizeClassMap;
-#if SCUDO_CAN_USE_PRIMARY64
- static const uptr RegionSizeLog = 27U;
- typedef u32 CompactPtrT;
- static const uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
- static const uptr GroupSizeLog = 18U;
- static const bool EnableRandomOffset = true;
- static const uptr MapSizeIncrement = 1UL << 18;
-#else
- static const uptr RegionSizeLog = 16U;
- static const uptr GroupSizeLog = 16U;
- typedef uptr CompactPtrT;
-#endif
- static const s32 MinReleaseToOsIntervalMs = 1000;
- static const s32 MaxReleaseToOsIntervalMs = 1000;
- };
-
-#if SCUDO_CAN_USE_PRIMARY64
- template <typename Config> using PrimaryT = SizeClassAllocator64<Config>;
-#else
- template <typename Config> using PrimaryT = SizeClassAllocator32<Config>;
-#endif
-
- struct Secondary {
- struct Cache {
- static const u32 EntriesArraySize = 16U;
- static const u32 QuarantineSize = 32U;
- static const u32 DefaultMaxEntriesCount = 4U;
- static const uptr DefaultMaxEntrySize = 1UL << 18;
- static const s32 MinReleaseToOsIntervalMs = 0;
- static const s32 MaxReleaseToOsIntervalMs = 0;
- };
- template <typename Config> using CacheT = MapAllocatorCache<Config>;
- };
-
- template <typename Config> using SecondaryT = MapAllocator<Config>;
-};
-
#if SCUDO_CAN_USE_PRIMARY64
struct FuchsiaConfig {
static const bool MaySupportMemoryTagging = false;
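
The commented-out example config earlier in this diff documents the new condition-variable opt-in. A hypothetical Primary fragment using it might look like the sketch below; `ConditionVariableLinux` is only forward-declared so the snippet compiles in isolation, and the member values are illustrative:

```cpp
#include <cstdint>

class ConditionVariableLinux; // stand-in declaration for this sketch

struct ExamplePrimaryConfig {
  static constexpr int32_t MinReleaseToOsIntervalMs = INT32_MIN;
  static constexpr int32_t MaxReleaseToOsIntervalMs = INT32_MAX;
  // Both members must be present for the opt-in to take effect:
  // ConditionVariableState keys off `UseConditionVariable`, and
  // `ConditionVariableT` names the implementation to instantiate.
  static constexpr bool UseConditionVariable = true;
  using ConditionVariableT = ConditionVariableLinux;
};

int main() { return ExamplePrimaryConfig::UseConditionVariable ? 0 : 1; }
```
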
diff --git a/compiler-rt/lib/scudo/standalone/atomic_helpers.h b/compiler-rt/lib/scudo/standalone/atomic_helpers.h
index d88f5d7be642..a68ffd16291c 100644
--- a/compiler-rt/lib/scudo/standalone/atomic_helpers.h
+++ b/compiler-rt/lib/scudo/standalone/atomic_helpers.h
@@ -133,10 +133,10 @@ inline void atomic_store_relaxed(volatile T *A, typename T::Type V) {
}
template <typename T>
-inline typename T::Type atomic_compare_exchange(volatile T *A,
- typename T::Type Cmp,
- typename T::Type Xchg) {
- atomic_compare_exchange_strong(A, &Cmp, Xchg, memory_order_acquire);
+inline typename T::Type
+atomic_compare_exchange_strong(volatile T *A, typename T::Type Cmp,
+ typename T::Type Xchg, memory_order MO) {
+ atomic_compare_exchange_strong(A, &Cmp, Xchg, MO);
return Cmp;
}
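
The renamed helper now takes the memory order explicitly and still returns the observed value. A standalone sketch of that contract using `std::atomic`, including the `tryLock`-style success test it enables (the lock-state constants are illustrative):

```cpp
#include <atomic>
#include <cassert>

// Try to swap Cmp -> Xchg with the caller-supplied memory order, and return
// the value that was observed, so callers can test `== Cmp` for success.
template <typename T>
T compareExchangeStrong(std::atomic<T> &A, T Cmp, T Xchg,
                        std::memory_order MO) {
  A.compare_exchange_strong(Cmp, Xchg, MO, std::memory_order_relaxed);
  return Cmp; // on failure, Cmp now holds the current value of A.
}

int main() {
  std::atomic<unsigned> M{0}; // 0 = Unlocked, 1 = Locked
  // A tryLock succeeds iff the previously observed value was Unlocked.
  bool Locked =
      compareExchangeStrong(M, 0u, 1u, std::memory_order_acquire) == 0u;
  assert(Locked && M.load() == 1u);
  return 0;
}
```
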
diff --git a/compiler-rt/lib/scudo/standalone/chunk.h b/compiler-rt/lib/scudo/standalone/chunk.h
index 32874a8df642..9228df047189 100644
--- a/compiler-rt/lib/scudo/standalone/chunk.h
+++ b/compiler-rt/lib/scudo/standalone/chunk.h
@@ -128,19 +128,6 @@ inline void loadHeader(u32 Cookie, const void *Ptr,
reportHeaderCorruption(const_cast<void *>(Ptr));
}
-inline void compareExchangeHeader(u32 Cookie, void *Ptr,
- UnpackedHeader *NewUnpackedHeader,
- UnpackedHeader *OldUnpackedHeader) {
- NewUnpackedHeader->Checksum =
- computeHeaderChecksum(Cookie, Ptr, NewUnpackedHeader);
- PackedHeader NewPackedHeader = bit_cast<PackedHeader>(*NewUnpackedHeader);
- PackedHeader OldPackedHeader = bit_cast<PackedHeader>(*OldUnpackedHeader);
- if (UNLIKELY(!atomic_compare_exchange_strong(
- getAtomicHeader(Ptr), &OldPackedHeader, NewPackedHeader,
- memory_order_relaxed)))
- reportHeaderRace(Ptr);
-}
-
inline bool isValid(u32 Cookie, const void *Ptr,
UnpackedHeader *NewUnpackedHeader) {
PackedHeader NewPackedHeader = atomic_load_relaxed(getConstAtomicHeader(Ptr));
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index b17acc71f892..65ddc488370a 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -14,11 +14,11 @@
#include "flags.h"
#include "flags_parser.h"
#include "local_cache.h"
+#include "mem_map.h"
#include "memtag.h"
#include "options.h"
#include "quarantine.h"
#include "report.h"
-#include "rss_limit_checker.h"
#include "secondary.h"
#include "stack_depot.h"
#include "string_utils.h"
@@ -68,14 +68,13 @@ public:
if (UNLIKELY(Header.State != Chunk::State::Quarantined))
reportInvalidChunkState(AllocatorAction::Recycling, Ptr);
- Chunk::UnpackedHeader NewHeader = Header;
- NewHeader.State = Chunk::State::Available;
- Chunk::compareExchangeHeader(Allocator.Cookie, Ptr, &NewHeader, &Header);
+ Header.State = Chunk::State::Available;
+ Chunk::storeHeader(Allocator.Cookie, Ptr, &Header);
if (allocatorSupportsMemoryTagging<Config>())
Ptr = untagPointer(Ptr);
- void *BlockBegin = Allocator::getBlockBegin(Ptr, &NewHeader);
- Cache.deallocate(NewHeader.ClassId, BlockBegin);
+ void *BlockBegin = Allocator::getBlockBegin(Ptr, &Header);
+ Cache.deallocate(Header.ClassId, BlockBegin);
}
// We take a shortcut when allocating a quarantine batch by working with the
@@ -118,9 +117,8 @@ public:
DCHECK_EQ(Header.Offset, 0);
DCHECK_EQ(Header.SizeOrUnusedBytes, sizeof(QuarantineBatch));
- Chunk::UnpackedHeader NewHeader = Header;
- NewHeader.State = Chunk::State::Available;
- Chunk::compareExchangeHeader(Allocator.Cookie, Ptr, &NewHeader, &Header);
+ Header.State = Chunk::State::Available;
+ Chunk::storeHeader(Allocator.Cookie, Ptr, &Header);
Cache.deallocate(QuarantineClassId,
reinterpret_cast<void *>(reinterpret_cast<uptr>(Ptr) -
Chunk::getHeaderSize()));
@@ -149,9 +147,6 @@ public:
initFlags();
reportUnrecognizedFlags();
- RssChecker.init(scudo::getFlags()->soft_rss_limit_mb,
- scudo::getFlags()->hard_rss_limit_mb);
-
// Store some flags locally.
if (getFlags()->may_return_null)
Primary.Options.set(OptionBit::MayReturnNull);
@@ -251,12 +246,14 @@ public:
// - unlinking the local stats from the global ones (destroying the cache does
// the last two items).
void commitBack(TSD<ThisT> *TSD) {
+ TSD->assertLocked(/*BypassCheck=*/true);
Quarantine.drain(&TSD->getQuarantineCache(),
QuarantineCallback(*this, TSD->getCache()));
TSD->getCache().destroy(&Stats);
}
void drainCache(TSD<ThisT> *TSD) {
+ TSD->assertLocked(/*BypassCheck=*/true);
Quarantine.drainAndRecycle(&TSD->getQuarantineCache(),
QuarantineCallback(*this, TSD->getCache()));
TSD->getCache().drain();
@@ -299,7 +296,7 @@ public:
#endif
}
- uptr computeOddEvenMaskForPointerMaybe(Options Options, uptr Ptr,
+ uptr computeOddEvenMaskForPointerMaybe(const Options &Options, uptr Ptr,
uptr ClassId) {
if (!Options.get(OptionBit::UseOddEvenTags))
return 0;
@@ -329,8 +326,6 @@ public:
#ifdef GWP_ASAN_HOOKS
if (UNLIKELY(GuardedAlloc.shouldSample())) {
if (void *Ptr = GuardedAlloc.allocate(Size, Alignment)) {
- if (UNLIKELY(&__scudo_allocate_hook))
- __scudo_allocate_hook(Ptr, Size);
Stats.lock();
Stats.add(StatAllocated, GuardedAllocSlotSize);
Stats.sub(StatFree, GuardedAllocSlotSize);
@@ -363,19 +358,6 @@ public:
}
DCHECK_LE(Size, NeededSize);
- switch (RssChecker.getRssLimitExceeded()) {
- case RssLimitChecker::Neither:
- break;
- case RssLimitChecker::Soft:
- if (Options.get(OptionBit::MayReturnNull))
- return nullptr;
- reportSoftRSSLimit(RssChecker.getSoftRssLimit());
- break;
- case RssLimitChecker::Hard:
- reportHardRSSLimit(RssChecker.getHardRssLimit());
- break;
- }
-
void *Block = nullptr;
uptr ClassId = 0;
uptr SecondaryBlockEnd = 0;
@@ -384,11 +366,11 @@ public:
DCHECK_NE(ClassId, 0U);
bool UnlockRequired;
auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
+ TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
Block = TSD->getCache().allocate(ClassId);
- // If the allocation failed, the most likely reason with a 32-bit primary
- // is the region being full. In that event, retry in each successively
- // larger class until it fits. If it fails to fit in the largest class,
- // fallback to the Secondary.
+ // If the allocation failed, retry in each successively larger class until
+    // it fits. If it fails to fit in the largest class, fall back to the
+ // Secondary.
if (UNLIKELY(!Block)) {
while (ClassId < SizeClassMap::LargestClassId && !Block)
Block = TSD->getCache().allocate(++ClassId);
@@ -406,6 +388,7 @@ public:
if (UNLIKELY(!Block)) {
if (Options.get(OptionBit::MayReturnNull))
return nullptr;
+ printStats();
reportOutOfMemory(NeededSize);
}
@@ -535,14 +518,14 @@ public:
Chunk::SizeOrUnusedBytesMask;
Chunk::storeHeader(Cookie, Ptr, &Header);
- if (UNLIKELY(&__scudo_allocate_hook))
- __scudo_allocate_hook(TaggedPtr, Size);
-
return TaggedPtr;
}
NOINLINE void deallocate(void *Ptr, Chunk::Origin Origin, uptr DeleteSize = 0,
UNUSED uptr Alignment = MinAlignment) {
+ if (UNLIKELY(!Ptr))
+ return;
+
// For a deallocation, we only ensure minimal initialization, meaning thread
// local data will be left uninitialized for now (when using ELF TLS). The
// fallback cache will be used instead. This is a workaround for a situation
@@ -551,12 +534,6 @@ public:
// being destroyed properly. Any other heap operation will do a full init.
initThreadMaybe(/*MinimalInit=*/true);
- if (UNLIKELY(&__scudo_deallocate_hook))
- __scudo_deallocate_hook(Ptr);
-
- if (UNLIKELY(!Ptr))
- return;
-
#ifdef GWP_ASAN_HOOKS
if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) {
GuardedAlloc.deallocate(Ptr);
@@ -635,47 +612,46 @@ public:
if (UNLIKELY(!isAligned(reinterpret_cast<uptr>(OldPtr), MinAlignment)))
reportMisalignedPointer(AllocatorAction::Reallocating, OldPtr);
- Chunk::UnpackedHeader OldHeader;
- Chunk::loadHeader(Cookie, OldPtr, &OldHeader);
+ Chunk::UnpackedHeader Header;
+ Chunk::loadHeader(Cookie, OldPtr, &Header);
- if (UNLIKELY(OldHeader.State != Chunk::State::Allocated))
+ if (UNLIKELY(Header.State != Chunk::State::Allocated))
reportInvalidChunkState(AllocatorAction::Reallocating, OldPtr);
// Pointer has to be allocated with a malloc-type function. Some
// applications think that it is OK to realloc a memalign'ed pointer, which
// will trigger this check. It really isn't.
if (Options.get(OptionBit::DeallocTypeMismatch)) {
- if (UNLIKELY(OldHeader.OriginOrWasZeroed != Chunk::Origin::Malloc))
+ if (UNLIKELY(Header.OriginOrWasZeroed != Chunk::Origin::Malloc))
reportDeallocTypeMismatch(AllocatorAction::Reallocating, OldPtr,
- OldHeader.OriginOrWasZeroed,
+ Header.OriginOrWasZeroed,
Chunk::Origin::Malloc);
}
- void *BlockBegin = getBlockBegin(OldTaggedPtr, &OldHeader);
+ void *BlockBegin = getBlockBegin(OldTaggedPtr, &Header);
uptr BlockEnd;
uptr OldSize;
- const uptr ClassId = OldHeader.ClassId;
+ const uptr ClassId = Header.ClassId;
if (LIKELY(ClassId)) {
BlockEnd = reinterpret_cast<uptr>(BlockBegin) +
SizeClassMap::getSizeByClassId(ClassId);
- OldSize = OldHeader.SizeOrUnusedBytes;
+ OldSize = Header.SizeOrUnusedBytes;
} else {
BlockEnd = SecondaryT::getBlockEnd(BlockBegin);
OldSize = BlockEnd - (reinterpret_cast<uptr>(OldTaggedPtr) +
- OldHeader.SizeOrUnusedBytes);
+ Header.SizeOrUnusedBytes);
}
// If the new chunk still fits in the previously allocated block (with a
// reasonable delta), we just keep the old block, and update the chunk
// header to reflect the size change.
if (reinterpret_cast<uptr>(OldTaggedPtr) + NewSize <= BlockEnd) {
if (NewSize > OldSize || (OldSize - NewSize) < getPageSizeCached()) {
- Chunk::UnpackedHeader NewHeader = OldHeader;
- NewHeader.SizeOrUnusedBytes =
+ Header.SizeOrUnusedBytes =
(ClassId ? NewSize
: BlockEnd -
(reinterpret_cast<uptr>(OldTaggedPtr) + NewSize)) &
Chunk::SizeOrUnusedBytesMask;
- Chunk::compareExchangeHeader(Cookie, OldPtr, &NewHeader, &OldHeader);
+ Chunk::storeHeader(Cookie, OldPtr, &Header);
if (UNLIKELY(useMemoryTagging<Config>(Options))) {
if (ClassId) {
resizeTaggedChunk(reinterpret_cast<uptr>(OldTaggedPtr) + OldSize,
@@ -697,9 +673,7 @@ public:
void *NewPtr = allocate(NewSize, Chunk::Origin::Malloc, Alignment);
if (LIKELY(NewPtr)) {
memcpy(NewPtr, OldTaggedPtr, Min(NewSize, OldSize));
- if (UNLIKELY(&__scudo_deallocate_hook))
- __scudo_deallocate_hook(OldTaggedPtr);
- quarantineOrDeallocateChunk(Options, OldTaggedPtr, &OldHeader, OldSize);
+ quarantineOrDeallocateChunk(Options, OldTaggedPtr, &Header, OldSize);
}
return NewPtr;
}
@@ -754,6 +728,13 @@ public:
Str.output();
}
+ void printFragmentationInfo() {
+ ScopedString Str;
+ Primary.getFragmentationInfo(&Str);
+ // Secondary allocator dumps the fragmentation data in getStats().
+ Str.output();
+ }
+
void releaseToOS(ReleaseToOS ReleaseType) {
initThreadMaybe();
if (ReleaseType == ReleaseToOS::ForceAll)
@@ -847,10 +828,15 @@ public:
// for it, which then forces realloc to copy the usable size of a chunk as
// opposed to its actual size.
uptr getUsableSize(const void *Ptr) {
- initThreadMaybe();
if (UNLIKELY(!Ptr))
return 0;
+ return getAllocSize(Ptr);
+ }
+
+ uptr getAllocSize(const void *Ptr) {
+ initThreadMaybe();
+
#ifdef GWP_ASAN_HOOKS
if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr)))
return GuardedAlloc.getSize(Ptr);
@@ -859,9 +845,11 @@ public:
Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr));
Chunk::UnpackedHeader Header;
Chunk::loadHeader(Cookie, Ptr, &Header);
- // Getting the usable size of a chunk only makes sense if it's allocated.
+
+ // Getting the alloc size of a chunk only makes sense if it's allocated.
if (UNLIKELY(Header.State != Chunk::State::Allocated))
reportInvalidChunkState(AllocatorAction::Sizing, const_cast<void *>(Ptr));
+
return getSize(Ptr, &Header);
}
@@ -887,13 +875,6 @@ public:
Header.State == Chunk::State::Allocated;
}
- void setRssLimitsTestOnly(int SoftRssLimitMb, int HardRssLimitMb,
- bool MayReturnNull) {
- RssChecker.init(SoftRssLimitMb, HardRssLimitMb);
- if (MayReturnNull)
- Primary.Options.set(OptionBit::MayReturnNull);
- }
-
bool useMemoryTaggingTestOnly() const {
return useMemoryTagging<Config>(Primary.Options.load());
}
@@ -913,7 +894,7 @@ public:
void setTrackAllocationStacks(bool Track) {
initThreadMaybe();
- if (getFlags()->allocation_ring_buffer_size == 0) {
+ if (getFlags()->allocation_ring_buffer_size <= 0) {
DCHECK(!Primary.Options.load().get(OptionBit::TrackAllocationStacks));
return;
}
@@ -955,8 +936,7 @@ public:
uptr getRingBufferSize() {
initThreadMaybe();
- auto *RingBuffer = getRingBuffer();
- return RingBuffer ? ringBufferSizeInBytes(RingBuffer->Size) : 0;
+ return RingBufferElements ? ringBufferSizeInBytes(RingBufferElements) : 0;
}
static bool setRingBufferSizeForBuffer(char *Buffer, size_t Size) {
@@ -986,8 +966,9 @@ public:
static void getErrorInfo(struct scudo_error_info *ErrorInfo,
uintptr_t FaultAddr, const char *DepotPtr,
const char *RegionInfoPtr, const char *RingBufferPtr,
- const char *Memory, const char *MemoryTags,
- uintptr_t MemoryAddr, size_t MemorySize) {
+ size_t RingBufferSize, const char *Memory,
+ const char *MemoryTags, uintptr_t MemoryAddr,
+ size_t MemorySize) {
*ErrorInfo = {};
if (!allocatorSupportsMemoryTagging<Config>() ||
MemoryAddr + MemorySize < MemoryAddr)
@@ -1006,7 +987,7 @@ public:
// Check the ring buffer. For primary allocations this will only find UAF;
// for secondary allocations we can find either UAF or OOB.
getRingBufferErrorInfo(ErrorInfo, NextErrorReport, FaultAddr, Depot,
- RingBufferPtr);
+ RingBufferPtr, RingBufferSize);
// Check for OOB in the 28 blocks surrounding the 3 we checked earlier.
// Beyond that we are likely to hit false positives.
@@ -1053,7 +1034,6 @@ private:
QuarantineT Quarantine;
TSDRegistryT TSDRegistry;
pthread_once_t PostInitNonce = PTHREAD_ONCE_INIT;
- RssLimitChecker RssChecker;
#ifdef GWP_ASAN_HOOKS
gwp_asan::GuardedPoolAllocator GuardedAlloc;
@@ -1073,13 +1053,14 @@ private:
};
atomic_uptr Pos;
- u32 Size;
// An array of Size (at least one) elements of type Entry is immediately
// following to this struct.
};
// Pointer to memory mapped area starting with AllocationRingBuffer struct,
// and immediately followed by Size elements of type Entry.
char *RawRingBuffer = {};
+ u32 RingBufferElements = 0;
+ MemMapT RawRingBufferMap;
// The following might get optimized out by the compiler.
NOINLINE void performSanityChecks() {
@@ -1134,35 +1115,34 @@ private:
reinterpret_cast<uptr>(Ptr) - SizeOrUnusedBytes;
}
- void quarantineOrDeallocateChunk(Options Options, void *TaggedPtr,
+ void quarantineOrDeallocateChunk(const Options &Options, void *TaggedPtr,
Chunk::UnpackedHeader *Header,
uptr Size) NO_THREAD_SAFETY_ANALYSIS {
void *Ptr = getHeaderTaggedPointer(TaggedPtr);
- Chunk::UnpackedHeader NewHeader = *Header;
// If the quarantine is disabled, or the actual size of a chunk is 0 or
// larger than the maximum allowed, we return the chunk directly to the backend.
// This purposefully underflows for Size == 0.
const bool BypassQuarantine = !Quarantine.getCacheSize() ||
((Size - 1) >= QuarantineMaxChunkSize) ||
- !NewHeader.ClassId;
+ !Header->ClassId;
if (BypassQuarantine)
- NewHeader.State = Chunk::State::Available;
+ Header->State = Chunk::State::Available;
else
- NewHeader.State = Chunk::State::Quarantined;
- NewHeader.OriginOrWasZeroed = useMemoryTagging<Config>(Options) &&
- NewHeader.ClassId &&
- !TSDRegistry.getDisableMemInit();
- Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header);
+ Header->State = Chunk::State::Quarantined;
+ Header->OriginOrWasZeroed = useMemoryTagging<Config>(Options) &&
+ Header->ClassId &&
+ !TSDRegistry.getDisableMemInit();
+ Chunk::storeHeader(Cookie, Ptr, Header);
if (UNLIKELY(useMemoryTagging<Config>(Options))) {
u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr));
storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size);
- if (NewHeader.ClassId) {
+ if (Header->ClassId) {
if (!TSDRegistry.getDisableMemInit()) {
uptr TaggedBegin, TaggedEnd;
const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe(
- Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, &NewHeader)),
- NewHeader.ClassId);
+ Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, Header)),
+ Header->ClassId);
// Exclude the previous tag so that immediate use after free is
// detected 100% of the time.
setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin,
@@ -1173,11 +1153,12 @@ private:
if (BypassQuarantine) {
if (allocatorSupportsMemoryTagging<Config>())
Ptr = untagPointer(Ptr);
- void *BlockBegin = getBlockBegin(Ptr, &NewHeader);
- const uptr ClassId = NewHeader.ClassId;
+ void *BlockBegin = getBlockBegin(Ptr, Header);
+ const uptr ClassId = Header->ClassId;
if (LIKELY(ClassId)) {
bool UnlockRequired;
auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
+ TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
const bool CacheDrained =
TSD->getCache().deallocate(ClassId, BlockBegin);
if (UnlockRequired)
@@ -1197,6 +1178,7 @@ private:
} else {
bool UnlockRequired;
auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
+ TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
Quarantine.put(&TSD->getQuarantineCache(),
QuarantineCallback(*this, TSD->getCache()), Ptr, Size);
if (UnlockRequired)
@@ -1273,7 +1255,7 @@ private:
storeEndMarker(RoundNewPtr, NewSize, BlockEnd);
}
- void storePrimaryAllocationStackMaybe(Options Options, void *Ptr) {
+ void storePrimaryAllocationStackMaybe(const Options &Options, void *Ptr) {
if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks)))
return;
auto *Ptr32 = reinterpret_cast<u32 *>(Ptr);
@@ -1286,7 +1268,7 @@ private:
u32 DeallocationTid) {
uptr Pos = atomic_fetch_add(&getRingBuffer()->Pos, 1, memory_order_relaxed);
typename AllocationRingBuffer::Entry *Entry =
- getRingBufferEntry(RawRingBuffer, Pos % getRingBuffer()->Size);
+ getRingBufferEntry(RawRingBuffer, Pos % RingBufferElements);
// First invalidate our entry so that we don't attempt to interpret a
// partially written state in getSecondaryErrorInfo(). The fences below
@@ -1305,7 +1287,7 @@ private:
atomic_store_relaxed(&Entry->Ptr, reinterpret_cast<uptr>(Ptr));
}
- void storeSecondaryAllocationStackMaybe(Options Options, void *Ptr,
+ void storeSecondaryAllocationStackMaybe(const Options &Options, void *Ptr,
uptr Size) {
if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks)))
return;
@@ -1320,8 +1302,8 @@ private:
storeRingBufferEntry(untagPointer(Ptr), Trace, Tid, Size, 0, 0);
}
- void storeDeallocationStackMaybe(Options Options, void *Ptr, u8 PrevTag,
- uptr Size) {
+ void storeDeallocationStackMaybe(const Options &Options, void *Ptr,
+ u8 PrevTag, uptr Size) {
if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks)))
return;
@@ -1427,17 +1409,19 @@ private:
size_t &NextErrorReport,
uintptr_t FaultAddr,
const StackDepot *Depot,
- const char *RingBufferPtr) {
+ const char *RingBufferPtr,
+ size_t RingBufferSize) {
auto *RingBuffer =
reinterpret_cast<const AllocationRingBuffer *>(RingBufferPtr);
- if (!RingBuffer || RingBuffer->Size == 0)
+ size_t RingBufferElements = ringBufferElementsFromBytes(RingBufferSize);
+ if (!RingBuffer || RingBufferElements == 0)
return;
uptr Pos = atomic_load_relaxed(&RingBuffer->Pos);
- for (uptr I = Pos - 1;
- I != Pos - 1 - RingBuffer->Size && NextErrorReport != NumErrorReports;
+ for (uptr I = Pos - 1; I != Pos - 1 - RingBufferElements &&
+ NextErrorReport != NumErrorReports;
--I) {
- auto *Entry = getRingBufferEntry(RingBufferPtr, I % RingBuffer->Size);
+ auto *Entry = getRingBufferEntry(RingBufferPtr, I % RingBufferElements);
uptr EntryPtr = atomic_load_relaxed(&Entry->Ptr);
if (!EntryPtr)
continue;
@@ -1516,17 +1500,19 @@ private:
}
void mapAndInitializeRingBuffer() {
+ if (getFlags()->allocation_ring_buffer_size <= 0)
+ return;
u32 AllocationRingBufferSize =
static_cast<u32>(getFlags()->allocation_ring_buffer_size);
- if (AllocationRingBufferSize < 1)
- return;
- RawRingBuffer = static_cast<char *>(
- map(/*Addr=*/nullptr,
- roundUp(ringBufferSizeInBytes(AllocationRingBufferSize),
- getPageSizeCached()),
- "AllocatorRingBuffer"));
- auto *RingBuffer = reinterpret_cast<AllocationRingBuffer *>(RawRingBuffer);
- RingBuffer->Size = AllocationRingBufferSize;
+ MemMapT MemMap;
+ MemMap.map(
+ /*Addr=*/0U,
+ roundUp(ringBufferSizeInBytes(AllocationRingBufferSize),
+ getPageSizeCached()),
+ "scudo:ring_buffer");
+ RawRingBuffer = reinterpret_cast<char *>(MemMap.getBase());
+ RawRingBufferMap = MemMap;
+ RingBufferElements = AllocationRingBufferSize;
static_assert(sizeof(AllocationRingBuffer) %
alignof(typename AllocationRingBuffer::Entry) ==
0,
@@ -1534,14 +1520,25 @@ private:
}
void unmapRingBuffer() {
- unmap(RawRingBuffer, roundUp(getRingBufferSize(), getPageSizeCached()));
+ auto *RingBuffer = getRingBuffer();
+ if (RingBuffer != nullptr) {
+ RawRingBufferMap.unmap(RawRingBufferMap.getBase(),
+ RawRingBufferMap.getCapacity());
+ }
RawRingBuffer = nullptr;
}
- static constexpr size_t ringBufferSizeInBytes(u32 AllocationRingBufferSize) {
+ static constexpr size_t ringBufferSizeInBytes(u32 RingBufferElements) {
return sizeof(AllocationRingBuffer) +
- AllocationRingBufferSize *
- sizeof(typename AllocationRingBuffer::Entry);
+ RingBufferElements * sizeof(typename AllocationRingBuffer::Entry);
+ }
+
+ static constexpr size_t ringBufferElementsFromBytes(size_t Bytes) {
+ if (Bytes < sizeof(AllocationRingBuffer)) {
+ return 0;
+ }
+ return (Bytes - sizeof(AllocationRingBuffer)) /
+ sizeof(typename AllocationRingBuffer::Entry);
}
inline AllocationRingBuffer *getRingBuffer() {
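
The two `constexpr` helpers above make the buffer's byte size, rather than a `Size` field stored inside the (possibly corrupted) buffer, the source of truth for the element count. A worked sketch with hypothetical header and entry sizes:

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical sizes: a 16-byte AllocationRingBuffer header followed by
// 48-byte entries. Both constants are placeholders, not scudo's layout.
constexpr size_t HeaderSize = 16;
constexpr size_t EntrySize = 48;

constexpr size_t ringBufferSizeInBytes(size_t Elements) {
  return HeaderSize + Elements * EntrySize;
}

constexpr size_t ringBufferElementsFromBytes(size_t Bytes) {
  return Bytes < HeaderSize ? 0 : (Bytes - HeaderSize) / EntrySize;
}

int main() {
  // The round trip recovers the element count, so getErrorInfo() can be
  // handed raw bytes plus a byte size instead of trusting a stored field.
  static_assert(ringBufferElementsFromBytes(ringBufferSizeInBytes(32768)) ==
                    32768,
                "round trip must recover the element count");
  assert(ringBufferElementsFromBytes(8) == 0); // too short: treated as empty
  return 0;
}
```
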
diff --git a/compiler-rt/lib/scudo/standalone/common.cpp b/compiler-rt/lib/scudo/standalone/common.cpp
index 9f14faeef283..06e930638f6f 100644
--- a/compiler-rt/lib/scudo/standalone/common.cpp
+++ b/compiler-rt/lib/scudo/standalone/common.cpp
@@ -21,22 +21,4 @@ uptr getPageSizeSlow() {
return PageSizeCached;
}
-// Fatal internal map() or unmap() error (potentially OOM related).
-void NORETURN dieOnMapUnmapError(uptr SizeIfOOM) {
- char Error[128] = "Scudo ERROR: internal map or unmap failure\n";
- if (SizeIfOOM) {
- formatString(
- Error, sizeof(Error),
- "Scudo ERROR: internal map failure (NO MEMORY) requesting %zuKB\n",
- SizeIfOOM >> 10);
- }
- outputRaw(Error);
- setAbortMessage(Error);
- die();
-}
-
-#if !SCUDO_LINUX
-uptr GetRSS() { return 0; }
-#endif
-
} // namespace scudo
diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h
index 82e6cf4aee61..3581c946d160 100644
--- a/compiler-rt/lib/scudo/standalone/common.h
+++ b/compiler-rt/lib/scudo/standalone/common.h
@@ -17,6 +17,7 @@
#include <stddef.h>
#include <string.h>
+#include <unistd.h>
namespace scudo {
@@ -111,29 +112,15 @@ template <typename T> inline void shuffle(T *A, u32 N, u32 *RandState) {
*RandState = State;
}
-// Hardware specific inlinable functions.
-
-inline void yieldProcessor(UNUSED u8 Count) {
-#if defined(__i386__) || defined(__x86_64__)
- __asm__ __volatile__("" ::: "memory");
- for (u8 I = 0; I < Count; I++)
- __asm__ __volatile__("pause");
-#elif defined(__aarch64__) || defined(__arm__)
- __asm__ __volatile__("" ::: "memory");
- for (u8 I = 0; I < Count; I++)
- __asm__ __volatile__("yield");
-#endif
- __asm__ __volatile__("" ::: "memory");
-}
-
// Platform specific functions.
extern uptr PageSizeCached;
uptr getPageSizeSlow();
inline uptr getPageSizeCached() {
- // Bionic uses a hardcoded value.
- if (SCUDO_ANDROID)
- return 4096U;
+#if SCUDO_ANDROID && defined(PAGE_SIZE)
+ // Most Android builds have a build-time constant page size.
+ return PAGE_SIZE;
+#endif
if (LIKELY(PageSizeCached))
return PageSizeCached;
return getPageSizeSlow();
@@ -144,8 +131,6 @@ u32 getNumberOfCPUs();
const char *getEnv(const char *Name);
-uptr GetRSS();
-
u64 getMonotonicTime();
// Gets the time faster but with less accuracy. Can call getMonotonicTime
// if no fast version is available.
@@ -190,10 +175,6 @@ void setMemoryPermission(uptr Addr, uptr Size, uptr Flags,
void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size,
MapPlatformData *Data = nullptr);
-// Internal map & unmap fatal error. This must not call map(). SizeIfOOM shall
-// hold the requested size on an out-of-memory error, 0 otherwise.
-void NORETURN dieOnMapUnmapError(uptr SizeIfOOM = 0);
-
// Logging related functions.
void setAbortMessage(const char *Message);
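
A hedged sketch of the revised page-size lookup: prefer a build-time `PAGE_SIZE` macro when the target defines one (as most Android builds do), otherwise fall back to a cached `sysconf` value. The names mirror the real functions but this is a simplified stand-in:

```cpp
#include <cstdio>
#include <unistd.h>

static unsigned long PageSizeCached = 0;

static unsigned long getPageSizeSlow() {
  PageSizeCached = static_cast<unsigned long>(sysconf(_SC_PAGESIZE));
  return PageSizeCached;
}

static unsigned long getPageSizeCachedSketch() {
#if defined(PAGE_SIZE)
  return PAGE_SIZE; // compile-time constant: no load, no branch
#else
  if (PageSizeCached) // the likely path after the first call
    return PageSizeCached;
  return getPageSizeSlow();
#endif
}

int main() {
  printf("page size: %lu\n", getPageSizeCachedSketch());
  return 0;
}
```
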
diff --git a/compiler-rt/lib/scudo/standalone/condition_variable.h b/compiler-rt/lib/scudo/standalone/condition_variable.h
new file mode 100644
index 000000000000..549f6e9f787b
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/condition_variable.h
@@ -0,0 +1,60 @@
+//===-- condition_variable.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_CONDITION_VARIABLE_H_
+#define SCUDO_CONDITION_VARIABLE_H_
+
+#include "condition_variable_base.h"
+
+#include "common.h"
+#include "platform.h"
+
+#include "condition_variable_linux.h"
+
+namespace scudo {
+
+// A default implementation of the condition variable. It doesn't do a real
+// `wait`; instead, it only spins for a short amount of time.
+class ConditionVariableDummy
+ : public ConditionVariableBase<ConditionVariableDummy> {
+public:
+ void notifyAllImpl(UNUSED HybridMutex &M) REQUIRES(M) {}
+
+ void waitImpl(UNUSED HybridMutex &M) REQUIRES(M) {
+ M.unlock();
+
+ constexpr u32 SpinTimes = 64;
+ volatile u32 V = 0;
+ for (u32 I = 0; I < SpinTimes; ++I) {
+ u32 Tmp = V + 1;
+ V = Tmp;
+ }
+
+ M.lock();
+ }
+};
+
+template <typename Config, typename = const bool>
+struct ConditionVariableState {
+ static constexpr bool enabled() { return false; }
+  // This is only used for compilation purposes so that we won't end up with
+ // many conditional compilations. If you want to use `ConditionVariableDummy`,
+ // define `ConditionVariableT` in your allocator configuration. See
+ // allocator_config.h for more details.
+ using ConditionVariableT = ConditionVariableDummy;
+};
+
+template <typename Config>
+struct ConditionVariableState<Config, decltype(Config::UseConditionVariable)> {
+ static constexpr bool enabled() { return true; }
+ using ConditionVariableT = typename Config::ConditionVariableT;
+};
+
+} // namespace scudo
+
+#endif // SCUDO_CONDITION_VARIABLE_H_
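
`ConditionVariableState` is a detection idiom: the partial specialization is viable only when `Config::UseConditionVariable` exists and its type matches the defaulted `const bool` parameter. A self-contained illustration with dummy types (all names below are placeholders):

```cpp
#include <cstdio>

struct Dummy {};
struct Real {};

// Primary template: selected when the member is absent (SFINAE failure in
// the specialization's decltype).
template <typename Config, typename = const bool> struct CVState {
  static constexpr bool enabled() { return false; }
  using ConditionVariableT = Dummy;
};

// Partial specialization: viable only if Config::UseConditionVariable exists
// and decltype(...) is `const bool`, matching the default argument above.
template <typename Config>
struct CVState<Config, decltype(Config::UseConditionVariable)> {
  static constexpr bool enabled() { return true; }
  using ConditionVariableT = typename Config::ConditionVariableT;
};

struct PlainConfig {};
struct CVConfig {
  static const bool UseConditionVariable = true;
  using ConditionVariableT = Real;
};

int main() {
  printf("plain: %d, cv: %d\n", CVState<PlainConfig>::enabled(),
         CVState<CVConfig>::enabled()); // prints "plain: 0, cv: 1"
  return 0;
}
```
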
diff --git a/compiler-rt/lib/scudo/standalone/condition_variable_base.h b/compiler-rt/lib/scudo/standalone/condition_variable_base.h
new file mode 100644
index 000000000000..416c327fed49
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/condition_variable_base.h
@@ -0,0 +1,56 @@
+//===-- condition_variable_base.h -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_CONDITION_VARIABLE_BASE_H_
+#define SCUDO_CONDITION_VARIABLE_BASE_H_
+
+#include "mutex.h"
+#include "thread_annotations.h"
+
+namespace scudo {
+
+template <typename Derived> class ConditionVariableBase {
+public:
+ constexpr ConditionVariableBase() = default;
+
+ void bindTestOnly(HybridMutex &Mutex) {
+#if SCUDO_DEBUG
+ boundMutex = &Mutex;
+#else
+ (void)Mutex;
+#endif
+ }
+
+ void notifyAll(HybridMutex &M) REQUIRES(M) {
+#if SCUDO_DEBUG
+ CHECK_EQ(&M, boundMutex);
+#endif
+ getDerived()->notifyAllImpl(M);
+ }
+
+ void wait(HybridMutex &M) REQUIRES(M) {
+#if SCUDO_DEBUG
+ CHECK_EQ(&M, boundMutex);
+#endif
+ getDerived()->waitImpl(M);
+ }
+
+protected:
+ Derived *getDerived() { return static_cast<Derived *>(this); }
+
+#if SCUDO_DEBUG
+  // Because thread-safety analysis doesn't support pointer aliasing, we are
+  // not able to mark the proper annotations without false positives. Instead,
+  // we pass the lock and do the same-lock check separately.
+ HybridMutex *boundMutex = nullptr;
+#endif
+};
+
+} // namespace scudo
+
+#endif // SCUDO_CONDITION_VARIABLE_BASE_H_
diff --git a/compiler-rt/lib/scudo/standalone/condition_variable_linux.cpp b/compiler-rt/lib/scudo/standalone/condition_variable_linux.cpp
new file mode 100644
index 000000000000..e6d9bd1771a4
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/condition_variable_linux.cpp
@@ -0,0 +1,52 @@
+//===-- condition_variable_linux.cpp ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "platform.h"
+
+#if SCUDO_LINUX
+
+#include "condition_variable_linux.h"
+
+#include "atomic_helpers.h"
+
+#include <limits.h>
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+namespace scudo {
+
+void ConditionVariableLinux::notifyAllImpl(UNUSED HybridMutex &M) {
+ const u32 V = atomic_load_relaxed(&Counter);
+ atomic_store_relaxed(&Counter, V + 1);
+
+  // TODO(chiahungduan): Move the waiters from the futex waiting queue of
+  // `Counter` to the futex waiting queue of `M`, so that awoken threads won't
+  // be blocked again by `M` still being held by the current thread.
+ if (LastNotifyAll != V) {
+ syscall(SYS_futex, reinterpret_cast<uptr>(&Counter), FUTEX_WAKE_PRIVATE,
+ INT_MAX, nullptr, nullptr, 0);
+ }
+
+ LastNotifyAll = V + 1;
+}
+
+void ConditionVariableLinux::waitImpl(HybridMutex &M) {
+ const u32 V = atomic_load_relaxed(&Counter) + 1;
+ atomic_store_relaxed(&Counter, V);
+
+ // TODO: Use ScopedUnlock when it's supported.
+ M.unlock();
+ syscall(SYS_futex, reinterpret_cast<uptr>(&Counter), FUTEX_WAIT_PRIVATE, V,
+ nullptr, nullptr, 0);
+ M.lock();
+}
+
+} // namespace scudo
+
+#endif // SCUDO_LINUX
diff --git a/compiler-rt/lib/scudo/standalone/condition_variable_linux.h b/compiler-rt/lib/scudo/standalone/condition_variable_linux.h
new file mode 100644
index 000000000000..cd073287326d
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/condition_variable_linux.h
@@ -0,0 +1,38 @@
+//===-- condition_variable_linux.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_CONDITION_VARIABLE_LINUX_H_
+#define SCUDO_CONDITION_VARIABLE_LINUX_H_
+
+#include "platform.h"
+
+#if SCUDO_LINUX
+
+#include "atomic_helpers.h"
+#include "condition_variable_base.h"
+#include "thread_annotations.h"
+
+namespace scudo {
+
+class ConditionVariableLinux
+ : public ConditionVariableBase<ConditionVariableLinux> {
+public:
+ void notifyAllImpl(HybridMutex &M) REQUIRES(M);
+
+ void waitImpl(HybridMutex &M) REQUIRES(M);
+
+private:
+ u32 LastNotifyAll = 0;
+ atomic_u32 Counter = {};
+};
+
+} // namespace scudo
+
+#endif // SCUDO_LINUX
+
+#endif // SCUDO_CONDITION_VARIABLE_LINUX_H_
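
To see the generation-counter protocol end to end, here is a standalone sketch under the assumption that `std::mutex` is an acceptable stand-in for `HybridMutex`; the waiter and notifier mirror `waitImpl` and `notifyAllImpl` above, and it is Linux-only since it issues the futex syscall directly:

```cpp
// Build: g++ -O2 -pthread cv_sketch.cpp   (Linux only: raw futex syscalls)
#include <atomic>
#include <chrono>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <linux/futex.h>
#include <mutex>
#include <sys/syscall.h>
#include <thread>
#include <unistd.h>

static std::atomic<uint32_t> Counter{0};
static uint32_t LastNotifyAll = 0; // only touched while holding M
static std::mutex M;
static bool Ready = false;

static long futexOp(void *U, int Op, uint32_t V) {
  return syscall(SYS_futex, U, Op, V, nullptr, nullptr, 0);
}

static void wait(std::unique_lock<std::mutex> &L) {
  // Advance the generation so a concurrent notifyAll sees a new waiter.
  const uint32_t V = Counter.load(std::memory_order_relaxed) + 1;
  Counter.store(V, std::memory_order_relaxed);
  L.unlock();
  // Sleeps only if Counter still equals V; if a notify already bumped it,
  // the kernel returns immediately and the caller rechecks its predicate.
  futexOp(&Counter, FUTEX_WAIT_PRIVATE, V);
  L.lock();
}

static void notifyAll(std::unique_lock<std::mutex> &) {
  const uint32_t V = Counter.load(std::memory_order_relaxed);
  Counter.store(V + 1, std::memory_order_relaxed);
  // Skip the syscall when nobody has waited since the last notification.
  if (LastNotifyAll != V)
    futexOp(&Counter, FUTEX_WAKE_PRIVATE, INT_MAX);
  LastNotifyAll = V + 1;
}

int main() {
  std::thread T([] {
    std::unique_lock<std::mutex> L(M);
    while (!Ready) // spurious wakeups are fine: the predicate is rechecked
      wait(L);
    puts("woken with Ready set");
  });
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  {
    std::unique_lock<std::mutex> L(M);
    Ready = true;
    notifyAll(L);
  }
  T.join();
  return 0;
}
```
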
diff --git a/compiler-rt/lib/scudo/standalone/flags.cpp b/compiler-rt/lib/scudo/standalone/flags.cpp
index de5153b288b1..f498edfbd326 100644
--- a/compiler-rt/lib/scudo/standalone/flags.cpp
+++ b/compiler-rt/lib/scudo/standalone/flags.cpp
@@ -68,6 +68,9 @@ void initFlags() {
Parser.parseString(getCompileDefinitionScudoDefaultOptions());
Parser.parseString(getScudoDefaultOptions());
Parser.parseString(getEnv("SCUDO_OPTIONS"));
+ if (const char *V = getEnv("SCUDO_ALLOCATION_RING_BUFFER_SIZE")) {
+ Parser.parseStringPair("allocation_ring_buffer_size", V);
+ }
}
} // namespace scudo
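
The dedicated environment variable wins over `SCUDO_OPTIONS` simply because it is parsed last. A reduced sketch of that precedence, with parsing collapsed to a bare `strtol` (the real path goes through `parseStringPair`):

```cpp
#include <cstdio>
#include <cstdlib>

// Override order set up in initFlags(): built-in default, compile-time
// options, __scudo_default_options(), SCUDO_OPTIONS, and finally
// SCUDO_ALLOCATION_RING_BUFFER_SIZE, which wins because it is parsed last.
int main() {
  long RingBufferSize = 32768; // the flags.inc default
  if (const char *V = getenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE"))
    RingBufferSize = strtol(V, nullptr, 10);
  printf("allocation_ring_buffer_size = %ld\n", RingBufferSize);
  return 0;
}
```
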
diff --git a/compiler-rt/lib/scudo/standalone/flags.inc b/compiler-rt/lib/scudo/standalone/flags.inc
index c1f153bafdd9..f5a2bab5057a 100644
--- a/compiler-rt/lib/scudo/standalone/flags.inc
+++ b/compiler-rt/lib/scudo/standalone/flags.inc
@@ -46,14 +46,6 @@ SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000,
"Interval (in milliseconds) at which to attempt release of unused "
"memory to the OS. Negative values disable the feature.")
-SCUDO_FLAG(int, hard_rss_limit_mb, 0,
- "Hard RSS Limit in Mb. If non-zero, once the limit is achieved, "
- "abort the process")
-
-SCUDO_FLAG(int, soft_rss_limit_mb, 0,
- "Soft RSS Limit in Mb. If non-zero, once the limit is reached, all "
- "subsequent calls will fail or return NULL until the RSS goes below "
- "the soft limit")
-
SCUDO_FLAG(int, allocation_ring_buffer_size, 32768,
- "Entries to keep in the allocation ring buffer for scudo.")
+ "Entries to keep in the allocation ring buffer for scudo. "
+ "Values less or equal to zero disable the buffer.")
diff --git a/compiler-rt/lib/scudo/standalone/flags_parser.cpp b/compiler-rt/lib/scudo/standalone/flags_parser.cpp
index be39fcd4f887..6f9b23ea90e2 100644
--- a/compiler-rt/lib/scudo/standalone/flags_parser.cpp
+++ b/compiler-rt/lib/scudo/standalone/flags_parser.cpp
@@ -10,6 +10,7 @@
#include "common.h"
#include "report.h"
+#include <limits.h>
#include <stdlib.h>
#include <string.h>
@@ -80,7 +81,7 @@ void FlagParser::parseFlag() {
++Pos;
Value = Buffer + ValueStart;
}
- if (!runHandler(Name, Value))
+ if (!runHandler(Name, Value, '='))
reportError("flag parsing failed.");
}
@@ -122,10 +123,16 @@ inline bool parseBool(const char *Value, bool *b) {
return false;
}
-bool FlagParser::runHandler(const char *Name, const char *Value) {
+void FlagParser::parseStringPair(const char *Name, const char *Value) {
+ if (!runHandler(Name, Value, '\0'))
+ reportError("flag parsing failed.");
+}
+
+bool FlagParser::runHandler(const char *Name, const char *Value,
+ const char Sep) {
for (u32 I = 0; I < NumberOfFlags; ++I) {
const uptr Len = strlen(Flags[I].Name);
- if (strncmp(Name, Flags[I].Name, Len) != 0 || Name[Len] != '=')
+ if (strncmp(Name, Flags[I].Name, Len) != 0 || Name[Len] != Sep)
continue;
bool Ok = false;
switch (Flags[I].Type) {
@@ -136,8 +143,15 @@ bool FlagParser::runHandler(const char *Name, const char *Value) {
break;
case FlagType::FT_int:
char *ValueEnd;
- *reinterpret_cast<int *>(Flags[I].Var) =
- static_cast<int>(strtol(Value, &ValueEnd, 10));
+ long V = strtol(Value, &ValueEnd, 10);
+ // strtol returns LONG_MAX on overflow and LONG_MIN on underflow.
+      // This is why we compare for equality here (losing INT_MIN and INT_MAX
+      // as usable values, but that's okay).
+ if (V >= INT_MAX || V <= INT_MIN) {
+ reportInvalidFlag("int", Value);
+ return false;
+ }
+ *reinterpret_cast<int *>(Flags[I].Var) = static_cast<int>(V);
Ok =
*ValueEnd == '"' || *ValueEnd == '\'' || isSeparatorOrNull(*ValueEnd);
if (!Ok)
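
The overflow rule works because `strtol` clamps out-of-range input to `LONG_MIN`/`LONG_MAX`, so rejecting `V >= INT_MAX || V <= INT_MIN` catches overflow on both 32-bit (`long` == `int`) and 64-bit targets. A standalone sketch of the same check (`parseIntFlag` is a hypothetical reduction of the real handler):

```cpp
#include <climits>
#include <cstdio>
#include <cstdlib>

static bool parseIntFlag(const char *Value, int *Out) {
  char *End;
  long V = strtol(Value, &End, 10);
  // strtol clamps to LONG_MIN/LONG_MAX, so these comparisons catch overflow
  // (and reject the two boundary values themselves, which is acceptable).
  if (V >= INT_MAX || V <= INT_MIN)
    return false;
  *Out = static_cast<int>(V);
  return true;
}

int main() {
  int X;
  printf("%d %d\n", parseIntFlag("32768", &X),
         parseIntFlag("99999999999999999999", &X)); // prints "1 0"
  return 0;
}
```
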
diff --git a/compiler-rt/lib/scudo/standalone/flags_parser.h b/compiler-rt/lib/scudo/standalone/flags_parser.h
index ba832adbd909..ded496fda3b9 100644
--- a/compiler-rt/lib/scudo/standalone/flags_parser.h
+++ b/compiler-rt/lib/scudo/standalone/flags_parser.h
@@ -27,6 +27,7 @@ public:
void *Var);
void parseString(const char *S);
void printFlagDescriptions();
+ void parseStringPair(const char *Name, const char *Value);
private:
static const u32 MaxFlags = 20;
@@ -45,7 +46,7 @@ private:
void skipWhitespace();
void parseFlags();
void parseFlag();
- bool runHandler(const char *Name, const char *Value);
+ bool runHandler(const char *Name, const char *Value, char Sep);
};
void reportUnrecognizedFlags();
diff --git a/compiler-rt/lib/scudo/standalone/fuzz/get_error_info_fuzzer.cpp b/compiler-rt/lib/scudo/standalone/fuzz/get_error_info_fuzzer.cpp
index 74456450a476..5b01ebe11c09 100644
--- a/compiler-rt/lib/scudo/standalone/fuzz/get_error_info_fuzzer.cpp
+++ b/compiler-rt/lib/scudo/standalone/fuzz/get_error_info_fuzzer.cpp
@@ -46,14 +46,11 @@ extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) {
}
std::string RingBufferBytes = FDP.ConsumeRemainingBytesAsString();
- // RingBuffer is too short.
- if (!AllocatorT::setRingBufferSizeForBuffer(RingBufferBytes.data(),
- RingBufferBytes.size()))
- return 0;
scudo_error_info ErrorInfo;
AllocatorT::getErrorInfo(&ErrorInfo, FaultAddr, StackDepot.data(),
- RegionInfo.data(), RingBufferBytes.data(), Memory,
- MemoryTags, MemoryAddr, MemorySize);
+ RegionInfo.data(), RingBufferBytes.data(),
+ RingBufferBytes.size(), Memory, MemoryTags,
+ MemoryAddr, MemorySize);
return 0;
}
diff --git a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
index 6c0c521f8d82..a2dedea910cc 100644
--- a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
+++ b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
@@ -17,10 +17,22 @@ extern "C" {
__attribute__((weak)) const char *__scudo_default_options(void);
// Post-allocation & pre-deallocation hooks.
-// They must be thread-safe and not use heap related functions.
__attribute__((weak)) void __scudo_allocate_hook(void *ptr, size_t size);
__attribute__((weak)) void __scudo_deallocate_hook(void *ptr);
+// `realloc` involves both a deallocation and an allocation, but they are not
+// reported atomically. If a user keeps taking snapshots, one may land right
+// between the two reports, making the memory from `realloc` appear to be
+// missing. To alleviate that case, define the two `realloc` hooks below to
+// be informed of the bundled hook
+// calls. These hooks are optional and should only be used when a hooks user
+// wants to track reallocs more closely.
+//
+// See more details in the comment of `realloc` in wrapper_c.inc.
+__attribute__((weak)) void
+__scudo_realloc_allocate_hook(void *old_ptr, void *new_ptr, size_t size);
+__attribute__((weak)) void __scudo_realloc_deallocate_hook(void *old_ptr);
+
void __scudo_print_stats(void);
typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg);
@@ -73,7 +85,8 @@ typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg);
// pointer.
void __scudo_get_error_info(struct scudo_error_info *error_info,
uintptr_t fault_addr, const char *stack_depot,
- const char *region_info, const char *ring_buffer,
+ size_t stack_depot_size, const char *region_info,
+ const char *ring_buffer, size_t ring_buffer_size,
const char *memory, const char *memory_tags,
uintptr_t memory_addr, size_t memory_size);
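
A minimal hook-client sketch: by also defining the optional realloc hooks, a tracker can treat the deallocate/allocate pair emitted by one `realloc` as a single transfer. The wiring below is illustrative; the call order in `main` simulates what a `realloc` might emit and is an assumption, not a transcript of scudo's wrappers:

```cpp
#include <cstddef>
#include <cstdio>

// Hook definitions a client might provide; the weak declarations in the
// interface header make all of these optional.
extern "C" {
void __scudo_allocate_hook(void *ptr, size_t size) {
  printf("alloc   %p (%zu)\n", ptr, size);
}
void __scudo_deallocate_hook(void *ptr) { printf("free    %p\n", ptr); }
void __scudo_realloc_allocate_hook(void *old_ptr, void *new_ptr, size_t size) {
  // Bundled with __scudo_realloc_deallocate_hook: treat the pair as one
  // transfer from old_ptr to new_ptr rather than an unrelated free + alloc.
  printf("realloc %p -> %p (%zu)\n", old_ptr, new_ptr, size);
}
void __scudo_realloc_deallocate_hook(void *old_ptr) {
  printf("realloc free-half %p\n", old_ptr);
}
}

int main() {
  // Simulated call order for one realloc; illustrative only.
  void *Old = reinterpret_cast<void *>(0x1000);
  void *New = reinterpret_cast<void *>(0x2000);
  __scudo_realloc_deallocate_hook(Old);
  __scudo_realloc_allocate_hook(Old, New, 64);
  return 0;
}
```
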
diff --git a/compiler-rt/lib/scudo/standalone/linux.cpp b/compiler-rt/lib/scudo/standalone/linux.cpp
index e285d8a3d2d2..274695108109 100644
--- a/compiler-rt/lib/scudo/standalone/linux.cpp
+++ b/compiler-rt/lib/scudo/standalone/linux.cpp
@@ -14,6 +14,7 @@
#include "internal_defs.h"
#include "linux.h"
#include "mutex.h"
+#include "report_linux.h"
#include "string_utils.h"
#include <errno.h>
@@ -43,6 +44,7 @@ uptr getPageSize() { return static_cast<uptr>(sysconf(_SC_PAGESIZE)); }
void NORETURN die() { abort(); }
+// TODO: Will be deprecated. Use the interfaces in MemMapLinux instead.
void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags,
UNUSED MapPlatformData *Data) {
int MmapFlags = MAP_PRIVATE | MAP_ANONYMOUS;
@@ -65,7 +67,7 @@ void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags,
void *P = mmap(Addr, Size, MmapProt, MmapFlags, -1, 0);
if (P == MAP_FAILED) {
if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM)
- dieOnMapUnmapError(errno == ENOMEM ? Size : 0);
+ reportMapError(errno == ENOMEM ? Size : 0);
return nullptr;
}
#if SCUDO_ANDROID
@@ -75,19 +77,22 @@ void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags,
return P;
}
+// TODO: Will be deprecated. Use the interfaces in MemMapLinux instead.
void unmap(void *Addr, uptr Size, UNUSED uptr Flags,
UNUSED MapPlatformData *Data) {
if (munmap(Addr, Size) != 0)
- dieOnMapUnmapError();
+ reportUnmapError(reinterpret_cast<uptr>(Addr), Size);
}
+// TODO: Will be deprecated. Use the interfaces in MemMapLinux instead.
void setMemoryPermission(uptr Addr, uptr Size, uptr Flags,
UNUSED MapPlatformData *Data) {
int Prot = (Flags & MAP_NOACCESS) ? PROT_NONE : (PROT_READ | PROT_WRITE);
if (mprotect(reinterpret_cast<void *>(Addr), Size, Prot) != 0)
- dieOnMapUnmapError();
+ reportProtectError(Addr, Size, Prot);
}
+// TODO: Will be deprecated. Use the interfaces in MemMapLinux instead.
void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size,
UNUSED MapPlatformData *Data) {
void *Addr = reinterpret_cast<void *>(BaseAddress + Offset);
@@ -104,12 +109,14 @@ enum State : u32 { Unlocked = 0, Locked = 1, Sleeping = 2 };
}
bool HybridMutex::tryLock() {
- return atomic_compare_exchange(&M, Unlocked, Locked) == Unlocked;
+ return atomic_compare_exchange_strong(&M, Unlocked, Locked,
+ memory_order_acquire) == Unlocked;
}
// The following is based on https://akkadia.org/drepper/futex.pdf.
void HybridMutex::lockSlow() {
- u32 V = atomic_compare_exchange(&M, Unlocked, Locked);
+ u32 V = atomic_compare_exchange_strong(&M, Unlocked, Locked,
+ memory_order_acquire);
if (V == Unlocked)
return;
if (V != Sleeping)
@@ -197,39 +204,6 @@ bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) {
extern "C" WEAK int async_safe_write_log(int pri, const char *tag,
const char *msg);
-static uptr GetRSSFromBuffer(const char *Buf) {
- // The format of the file is:
- // 1084 89 69 11 0 79 0
- // We need the second number which is RSS in pages.
- const char *Pos = Buf;
- // Skip the first number.
- while (*Pos >= '0' && *Pos <= '9')
- Pos++;
- // Skip whitespaces.
- while (!(*Pos >= '0' && *Pos <= '9') && *Pos != 0)
- Pos++;
- // Read the number.
- u64 Rss = 0;
- for (; *Pos >= '0' && *Pos <= '9'; Pos++)
- Rss = Rss * 10 + static_cast<u64>(*Pos) - '0';
- return static_cast<uptr>(Rss * getPageSizeCached());
-}
-
-uptr GetRSS() {
- // TODO: We currently use sanitizer_common's GetRSS which reads the
- // RSS from /proc/self/statm by default. We might want to
- // call getrusage directly, even if it's less accurate.
- auto Fd = open("/proc/self/statm", O_RDONLY);
- char Buf[64];
- s64 Len = read(Fd, Buf, sizeof(Buf) - 1);
- close(Fd);
- if (Len <= 0)
- return 0;
- Buf[Len] = 0;
-
- return GetRSSFromBuffer(Buf);
-}
-
void outputRaw(const char *Buffer) {
if (&async_safe_write_log) {
constexpr s32 AndroidLogInfo = 4;
diff --git a/compiler-rt/lib/scudo/standalone/local_cache.h b/compiler-rt/lib/scudo/standalone/local_cache.h
index 1095eb5f186d..46d6affdc033 100644
--- a/compiler-rt/lib/scudo/standalone/local_cache.h
+++ b/compiler-rt/lib/scudo/standalone/local_cache.h
@@ -22,80 +22,13 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
typedef typename SizeClassAllocator::SizeClassMap SizeClassMap;
typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;
- struct TransferBatch {
- static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint;
- void setFromArray(CompactPtrT *Array, u16 N) {
- DCHECK_LE(N, MaxNumCached);
- Count = N;
- memcpy(Batch, Array, sizeof(Batch[0]) * Count);
- }
- void appendFromArray(CompactPtrT *Array, u16 N) {
- DCHECK_LE(N, MaxNumCached - Count);
- memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
- // u16 will be promoted to int by arithmetic type conversion.
- Count = static_cast<u16>(Count + N);
- }
- void appendFromTransferBatch(TransferBatch *B, u16 N) {
- DCHECK_LE(N, MaxNumCached - Count);
- DCHECK_GE(B->Count, N);
- // Append from the back of `B`.
- memcpy(Batch + Count, B->Batch + (B->Count - N), sizeof(Batch[0]) * N);
- // u16 will be promoted to int by arithmetic type conversion.
- Count = static_cast<u16>(Count + N);
- B->Count = static_cast<u16>(B->Count - N);
- }
- void clear() { Count = 0; }
- void add(CompactPtrT P) {
- DCHECK_LT(Count, MaxNumCached);
- Batch[Count++] = P;
- }
- void copyToArray(CompactPtrT *Array) const {
- memcpy(Array, Batch, sizeof(Batch[0]) * Count);
- }
- u16 getCount() const { return Count; }
- bool isEmpty() const { return Count == 0U; }
- CompactPtrT get(u16 I) const {
- DCHECK_LE(I, Count);
- return Batch[I];
- }
- static u16 getMaxCached(uptr Size) {
- return Min(MaxNumCached, SizeClassMap::getMaxCachedHint(Size));
- }
- TransferBatch *Next;
-
- private:
- CompactPtrT Batch[MaxNumCached];
- u16 Count;
- };
-
- // A BatchGroup is used to collect blocks. Each group has a group id to
- // identify the group kind of contained blocks.
- struct BatchGroup {
- // `Next` is used by IntrusiveList.
- BatchGroup *Next;
- // The compact base address of each group
- uptr CompactPtrGroupBase;
- // Cache value of TransferBatch::getMaxCached()
- u16 MaxCachedPerBatch;
- // Number of blocks pushed into this group. This is an increment-only
- // counter.
- uptr PushedBlocks;
- // This is used to track how many bytes are not in-use since last time we
- // tried to release pages.
- uptr BytesInBGAtLastCheckpoint;
- // Blocks are managed by TransferBatch in a list.
- SinglyLinkedList<TransferBatch> Batches;
- };
-
- static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
- "BatchGroup uses the same class size as TransferBatch");
-
void init(GlobalStats *S, SizeClassAllocator *A) {
DCHECK(isEmpty());
Stats.init();
if (LIKELY(S))
S->link(&Stats);
Allocator = A;
+ initCache();
}
void destroy(GlobalStats *S) {
@@ -108,7 +41,9 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
DCHECK_LT(ClassId, NumClasses);
PerClass *C = &PerClassArray[ClassId];
if (C->Count == 0) {
- if (UNLIKELY(!refill(C, ClassId)))
+      // Refill half of the maximum number of cached blocks.
+ DCHECK_GT(C->MaxCount / 2, 0U);
+ if (UNLIKELY(!refill(C, ClassId, C->MaxCount / 2)))
return nullptr;
DCHECK_GT(C->Count, 0);
}
@@ -125,9 +60,6 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
bool deallocate(uptr ClassId, void *P) {
CHECK_LT(ClassId, NumClasses);
PerClass *C = &PerClassArray[ClassId];
- // We still have to initialize the cache in the event that the first heap
- // operation in a thread is a deallocation.
- initCacheMaybe(C);
// If the cache is full, drain half of blocks back to the main allocator.
const bool NeedToDrainCache = C->Count == C->MaxCount;
@@ -151,7 +83,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
}
void drain() {
- // Drain BatchClassId last as createBatch can refill it.
+ // Drain BatchClassId last as it may be needed while draining normal blocks.
for (uptr I = 0; I < NumClasses; ++I) {
if (I == BatchClassId)
continue;
@@ -163,19 +95,11 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
DCHECK(isEmpty());
}
- TransferBatch *createBatch(uptr ClassId, void *B) {
- if (ClassId != BatchClassId)
- B = allocate(BatchClassId);
+ void *getBatchClassBlock() {
+ void *B = allocate(BatchClassId);
if (UNLIKELY(!B))
reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
- return reinterpret_cast<TransferBatch *>(B);
- }
-
- BatchGroup *createGroup() {
- void *Ptr = allocate(BatchClassId);
- if (UNLIKELY(!Ptr))
- reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
- return reinterpret_cast<BatchGroup *>(Ptr);
+ return B;
}
LocalStats &getStats() { return Stats; }
@@ -203,6 +127,11 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
Str->append(" No block is cached.\n");
}
+ static u16 getMaxCached(uptr Size) {
+ return Min(SizeClassMap::MaxNumCachedHint,
+ SizeClassMap::getMaxCachedHint(Size));
+ }
+
private:
static const uptr NumClasses = SizeClassMap::NumClasses;
static const uptr BatchClassId = SizeClassMap::BatchClassId;
@@ -211,24 +140,17 @@ private:
u16 MaxCount;
// Note: ClassSize is zero for the transfer batch.
uptr ClassSize;
- CompactPtrT Chunks[2 * TransferBatch::MaxNumCached];
+ CompactPtrT Chunks[2 * SizeClassMap::MaxNumCachedHint];
};
PerClass PerClassArray[NumClasses] = {};
LocalStats Stats;
SizeClassAllocator *Allocator = nullptr;
- ALWAYS_INLINE void initCacheMaybe(PerClass *C) {
- if (LIKELY(C->MaxCount))
- return;
- initCache();
- DCHECK_NE(C->MaxCount, 0U);
- }
-
NOINLINE void initCache() {
for (uptr I = 0; I < NumClasses; I++) {
PerClass *P = &PerClassArray[I];
const uptr Size = SizeClassAllocator::getSizeByClassId(I);
- P->MaxCount = static_cast<u16>(2 * TransferBatch::getMaxCached(Size));
+ P->MaxCount = static_cast<u16>(2 * getMaxCached(Size));
if (I != BatchClassId) {
P->ClassSize = Size;
} else {
@@ -244,17 +166,12 @@ private:
deallocate(BatchClassId, B);
}
- NOINLINE bool refill(PerClass *C, uptr ClassId) {
- initCacheMaybe(C);
- TransferBatch *B = Allocator->popBatch(this, ClassId);
- if (UNLIKELY(!B))
- return false;
- DCHECK_GT(B->getCount(), 0);
- C->Count = B->getCount();
- B->copyToArray(C->Chunks);
- B->clear();
- destroyBatch(ClassId, B);
- return true;
+ NOINLINE bool refill(PerClass *C, uptr ClassId, u16 MaxRefill) {
+ const u16 NumBlocksRefilled =
+ Allocator->popBlocks(this, ClassId, C->Chunks, MaxRefill);
+ DCHECK_LE(NumBlocksRefilled, MaxRefill);
+ C->Count = static_cast<u16>(C->Count + NumBlocksRefilled);
+ return NumBlocksRefilled != 0;
}
NOINLINE void drain(PerClass *C, uptr ClassId) {
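To make the new refill contract concrete: the cache now asks the allocator for up to `MaxRefill` blocks written directly into the per-class `Chunks` array, instead of round-tripping through a `TransferBatch`. A minimal sketch of that contract (hypothetical helper, not part of the patch; `uptr`/`u16` as in Scudo):

// popBlocks() writes at most MaxBlockCount compact pointers into ToArray and
// returns how many blocks were actually delivered; 0 means none available.
template <class AllocatorT, typename CompactPtrT>
bool refillHalf(AllocatorT *Allocator, void *Cache, uptr ClassId,
                CompactPtrT *Chunks, u16 &Count, u16 MaxCount) {
  const u16 MaxRefill = MaxCount / 2; // refill half of the cache capacity
  const u16 N = Allocator->popBlocks(Cache, ClassId, Chunks + Count, MaxRefill);
  Count = static_cast<u16>(Count + N); // append to the cached blocks
  return N != 0;
}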
diff --git a/compiler-rt/lib/scudo/standalone/mem_map.h b/compiler-rt/lib/scudo/standalone/mem_map.h
index 409e4dbbe04b..b92216cf271d 100644
--- a/compiler-rt/lib/scudo/standalone/mem_map.h
+++ b/compiler-rt/lib/scudo/standalone/mem_map.h
@@ -22,6 +22,7 @@
#include "trusty.h"
#include "mem_map_fuchsia.h"
+#include "mem_map_linux.h"
namespace scudo {
@@ -73,10 +74,10 @@ private:
};
#if SCUDO_LINUX
-using ReservedMemoryT = ReservedMemoryDefault;
+using ReservedMemoryT = ReservedMemoryLinux;
using MemMapT = ReservedMemoryT::MemMapT;
#elif SCUDO_FUCHSIA
-using ReservedMemoryT = ReservedMemoryDefault;
+using ReservedMemoryT = ReservedMemoryFuchsia;
using MemMapT = ReservedMemoryT::MemMapT;
#elif SCUDO_TRUSTY
using ReservedMemoryT = ReservedMemoryDefault;
diff --git a/compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp b/compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp
index 9ace1fef7ad4..0566ab065526 100644
--- a/compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp
+++ b/compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp
@@ -41,7 +41,7 @@ static void setVmoName(zx_handle_t Vmo, const char *Name) {
static uptr getRootVmarBase() {
static atomic_uptr CachedResult = {0};
- uptr Result = atomic_load_relaxed(&CachedResult);
+ uptr Result = atomic_load(&CachedResult, memory_order_acquire);
if (UNLIKELY(!Result)) {
zx_info_vmar_t VmarInfo;
zx_status_t Status =
@@ -50,7 +50,7 @@ static uptr getRootVmarBase() {
CHECK_EQ(Status, ZX_OK);
CHECK_NE(VmarInfo.base, 0);
- atomic_store_relaxed(&CachedResult, VmarInfo.base);
+ atomic_store(&CachedResult, VmarInfo.base, memory_order_release);
Result = VmarInfo.base;
}
@@ -61,7 +61,7 @@ static uptr getRootVmarBase() {
static zx_handle_t getPlaceholderVmo() {
static atomic_u32 StoredVmo = {ZX_HANDLE_INVALID};
- zx_handle_t Vmo = atomic_load_relaxed(&StoredVmo);
+ zx_handle_t Vmo = atomic_load(&StoredVmo, memory_order_acquire);
if (UNLIKELY(Vmo == ZX_HANDLE_INVALID)) {
// Create a zero-sized placeholder VMO.
zx_status_t Status = _zx_vmo_create(0, 0, &Vmo);
@@ -72,9 +72,9 @@ static zx_handle_t getPlaceholderVmo() {
// Atomically store its handle. If some other thread wins the race, use its
// handle and discard ours.
- zx_handle_t OldValue =
- atomic_compare_exchange(&StoredVmo, ZX_HANDLE_INVALID, Vmo);
- if (OldValue != ZX_HANDLE_INVALID) {
+ zx_handle_t OldValue = atomic_compare_exchange_strong(
+ &StoredVmo, ZX_HANDLE_INVALID, Vmo, memory_order_acq_rel);
+ if (UNLIKELY(OldValue != ZX_HANDLE_INVALID)) {
Status = _zx_handle_close(Vmo);
CHECK_EQ(Status, ZX_OK);
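The relaxed-to-acquire/release change above is the classic one-time publication pattern: the release store makes the initialization visible to any thread whose acquire load observes the non-zero value. A self-contained sketch with std::atomic (Scudo uses its own atomic helpers; `queryBase` is a hypothetical stand-in for the VMAR query):

#include <atomic>
#include <cstdint>

std::atomic<uint64_t> CachedBase{0};

uint64_t queryBase(); // hypothetical expensive query, run at most a few times

uint64_t getBase() {
  // Acquire: observing a non-zero value also makes every write that happened
  // before the matching release store visible to this thread.
  uint64_t Result = CachedBase.load(std::memory_order_acquire);
  if (Result == 0) {
    Result = queryBase();
    // Release: publish the computed value and its side effects together.
    CachedBase.store(Result, std::memory_order_release);
  }
  return Result;
}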
diff --git a/compiler-rt/lib/scudo/standalone/mem_map_linux.cpp b/compiler-rt/lib/scudo/standalone/mem_map_linux.cpp
new file mode 100644
index 000000000000..783c4f0d9ab0
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/mem_map_linux.cpp
@@ -0,0 +1,153 @@
+//===-- mem_map_linux.cpp ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "platform.h"
+
+#if SCUDO_LINUX
+
+#include "mem_map_linux.h"
+
+#include "common.h"
+#include "internal_defs.h"
+#include "linux.h"
+#include "mutex.h"
+#include "report_linux.h"
+#include "string_utils.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/futex.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#if SCUDO_ANDROID
+// TODO(chiahungduan): Review if we still need the following macros.
+#include <sys/prctl.h>
+// Definitions of prctl arguments to set a vma name in Android kernels.
+#define ANDROID_PR_SET_VMA 0x53564d41
+#define ANDROID_PR_SET_VMA_ANON_NAME 0
+#endif
+
+namespace scudo {
+
+static void *mmapWrapper(uptr Addr, uptr Size, const char *Name, uptr Flags) {
+ int MmapFlags = MAP_PRIVATE | MAP_ANONYMOUS;
+ int MmapProt;
+ if (Flags & MAP_NOACCESS) {
+ MmapFlags |= MAP_NORESERVE;
+ MmapProt = PROT_NONE;
+ } else {
+ MmapProt = PROT_READ | PROT_WRITE;
+ }
+#if defined(__aarch64__)
+#ifndef PROT_MTE
+#define PROT_MTE 0x20
+#endif
+ if (Flags & MAP_MEMTAG)
+ MmapProt |= PROT_MTE;
+#endif
+ if (Addr)
+ MmapFlags |= MAP_FIXED;
+ void *P =
+ mmap(reinterpret_cast<void *>(Addr), Size, MmapProt, MmapFlags, -1, 0);
+ if (P == MAP_FAILED) {
+ if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM)
+ reportMapError(errno == ENOMEM ? Size : 0);
+ return nullptr;
+ }
+#if SCUDO_ANDROID
+ if (Name)
+ prctl(ANDROID_PR_SET_VMA, ANDROID_PR_SET_VMA_ANON_NAME, P, Size, Name);
+#else
+ (void)Name;
+#endif
+
+ return P;
+}
+
+bool MemMapLinux::mapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags) {
+ void *P = mmapWrapper(Addr, Size, Name, Flags);
+ if (P == nullptr)
+ return false;
+
+ MapBase = reinterpret_cast<uptr>(P);
+ MapCapacity = Size;
+ return true;
+}
+
+void MemMapLinux::unmapImpl(uptr Addr, uptr Size) {
+  // If we unmap all the pages, also reset `MapBase` to 0 to mark the mapping
+  // as invalid.
+ if (Size == MapCapacity) {
+ MapBase = MapCapacity = 0;
+ } else {
+    // This is a partial unmap. If the pages are unmapped from the beginning,
+    // shift `MapBase` to the new base.
+ if (MapBase == Addr)
+ MapBase = Addr + Size;
+ MapCapacity -= Size;
+ }
+
+ if (munmap(reinterpret_cast<void *>(Addr), Size) != 0)
+ reportUnmapError(Addr, Size);
+}
+
+bool MemMapLinux::remapImpl(uptr Addr, uptr Size, const char *Name,
+ uptr Flags) {
+ void *P = mmapWrapper(Addr, Size, Name, Flags);
+ if (reinterpret_cast<uptr>(P) != Addr)
+ reportMapError();
+ return true;
+}
+
+void MemMapLinux::setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags) {
+ int Prot = (Flags & MAP_NOACCESS) ? PROT_NONE : (PROT_READ | PROT_WRITE);
+ if (mprotect(reinterpret_cast<void *>(Addr), Size, Prot) != 0)
+ reportProtectError(Addr, Size, Prot);
+}
+
+void MemMapLinux::releaseAndZeroPagesToOSImpl(uptr From, uptr Size) {
+ void *Addr = reinterpret_cast<void *>(From);
+
+ while (madvise(Addr, Size, MADV_DONTNEED) == -1 && errno == EAGAIN) {
+ }
+}
+
+bool ReservedMemoryLinux::createImpl(uptr Addr, uptr Size, const char *Name,
+ uptr Flags) {
+ ReservedMemoryLinux::MemMapT MemMap;
+ if (!MemMap.map(Addr, Size, Name, Flags | MAP_NOACCESS))
+ return false;
+
+ MapBase = MemMap.getBase();
+ MapCapacity = MemMap.getCapacity();
+
+ return true;
+}
+
+void ReservedMemoryLinux::releaseImpl() {
+ if (munmap(reinterpret_cast<void *>(getBase()), getCapacity()) != 0)
+ reportUnmapError(getBase(), getCapacity());
+}
+
+ReservedMemoryLinux::MemMapT ReservedMemoryLinux::dispatchImpl(uptr Addr,
+ uptr Size) {
+ return ReservedMemoryLinux::MemMapT(Addr, Size);
+}
+
+} // namespace scudo
+
+#endif // SCUDO_LINUX
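The flag translation in `mmapWrapper()` follows the usual reserve-then-commit idiom: `MAP_NOACCESS` becomes `PROT_NONE | MAP_NORESERVE` (address space only), while committed mappings get `PROT_READ | PROT_WRITE`. A standalone POSIX sketch of the same idea, independent of the Scudo types:

#include <sys/mman.h>
#include <cstddef>

void *reserveAddressSpace(size_t Size) {
  // PROT_NONE + MAP_NORESERVE claims a range without committing memory.
  void *P = mmap(nullptr, Size, PROT_NONE,
                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
  return P == MAP_FAILED ? nullptr : P;
}

bool commit(void *P, size_t Size) {
  // Make a previously reserved range usable for read/write.
  return mprotect(P, Size, PROT_READ | PROT_WRITE) == 0;
}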
diff --git a/compiler-rt/lib/scudo/standalone/mem_map_linux.h b/compiler-rt/lib/scudo/standalone/mem_map_linux.h
new file mode 100644
index 000000000000..7a89b3bff5ed
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/mem_map_linux.h
@@ -0,0 +1,67 @@
+//===-- mem_map_linux.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_MEM_MAP_LINUX_H_
+#define SCUDO_MEM_MAP_LINUX_H_
+
+#include "platform.h"
+
+#if SCUDO_LINUX
+
+#include "common.h"
+#include "mem_map_base.h"
+
+namespace scudo {
+
+class MemMapLinux final : public MemMapBase<MemMapLinux> {
+public:
+ constexpr MemMapLinux() = default;
+ MemMapLinux(uptr Base, uptr Capacity)
+ : MapBase(Base), MapCapacity(Capacity) {}
+
+ // Impls for base functions.
+ bool mapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags = 0);
+ void unmapImpl(uptr Addr, uptr Size);
+ bool remapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags = 0);
+ void setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags);
+ void releasePagesToOSImpl(uptr From, uptr Size) {
+ return releaseAndZeroPagesToOSImpl(From, Size);
+ }
+ void releaseAndZeroPagesToOSImpl(uptr From, uptr Size);
+ uptr getBaseImpl() { return MapBase; }
+ uptr getCapacityImpl() { return MapCapacity; }
+
+private:
+ uptr MapBase = 0;
+ uptr MapCapacity = 0;
+};
+
+// This will be deprecated once every allocator is supported by each
+// platform's `MemMap` implementation.
+class ReservedMemoryLinux final
+ : public ReservedMemory<ReservedMemoryLinux, MemMapLinux> {
+public:
+  // The following two are the Impls for the functions in `MemMapBase`.
+ uptr getBaseImpl() { return MapBase; }
+ uptr getCapacityImpl() { return MapCapacity; }
+
+  // These three are specific to `ReservedMemory`.
+ bool createImpl(uptr Addr, uptr Size, const char *Name, uptr Flags);
+ void releaseImpl();
+ MemMapT dispatchImpl(uptr Addr, uptr Size);
+
+private:
+ uptr MapBase = 0;
+ uptr MapCapacity = 0;
+};
+
+} // namespace scudo
+
+#endif // SCUDO_LINUX
+
+#endif // SCUDO_MEM_MAP_LINUX_H_
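Assuming the CRTP bases forward `create()`/`dispatch()` and the `map()`-family calls to the `*Impl` methods declared here, intended usage would look roughly like this sketch (error handling elided; the forwarding names are an assumption based on the `*Impl` suffixes):

scudo::ReservedMemoryLinux Reserved;
if (Reserved.create(/*Addr=*/0, Size, "scudo:primary", /*Flags=*/0)) {
  // Carve a MemMap out of the PROT_NONE reservation, then commit it.
  scudo::MemMapLinux MemMap = Reserved.dispatch(Reserved.getBase(), Size);
  MemMap.remap(MemMap.getBase(), Size, "scudo:primary", /*Flags=*/0);
}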
diff --git a/compiler-rt/lib/scudo/standalone/mutex.h b/compiler-rt/lib/scudo/standalone/mutex.h
index 05340de3e12d..4caa945219b5 100644
--- a/compiler-rt/lib/scudo/standalone/mutex.h
+++ b/compiler-rt/lib/scudo/standalone/mutex.h
@@ -35,7 +35,7 @@ public:
#pragma nounroll
#endif
for (u8 I = 0U; I < NumberOfTries; I++) {
- yieldProcessor(NumberOfYields);
+ delayLoop();
if (tryLock())
return;
}
@@ -53,10 +53,23 @@ public:
}
private:
+ void delayLoop() {
+    // The value approximates the average time spent on cache accesses (the
+    // fastest operations), so that we are unlikely to spin much longer than a
+    // fast operation takes.
+ constexpr u32 SpinTimes = 16;
+ volatile u32 V = 0;
+ for (u32 I = 0; I < SpinTimes; ++I) {
+ u32 Tmp = V + 1;
+ V = Tmp;
+ }
+ }
+
void assertHeldImpl();
- static constexpr u8 NumberOfTries = 8U;
- static constexpr u8 NumberOfYields = 8U;
+  // TODO(chiahungduan): Adapt this value based on scenarios. E.g., the
+  // primary and secondary allocators have different allocation times.
+ static constexpr u8 NumberOfTries = 32U;
#if SCUDO_LINUX
atomic_u32 M = {};
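Taken together, the mutex now spins a short, fixed delay between `tryLock()` attempts and retries more times before blocking. The overall acquisition shape, as a sketch over a hypothetical `MutexLike` type (the `delayLoop()` and futex fallback are as in the patch above):

template <class MutexLike>
void lockWithBackoff(MutexLike &M) {
  constexpr int NumberOfTries = 32;
  for (int I = 0; I < NumberOfTries; I++) {
    M.delayLoop();     // ~cost of a few cache accesses, see above
    if (M.tryLock())
      return;          // acquired without entering the kernel
  }
  M.lockSlow();        // spin budget exhausted, block (futex on Linux)
}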
diff --git a/compiler-rt/lib/scudo/standalone/options.h b/compiler-rt/lib/scudo/standalone/options.h
index 4e6786513334..b20142a41590 100644
--- a/compiler-rt/lib/scudo/standalone/options.h
+++ b/compiler-rt/lib/scudo/standalone/options.h
@@ -38,7 +38,7 @@ struct Options {
}
};
-template <typename Config> bool useMemoryTagging(Options Options) {
+template <typename Config> bool useMemoryTagging(const Options &Options) {
return allocatorSupportsMemoryTagging<Config>() &&
Options.get(OptionBit::UseMemoryTagging);
}
diff --git a/compiler-rt/lib/scudo/standalone/platform.h b/compiler-rt/lib/scudo/standalone/platform.h
index 7c7024ff570e..b71a86be7669 100644
--- a/compiler-rt/lib/scudo/standalone/platform.h
+++ b/compiler-rt/lib/scudo/standalone/platform.h
@@ -63,6 +63,20 @@
#define SCUDO_CAN_USE_MTE (SCUDO_LINUX || SCUDO_TRUSTY)
#endif
+// Use smaller table sizes for fuzzing in order to reduce input size.
+// Trusty simply has less available memory.
+#ifndef SCUDO_SMALL_STACK_DEPOT
+#if defined(SCUDO_FUZZ) || SCUDO_TRUSTY
+#define SCUDO_SMALL_STACK_DEPOT 1
+#else
+#define SCUDO_SMALL_STACK_DEPOT 0
+#endif
+#endif
+
+#ifndef SCUDO_ENABLE_HOOKS
+#define SCUDO_ENABLE_HOOKS 0
+#endif
+
#ifndef SCUDO_MIN_ALIGNMENT_LOG
// We force malloc-type functions to be aligned to std::max_align_t, but there
// is no reason why the minimum alignment for all other functions can't be 8
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 1d8a34ec65d6..8281e02ba164 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -9,6 +9,7 @@
#ifndef SCUDO_PRIMARY32_H_
#define SCUDO_PRIMARY32_H_
+#include "allocator_common.h"
#include "bytemap.h"
#include "common.h"
#include "list.h"
@@ -53,12 +54,15 @@ public:
"");
typedef SizeClassAllocator32<Config> ThisT;
typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
- typedef typename CacheT::TransferBatch TransferBatch;
- typedef typename CacheT::BatchGroup BatchGroup;
+ typedef TransferBatch<ThisT> TransferBatchT;
+ typedef BatchGroup<ThisT> BatchGroupT;
+
+ static_assert(sizeof(BatchGroupT) <= sizeof(TransferBatchT),
+ "BatchGroupT uses the same class size as TransferBatchT");
static uptr getSizeByClassId(uptr ClassId) {
return (ClassId == SizeClassMap::BatchClassId)
- ? sizeof(TransferBatch)
+ ? sizeof(TransferBatchT)
: SizeClassMap::getSizeByClassId(ClassId);
}
@@ -126,7 +130,7 @@ public:
SizeClassInfo *Sci = getSizeClassInfo(I);
ScopedLock L1(Sci->Mutex);
uptr TotalBlocks = 0;
- for (BatchGroup &BG : Sci->FreeListInfo.BlockList) {
+ for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) {
// `BG::Batches` are `TransferBatches`. +1 for `BatchGroup`.
BatchClassUsedInFreeLists += BG.Batches.size() + 1;
for (const auto &It : BG.Batches)
@@ -141,7 +145,7 @@ public:
SizeClassInfo *Sci = getSizeClassInfo(SizeClassMap::BatchClassId);
ScopedLock L1(Sci->Mutex);
uptr TotalBlocks = 0;
- for (BatchGroup &BG : Sci->FreeListInfo.BlockList) {
+ for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) {
if (LIKELY(!BG.Batches.empty())) {
for (const auto &It : BG.Batches)
TotalBlocks += It.getCount();
@@ -187,11 +191,30 @@ public:
return BlockSize > PageSize;
}
- TransferBatch *popBatch(CacheT *C, uptr ClassId) {
+  // Note that `MaxBlockCount` will be used once we support popping an
+  // arbitrary number of blocks. For now it is the same as the number of
+  // blocks stored in a `TransferBatch`.
+ u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray,
+ UNUSED const u16 MaxBlockCount) {
+ TransferBatchT *B = popBatch(C, ClassId);
+ if (!B)
+ return 0;
+
+ const u16 Count = B->getCount();
+ DCHECK_GT(Count, 0U);
+ B->moveToArray(ToArray);
+
+ if (ClassId != SizeClassMap::BatchClassId)
+ C->deallocate(SizeClassMap::BatchClassId, B);
+
+ return Count;
+ }
+
+ TransferBatchT *popBatch(CacheT *C, uptr ClassId) {
DCHECK_LT(ClassId, NumClasses);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
ScopedLock L(Sci->Mutex);
- TransferBatch *B = popBatchImpl(C, ClassId, Sci);
+ TransferBatchT *B = popBatchImpl(C, ClassId, Sci);
if (UNLIKELY(!B)) {
if (UNLIKELY(!populateFreeList(C, ClassId, Sci)))
return nullptr;
@@ -311,6 +334,18 @@ public:
}
}
+ void getFragmentationInfo(ScopedString *Str) {
+ Str->append(
+ "Fragmentation Stats: SizeClassAllocator32: page size = %zu bytes\n",
+ getPageSizeCached());
+
+ for (uptr I = 1; I < NumClasses; I++) {
+ SizeClassInfo *Sci = getSizeClassInfo(I);
+ ScopedLock L(Sci->Mutex);
+ getSizeClassFragmentationInfo(Sci, I, Str);
+ }
+ }
+
bool setOption(Option O, sptr Value) {
if (O == Option::ReleaseInterval) {
const s32 Interval = Max(Min(static_cast<s32>(Value),
@@ -369,7 +404,7 @@ private:
};
struct BlocksInfo {
- SinglyLinkedList<BatchGroup> BlockList = {};
+ SinglyLinkedList<BatchGroupT> BlockList = {};
uptr PoppedBlocks = 0;
uptr PushedBlocks = 0;
};
@@ -493,11 +528,11 @@ private:
// reusable and don't need additional space for them.
Sci->FreeListInfo.PushedBlocks += Size;
- BatchGroup *BG = Sci->FreeListInfo.BlockList.front();
+ BatchGroupT *BG = Sci->FreeListInfo.BlockList.front();
if (BG == nullptr) {
// Construct `BatchGroup` on the last element.
- BG = reinterpret_cast<BatchGroup *>(
+ BG = reinterpret_cast<BatchGroupT *>(
decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1]));
--Size;
BG->Batches.clear();
@@ -508,8 +543,8 @@ private:
// from `CreateGroup` in `pushBlocksImpl`
BG->PushedBlocks = 1;
BG->BytesInBGAtLastCheckpoint = 0;
- BG->MaxCachedPerBatch = TransferBatch::getMaxCached(
- getSizeByClassId(SizeClassMap::BatchClassId));
+ BG->MaxCachedPerBatch =
+ CacheT::getMaxCached(getSizeByClassId(SizeClassMap::BatchClassId));
Sci->FreeListInfo.BlockList.push_front(BG);
}
@@ -522,7 +557,7 @@ private:
// 2. Only 1 block is pushed when the freelist is empty.
if (BG->Batches.empty()) {
// Construct the `TransferBatch` on the last element.
- TransferBatch *TB = reinterpret_cast<TransferBatch *>(
+ TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(
decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1]));
TB->clear();
// As mentioned above, addresses of `TransferBatch` and `BatchGroup` are
@@ -537,14 +572,14 @@ private:
BG->Batches.push_front(TB);
}
- TransferBatch *CurBatch = BG->Batches.front();
+ TransferBatchT *CurBatch = BG->Batches.front();
DCHECK_NE(CurBatch, nullptr);
for (u32 I = 0; I < Size;) {
u16 UnusedSlots =
static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
if (UnusedSlots == 0) {
- CurBatch = reinterpret_cast<TransferBatch *>(
+ CurBatch = reinterpret_cast<TransferBatchT *>(
decompactPtr(SizeClassMap::BatchClassId, Array[I]));
CurBatch->clear();
// Self-contained
@@ -588,24 +623,25 @@ private:
DCHECK_GT(Size, 0U);
auto CreateGroup = [&](uptr CompactPtrGroupBase) {
- BatchGroup *BG = C->createGroup();
+ BatchGroupT *BG =
+ reinterpret_cast<BatchGroupT *>(C->getBatchClassBlock());
BG->Batches.clear();
- TransferBatch *TB = C->createBatch(ClassId, nullptr);
+ TransferBatchT *TB =
+ reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock());
TB->clear();
BG->CompactPtrGroupBase = CompactPtrGroupBase;
BG->Batches.push_front(TB);
BG->PushedBlocks = 0;
BG->BytesInBGAtLastCheckpoint = 0;
- BG->MaxCachedPerBatch =
- TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+ BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId));
return BG;
};
- auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
- SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
- TransferBatch *CurBatch = Batches.front();
+ auto InsertBlocks = [&](BatchGroupT *BG, CompactPtrT *Array, u32 Size) {
+ SinglyLinkedList<TransferBatchT> &Batches = BG->Batches;
+ TransferBatchT *CurBatch = Batches.front();
DCHECK_NE(CurBatch, nullptr);
for (u32 I = 0; I < Size;) {
@@ -613,9 +649,8 @@ private:
u16 UnusedSlots =
static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
if (UnusedSlots == 0) {
- CurBatch = C->createBatch(
- ClassId,
- reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+ CurBatch =
+ reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock());
CurBatch->clear();
Batches.push_front(CurBatch);
UnusedSlots = BG->MaxCachedPerBatch;
@@ -630,11 +665,11 @@ private:
};
Sci->FreeListInfo.PushedBlocks += Size;
- BatchGroup *Cur = Sci->FreeListInfo.BlockList.front();
+ BatchGroupT *Cur = Sci->FreeListInfo.BlockList.front();
// In the following, `Cur` always points to the BatchGroup for blocks that
// will be pushed next. `Prev` is the element right before `Cur`.
- BatchGroup *Prev = nullptr;
+ BatchGroupT *Prev = nullptr;
while (Cur != nullptr &&
compactPtrGroupBase(Array[0]) > Cur->CompactPtrGroupBase) {
@@ -695,22 +730,22 @@ private:
// group id will be considered first.
//
// The region mutex needs to be held while calling this method.
- TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci)
+ TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci)
REQUIRES(Sci->Mutex) {
if (Sci->FreeListInfo.BlockList.empty())
return nullptr;
- SinglyLinkedList<TransferBatch> &Batches =
+ SinglyLinkedList<TransferBatchT> &Batches =
Sci->FreeListInfo.BlockList.front()->Batches;
if (Batches.empty()) {
DCHECK_EQ(ClassId, SizeClassMap::BatchClassId);
- BatchGroup *BG = Sci->FreeListInfo.BlockList.front();
+ BatchGroupT *BG = Sci->FreeListInfo.BlockList.front();
Sci->FreeListInfo.BlockList.pop_front();
// Block used by `BatchGroup` is from BatchClassId. Turn the block into
// `TransferBatch` with single block.
- TransferBatch *TB = reinterpret_cast<TransferBatch *>(BG);
+ TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(BG);
TB->clear();
TB->add(
compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB)));
@@ -718,13 +753,13 @@ private:
return TB;
}
- TransferBatch *B = Batches.front();
+ TransferBatchT *B = Batches.front();
Batches.pop_front();
DCHECK_NE(B, nullptr);
DCHECK_GT(B->getCount(), 0U);
if (Batches.empty()) {
- BatchGroup *BG = Sci->FreeListInfo.BlockList.front();
+ BatchGroupT *BG = Sci->FreeListInfo.BlockList.front();
Sci->FreeListInfo.BlockList.pop_front();
// We don't keep BatchGroup with zero blocks to avoid empty-checking while
@@ -763,7 +798,7 @@ private:
}
const uptr Size = getSizeByClassId(ClassId);
- const u16 MaxCount = TransferBatch::getMaxCached(Size);
+ const u16 MaxCount = CacheT::getMaxCached(Size);
DCHECK_GT(MaxCount, 0U);
// The maximum number of blocks we should carve in the region is dictated
// by the maximum number of batches we want to fill, and the amount of
@@ -776,7 +811,7 @@ private:
DCHECK_GT(NumberOfBlocks, 0U);
constexpr u32 ShuffleArraySize =
- MaxNumBatches * TransferBatch::MaxNumCached;
+ MaxNumBatches * TransferBatchT::MaxNumCached;
// Fill the transfer batches and put them in the size-class freelist. We
// need to randomize the blocks for security purposes, so we first fill a
// local array that we then shuffle before populating the batches.
@@ -856,11 +891,56 @@ private:
PushedBytesDelta >> 10);
}
+ void getSizeClassFragmentationInfo(SizeClassInfo *Sci, uptr ClassId,
+ ScopedString *Str) REQUIRES(Sci->Mutex) {
+ const uptr BlockSize = getSizeByClassId(ClassId);
+ const uptr First = Sci->MinRegionIndex;
+ const uptr Last = Sci->MaxRegionIndex;
+ const uptr Base = First * RegionSize;
+ const uptr NumberOfRegions = Last - First + 1U;
+ auto SkipRegion = [this, First, ClassId](uptr RegionIndex) {
+ ScopedLock L(ByteMapMutex);
+ return (PossibleRegions[First + RegionIndex] - 1U) != ClassId;
+ };
+
+ FragmentationRecorder Recorder;
+ if (!Sci->FreeListInfo.BlockList.empty()) {
+ PageReleaseContext Context =
+ markFreeBlocks(Sci, ClassId, BlockSize, Base, NumberOfRegions,
+ ReleaseToOS::ForceAll);
+ releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+ }
+
+ const uptr PageSize = getPageSizeCached();
+ const uptr TotalBlocks = Sci->AllocatedUser / BlockSize;
+ const uptr InUseBlocks =
+ Sci->FreeListInfo.PoppedBlocks - Sci->FreeListInfo.PushedBlocks;
+ uptr AllocatedPagesCount = 0;
+ if (TotalBlocks != 0U) {
+ for (uptr I = 0; I < NumberOfRegions; ++I) {
+ if (SkipRegion(I))
+ continue;
+ AllocatedPagesCount += RegionSize / PageSize;
+ }
+
+ DCHECK_NE(AllocatedPagesCount, 0U);
+ }
+
+ DCHECK_GE(AllocatedPagesCount, Recorder.getReleasedPagesCount());
+ const uptr InUsePages =
+ AllocatedPagesCount - Recorder.getReleasedPagesCount();
+ const uptr InUseBytes = InUsePages * PageSize;
+
+ Str->append(" %02zu (%6zu): inuse/total blocks: %6zu/%6zu inuse/total "
+ "pages: %6zu/%6zu inuse bytes: %6zuK\n",
+ ClassId, BlockSize, InUseBlocks, TotalBlocks, InUsePages,
+ AllocatedPagesCount, InUseBytes >> 10);
+ }
+
NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId,
ReleaseToOS ReleaseType = ReleaseToOS::Normal)
REQUIRES(Sci->Mutex) {
const uptr BlockSize = getSizeByClassId(ClassId);
- const uptr PageSize = getPageSizeCached();
DCHECK_GE(Sci->FreeListInfo.PoppedBlocks, Sci->FreeListInfo.PushedBlocks);
const uptr BytesInFreeList =
@@ -871,6 +951,60 @@ private:
if (UNLIKELY(BytesInFreeList == 0))
return 0;
+ // ====================================================================== //
+ // 1. Check if we have enough free blocks and if it's worth doing a page
+ // release.
+ // ====================================================================== //
+ if (ReleaseType != ReleaseToOS::ForceAll &&
+ !hasChanceToReleasePages(Sci, BlockSize, BytesInFreeList,
+ ReleaseType)) {
+ return 0;
+ }
+
+ const uptr First = Sci->MinRegionIndex;
+ const uptr Last = Sci->MaxRegionIndex;
+ DCHECK_NE(Last, 0U);
+ DCHECK_LE(First, Last);
+ uptr TotalReleasedBytes = 0;
+ const uptr Base = First * RegionSize;
+ const uptr NumberOfRegions = Last - First + 1U;
+
+ // ==================================================================== //
+    // 2. Mark the free blocks so that we can tell which pages are in use by
+    // querying `PageReleaseContext`.
+ // ==================================================================== //
+ PageReleaseContext Context = markFreeBlocks(Sci, ClassId, BlockSize, Base,
+ NumberOfRegions, ReleaseType);
+ if (!Context.hasBlockMarked())
+ return 0;
+
+ // ==================================================================== //
+ // 3. Release the unused physical pages back to the OS.
+ // ==================================================================== //
+ ReleaseRecorder Recorder(Base);
+ auto SkipRegion = [this, First, ClassId](uptr RegionIndex) {
+ ScopedLock L(ByteMapMutex);
+ return (PossibleRegions[First + RegionIndex] - 1U) != ClassId;
+ };
+ releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+
+ if (Recorder.getReleasedRangesCount() > 0) {
+ Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
+ Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
+ Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
+ TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
+ }
+ Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast();
+
+ return TotalReleasedBytes;
+ }
+
+ bool hasChanceToReleasePages(SizeClassInfo *Sci, uptr BlockSize,
+ uptr BytesInFreeList, ReleaseToOS ReleaseType)
+ REQUIRES(Sci->Mutex) {
+ DCHECK_GE(Sci->FreeListInfo.PoppedBlocks, Sci->FreeListInfo.PushedBlocks);
+ const uptr PageSize = getPageSizeCached();
+
if (BytesInFreeList <= Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint)
Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
@@ -892,22 +1026,20 @@ private:
// (BytesInFreeListAtLastCheckpoint - BytesInFreeList).
const uptr PushedBytesDelta =
BytesInFreeList - Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint;
- if (PushedBytesDelta < PageSize && ReleaseType != ReleaseToOS::ForceAll)
- return 0;
+ if (PushedBytesDelta < PageSize)
+ return false;
- const bool CheckDensity =
- isSmallBlock(BlockSize) && ReleaseType != ReleaseToOS::ForceAll;
// Releasing smaller blocks is expensive, so we want to make sure that a
// significant amount of bytes are free, and that there has been a good
// amount of batches pushed to the freelist before attempting to release.
- if (CheckDensity && ReleaseType == ReleaseToOS::Normal)
+ if (isSmallBlock(BlockSize) && ReleaseType == ReleaseToOS::Normal)
if (PushedBytesDelta < Sci->AllocatedUser / 16U)
- return 0;
+ return false;
if (ReleaseType == ReleaseToOS::Normal) {
const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs);
if (IntervalMs < 0)
- return 0;
+ return false;
// The constant 8 here is selected from profiling some apps and the number
// of unreleased pages in the large size classes is around 16 pages or
@@ -920,30 +1052,31 @@ private:
static_cast<u64>(IntervalMs) * 1000000 >
getMonotonicTimeFast()) {
// Memory was returned recently.
- return 0;
+ return false;
}
}
} // if (ReleaseType == ReleaseToOS::Normal)
- const uptr First = Sci->MinRegionIndex;
- const uptr Last = Sci->MaxRegionIndex;
- DCHECK_NE(Last, 0U);
- DCHECK_LE(First, Last);
- uptr TotalReleasedBytes = 0;
- const uptr Base = First * RegionSize;
- const uptr NumberOfRegions = Last - First + 1U;
- const uptr GroupSize = (1U << GroupSizeLog);
+ return true;
+ }
+
+ PageReleaseContext markFreeBlocks(SizeClassInfo *Sci, const uptr ClassId,
+ const uptr BlockSize, const uptr Base,
+ const uptr NumberOfRegions,
+ ReleaseToOS ReleaseType)
+ REQUIRES(Sci->Mutex) {
+ const uptr PageSize = getPageSizeCached();
+ const uptr GroupSize = (1UL << GroupSizeLog);
const uptr CurGroupBase =
compactPtrGroupBase(compactPtr(ClassId, Sci->CurrentRegion));
- ReleaseRecorder Recorder(Base);
PageReleaseContext Context(BlockSize, NumberOfRegions,
/*ReleaseSize=*/RegionSize);
auto DecompactPtr = [](CompactPtrT CompactPtr) {
return reinterpret_cast<uptr>(CompactPtr);
};
- for (BatchGroup &BG : Sci->FreeListInfo.BlockList) {
+ for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) {
const uptr GroupBase = decompactGroupBase(BG.CompactPtrGroupBase);
// The `GroupSize` may not be divided by `BlockSize`, which means there is
// an unused space at the end of Region. Exclude that space to avoid
@@ -960,25 +1093,27 @@ private:
BG.Batches.front()->getCount();
const uptr BytesInBG = NumBlocks * BlockSize;
- if (ReleaseType != ReleaseToOS::ForceAll &&
- BytesInBG <= BG.BytesInBGAtLastCheckpoint) {
- BG.BytesInBGAtLastCheckpoint = BytesInBG;
- continue;
- }
- const uptr PushedBytesDelta = BytesInBG - BG.BytesInBGAtLastCheckpoint;
- if (ReleaseType != ReleaseToOS::ForceAll && PushedBytesDelta < PageSize)
- continue;
+ if (ReleaseType != ReleaseToOS::ForceAll) {
+ if (BytesInBG <= BG.BytesInBGAtLastCheckpoint) {
+ BG.BytesInBGAtLastCheckpoint = BytesInBG;
+ continue;
+ }
- // Given the randomness property, we try to release the pages only if the
- // bytes used by free blocks exceed certain proportion of allocated
- // spaces.
- if (CheckDensity && (BytesInBG * 100U) / AllocatedGroupSize <
- (100U - 1U - BlockSize / 16U)) {
- continue;
+ const uptr PushedBytesDelta = BytesInBG - BG.BytesInBGAtLastCheckpoint;
+ if (PushedBytesDelta < PageSize)
+ continue;
+
+ // Given the randomness property, we try to release the pages only if
+ // the bytes used by free blocks exceed certain proportion of allocated
+ // spaces.
+ if (isSmallBlock(BlockSize) && (BytesInBG * 100U) / AllocatedGroupSize <
+ (100U - 1U - BlockSize / 16U)) {
+ continue;
+ }
}
// TODO: Consider updating this after page release if `ReleaseRecorder`
- // can tell the releasd bytes in each group.
+ // can tell the released bytes in each group.
BG.BytesInBGAtLastCheckpoint = BytesInBG;
const uptr MaxContainedBlocks = AllocatedGroupSize / BlockSize;
@@ -1006,27 +1141,10 @@ private:
     // We may not be able to do the page release in the rare case that the
     // PageMap allocation fails.
if (UNLIKELY(!Context.hasBlockMarked()))
- return 0;
- }
-
- if (!Context.hasBlockMarked())
- return 0;
-
- auto SkipRegion = [this, First, ClassId](uptr RegionIndex) {
- ScopedLock L(ByteMapMutex);
- return (PossibleRegions[First + RegionIndex] - 1U) != ClassId;
- };
- releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
-
- if (Recorder.getReleasedRangesCount() > 0) {
- Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
- Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
- Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
- TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
+ break;
}
- Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast();
- return TotalReleasedBytes;
+ return Context;
}
SizeClassInfo SizeClassInfoArray[NumClasses] = {};
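The per-class fragmentation report boils down to simple page arithmetic: pages that a forced release could reclaim are free, the remainder are in use. A hypothetical worked example (all numbers made up) of what `getSizeClassFragmentationInfo()` computes:

const uptr PageSize = 4096;
const uptr AllocatedPagesCount = 256;          // pages mapped for the class
const uptr ReleasedPagesCount = 100;           // from FragmentationRecorder
const uptr InUsePages = AllocatedPagesCount - ReleasedPagesCount; // 156
const uptr InUseBytes = InUsePages * PageSize; // 156 * 4096 = 638976 (624K)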
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index fd7a1f9e80cd..d1929ff7212f 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -9,6 +9,7 @@
#ifndef SCUDO_PRIMARY64_H_
#define SCUDO_PRIMARY64_H_
+#include "allocator_common.h"
#include "bytemap.h"
#include "common.h"
#include "list.h"
@@ -21,6 +22,8 @@
#include "string_utils.h"
#include "thread_annotations.h"
+#include "condition_variable.h"
+
namespace scudo {
// SizeClassAllocator64 is an allocator tuned for 64-bit address space.
@@ -47,27 +50,39 @@ template <typename Config> class SizeClassAllocator64 {
public:
typedef typename Config::Primary::CompactPtrT CompactPtrT;
typedef typename Config::Primary::SizeClassMap SizeClassMap;
+ typedef typename ConditionVariableState<
+ typename Config::Primary>::ConditionVariableT ConditionVariableT;
static const uptr CompactPtrScale = Config::Primary::CompactPtrScale;
+ static const uptr RegionSizeLog = Config::Primary::RegionSizeLog;
static const uptr GroupSizeLog = Config::Primary::GroupSizeLog;
+ static_assert(RegionSizeLog >= GroupSizeLog,
+ "Group size shouldn't be greater than the region size");
static const uptr GroupScale = GroupSizeLog - CompactPtrScale;
typedef SizeClassAllocator64<Config> ThisT;
typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
- typedef typename CacheT::TransferBatch TransferBatch;
- typedef typename CacheT::BatchGroup BatchGroup;
+ typedef TransferBatch<ThisT> TransferBatchT;
+ typedef BatchGroup<ThisT> BatchGroupT;
+
+ static_assert(sizeof(BatchGroupT) <= sizeof(TransferBatchT),
+ "BatchGroupT uses the same class size as TransferBatchT");
static uptr getSizeByClassId(uptr ClassId) {
return (ClassId == SizeClassMap::BatchClassId)
- ? roundUp(sizeof(TransferBatch), 1U << CompactPtrScale)
+ ? roundUp(sizeof(TransferBatchT), 1U << CompactPtrScale)
: SizeClassMap::getSizeByClassId(ClassId);
}
static bool canAllocate(uptr Size) { return Size <= SizeClassMap::MaxSize; }
+ static bool conditionVariableEnabled() {
+ return ConditionVariableState<typename Config::Primary>::enabled();
+ }
+
void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS {
DCHECK(isAligned(reinterpret_cast<uptr>(this), alignof(ThisT)));
const uptr PageSize = getPageSizeCached();
- const uptr GroupSize = (1U << GroupSizeLog);
+ const uptr GroupSize = (1UL << GroupSizeLog);
const uptr PagesInGroup = GroupSize / PageSize;
const uptr MinSizeClass = getSizeByClassId(1);
// When trying to release pages back to memory, visiting smaller size
@@ -117,13 +132,13 @@ public:
for (uptr I = 0; I < NumClasses; I++) {
RegionInfo *Region = getRegionInfo(I);
+
// The actual start of a region is offset by a random number of pages
// when PrimaryEnableRandomOffset is set.
- Region->RegionBeg =
- (PrimaryBase + (I << Config::Primary::RegionSizeLog)) +
- (Config::Primary::EnableRandomOffset
- ? ((getRandomModN(&Seed, 16) + 1) * PageSize)
- : 0);
+ Region->RegionBeg = (PrimaryBase + (I << RegionSizeLog)) +
+ (Config::Primary::EnableRandomOffset
+ ? ((getRandomModN(&Seed, 16) + 1) * PageSize)
+ : 0);
Region->RandState = getRandomU32(&Seed);
// Releasing small blocks is expensive, set a higher threshold to avoid
// frequent page releases.
@@ -134,11 +149,16 @@ public:
Region->ReleaseInfo.LastReleaseAtNs = Time;
Region->MemMapInfo.MemMap = ReservedMemory.dispatch(
- PrimaryBase + (I << Config::Primary::RegionSizeLog), RegionSize);
+ PrimaryBase + (I << RegionSizeLog), RegionSize);
CHECK(Region->MemMapInfo.MemMap.isAllocated());
}
shuffle(RegionInfoArray, NumClasses, &Seed);
+    // The binding should be done after region shuffling so that it won't bind
+    // to the FLLock of the wrong region.
+ for (uptr I = 0; I < NumClasses; I++)
+ getRegionInfo(I)->FLLockCV.bindTestOnly(getRegionInfo(I)->FLLock);
+
setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
}
@@ -165,7 +185,7 @@ public:
ScopedLock FL(Region->FLLock);
const uptr BlockSize = getSizeByClassId(I);
uptr TotalBlocks = 0;
- for (BatchGroup &BG : Region->FreeListInfo.BlockList) {
+ for (BatchGroupT &BG : Region->FreeListInfo.BlockList) {
// `BG::Batches` are `TransferBatches`. +1 for `BatchGroup`.
BatchClassUsedInFreeLists += BG.Batches.size() + 1;
for (const auto &It : BG.Batches)
@@ -182,7 +202,7 @@ public:
ScopedLock FL(Region->FLLock);
const uptr BlockSize = getSizeByClassId(SizeClassMap::BatchClassId);
uptr TotalBlocks = 0;
- for (BatchGroup &BG : Region->FreeListInfo.BlockList) {
+ for (BatchGroupT &BG : Region->FreeListInfo.BlockList) {
if (LIKELY(!BG.Batches.empty())) {
for (const auto &It : BG.Batches)
TotalBlocks += It.getCount();
@@ -201,51 +221,64 @@ public:
DCHECK_EQ(BlocksInUse, BatchClassUsedInFreeLists);
}
- TransferBatch *popBatch(CacheT *C, uptr ClassId) {
+  // Note that `MaxBlockCount` will be used once we support popping an
+  // arbitrary number of blocks. For now it is the same as the number of
+  // blocks stored in a `TransferBatch`.
+ u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray,
+ UNUSED const u16 MaxBlockCount) {
+ TransferBatchT *B = popBatch(C, ClassId);
+ if (!B)
+ return 0;
+
+ const u16 Count = B->getCount();
+ DCHECK_GT(Count, 0U);
+ B->moveToArray(ToArray);
+
+ if (ClassId != SizeClassMap::BatchClassId)
+ C->deallocate(SizeClassMap::BatchClassId, B);
+
+ return Count;
+ }
+
+ TransferBatchT *popBatch(CacheT *C, uptr ClassId) {
DCHECK_LT(ClassId, NumClasses);
RegionInfo *Region = getRegionInfo(ClassId);
{
ScopedLock L(Region->FLLock);
- TransferBatch *B = popBatchImpl(C, ClassId, Region);
+ TransferBatchT *B = popBatchImpl(C, ClassId, Region);
if (LIKELY(B))
return B;
}
- bool PrintStats = false;
- TransferBatch *B = nullptr;
+ bool ReportRegionExhausted = false;
+ TransferBatchT *B = nullptr;
- while (true) {
- // When two threads compete for `Region->MMLock`, we only want one of them
- // does the populateFreeListAndPopBatch(). To avoid both of them doing
- // that, always check the freelist before mapping new pages.
- //
- // TODO(chiahungduan): Use a condition variable so that we don't need to
- // hold `Region->MMLock` here.
- ScopedLock ML(Region->MMLock);
- {
- ScopedLock FL(Region->FLLock);
- B = popBatchImpl(C, ClassId, Region);
- if (LIKELY(B))
- return B;
- }
+ if (conditionVariableEnabled()) {
+ B = popBatchWithCV(C, ClassId, Region, ReportRegionExhausted);
+ } else {
+ while (true) {
+ // When two threads compete for `Region->MMLock`, we only want one of
+ // them to call populateFreeListAndPopBatch(). To avoid both of them
+ // doing that, always check the freelist before mapping new pages.
+ ScopedLock ML(Region->MMLock);
+ {
+ ScopedLock FL(Region->FLLock);
+ if ((B = popBatchImpl(C, ClassId, Region)))
+ break;
+ }
- const bool RegionIsExhausted = Region->Exhausted;
- if (!RegionIsExhausted)
- B = populateFreeListAndPopBatch(C, ClassId, Region);
- PrintStats = !RegionIsExhausted && Region->Exhausted;
- break;
+ const bool RegionIsExhausted = Region->Exhausted;
+ if (!RegionIsExhausted)
+ B = populateFreeListAndPopBatch(C, ClassId, Region);
+ ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted;
+ break;
+ }
}
- // Note that `getStats()` requires locking each region so we can't call it
- // while locking the Region->Mutex in the above.
- if (UNLIKELY(PrintStats)) {
- ScopedString Str;
- getStats(&Str);
- Str.append(
- "Scudo OOM: The process has exhausted %zuM for size class %zu.\n",
- RegionSize >> 20, getSizeByClassId(ClassId));
- Str.output();
+ if (UNLIKELY(ReportRegionExhausted)) {
+ Printf("Can't populate more pages for size class %zu.\n",
+ getSizeByClassId(ClassId));
// Theoretically, BatchClass shouldn't be used up. Abort immediately when
// it happens.
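`popBlocks()` adapts the existing batch interface to the new array interface: drain one `TransferBatch` into the caller's array and recycle the batch block, unless the batch is its own storage (the BatchClassId case). A sketch over hypothetical template parameters:

template <class BatchT, class CacheT, typename CompactPtrT>
u16 drainBatch(BatchT *B, CacheT *C, CompactPtrT *ToArray, uptr ClassId,
               uptr BatchClassId) {
  const u16 Count = B->getCount();
  B->moveToArray(ToArray); // copy the compact pointers out
  // For BatchClassId, the batch block is one of the blocks it carries, so it
  // must not be freed separately.
  if (ClassId != BatchClassId)
    C->deallocate(BatchClassId, B);
  return Count;
}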
@@ -265,30 +298,36 @@ public:
if (ClassId == SizeClassMap::BatchClassId) {
ScopedLock L(Region->FLLock);
pushBatchClassBlocks(Region, Array, Size);
+ if (conditionVariableEnabled())
+ Region->FLLockCV.notifyAll(Region->FLLock);
return;
}
// TODO(chiahungduan): Consider not doing grouping if the group size is not
// greater than the block size with a certain scale.
- // Sort the blocks so that blocks belonging to the same group can be pushed
- // together.
bool SameGroup = true;
- for (u32 I = 1; I < Size; ++I) {
- if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
- SameGroup = false;
- CompactPtrT Cur = Array[I];
- u32 J = I;
- while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
- Array[J] = Array[J - 1];
- --J;
+ if (GroupSizeLog < RegionSizeLog) {
+ // Sort the blocks so that blocks belonging to the same group can be
+ // pushed together.
+ for (u32 I = 1; I < Size; ++I) {
+ if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
+ SameGroup = false;
+ CompactPtrT Cur = Array[I];
+ u32 J = I;
+ while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
+ Array[J] = Array[J - 1];
+ --J;
+ }
+ Array[J] = Cur;
}
- Array[J] = Cur;
}
{
ScopedLock L(Region->FLLock);
pushBlocksImpl(C, ClassId, Region, Array, Size, SameGroup);
+ if (conditionVariableEnabled())
+ Region->FLLockCV.notifyAll(Region->FLLock);
}
}
@@ -363,6 +402,18 @@ public:
}
}
+ void getFragmentationInfo(ScopedString *Str) {
+ Str->append(
+ "Fragmentation Stats: SizeClassAllocator64: page size = %zu bytes\n",
+ getPageSizeCached());
+
+ for (uptr I = 1; I < NumClasses; I++) {
+ RegionInfo *Region = getRegionInfo(I);
+ ScopedLock L(Region->MMLock);
+ getRegionFragmentationInfo(Region, I, Str);
+ }
+ }
+
bool setOption(Option O, sptr Value) {
if (O == Option::ReleaseInterval) {
const s32 Interval = Max(Min(static_cast<s32>(Value),
@@ -477,7 +528,7 @@ public:
AtomicOptions Options;
private:
- static const uptr RegionSize = 1UL << Config::Primary::RegionSizeLog;
+ static const uptr RegionSize = 1UL << RegionSizeLog;
static const uptr NumClasses = SizeClassMap::NumClasses;
static const uptr PrimarySize = RegionSize * NumClasses;
@@ -493,7 +544,7 @@ private:
};
struct BlocksInfo {
- SinglyLinkedList<BatchGroup> BlockList = {};
+ SinglyLinkedList<BatchGroupT> BlockList = {};
uptr PoppedBlocks = 0;
uptr PushedBlocks = 0;
};
@@ -509,6 +560,7 @@ private:
struct UnpaddedRegionInfo {
// Mutex for operations on freelist
HybridMutex FLLock;
+ ConditionVariableT FLLockCV GUARDED_BY(FLLock);
// Mutex for memmap operations
HybridMutex MMLock ACQUIRED_BEFORE(FLLock);
// `RegionBeg` is initialized before thread creation and won't be changed.
@@ -520,6 +572,7 @@ private:
uptr TryReleaseThreshold GUARDED_BY(MMLock) = 0;
ReleaseToOsInfo ReleaseInfo GUARDED_BY(MMLock) = {};
bool Exhausted GUARDED_BY(MMLock) = false;
+ bool isPopulatingFreeList GUARDED_BY(FLLock) = false;
};
struct RegionInfo : UnpaddedRegionInfo {
char Padding[SCUDO_CACHE_LINE_SIZE -
@@ -605,11 +658,11 @@ private:
// reusable and don't need additional space for them.
Region->FreeListInfo.PushedBlocks += Size;
- BatchGroup *BG = Region->FreeListInfo.BlockList.front();
+ BatchGroupT *BG = Region->FreeListInfo.BlockList.front();
if (BG == nullptr) {
// Construct `BatchGroup` on the last element.
- BG = reinterpret_cast<BatchGroup *>(
+ BG = reinterpret_cast<BatchGroupT *>(
decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1]));
--Size;
BG->Batches.clear();
@@ -620,8 +673,8 @@ private:
// from `CreateGroup` in `pushBlocksImpl`
BG->PushedBlocks = 1;
BG->BytesInBGAtLastCheckpoint = 0;
- BG->MaxCachedPerBatch = TransferBatch::getMaxCached(
- getSizeByClassId(SizeClassMap::BatchClassId));
+ BG->MaxCachedPerBatch =
+ CacheT::getMaxCached(getSizeByClassId(SizeClassMap::BatchClassId));
Region->FreeListInfo.BlockList.push_front(BG);
}
@@ -634,7 +687,7 @@ private:
// 2. Only 1 block is pushed when the freelist is empty.
if (BG->Batches.empty()) {
// Construct the `TransferBatch` on the last element.
- TransferBatch *TB = reinterpret_cast<TransferBatch *>(
+ TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(
decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1]));
TB->clear();
// As mentioned above, addresses of `TransferBatch` and `BatchGroup` are
@@ -649,14 +702,14 @@ private:
BG->Batches.push_front(TB);
}
- TransferBatch *CurBatch = BG->Batches.front();
+ TransferBatchT *CurBatch = BG->Batches.front();
DCHECK_NE(CurBatch, nullptr);
for (u32 I = 0; I < Size;) {
u16 UnusedSlots =
static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
if (UnusedSlots == 0) {
- CurBatch = reinterpret_cast<TransferBatch *>(
+ CurBatch = reinterpret_cast<TransferBatchT *>(
decompactPtr(SizeClassMap::BatchClassId, Array[I]));
CurBatch->clear();
// Self-contained
@@ -699,24 +752,25 @@ private:
DCHECK_GT(Size, 0U);
auto CreateGroup = [&](uptr CompactPtrGroupBase) {
- BatchGroup *BG = C->createGroup();
+ BatchGroupT *BG =
+ reinterpret_cast<BatchGroupT *>(C->getBatchClassBlock());
BG->Batches.clear();
- TransferBatch *TB = C->createBatch(ClassId, nullptr);
+ TransferBatchT *TB =
+ reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock());
TB->clear();
BG->CompactPtrGroupBase = CompactPtrGroupBase;
BG->Batches.push_front(TB);
BG->PushedBlocks = 0;
BG->BytesInBGAtLastCheckpoint = 0;
- BG->MaxCachedPerBatch =
- TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+ BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId));
return BG;
};
- auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
- SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
- TransferBatch *CurBatch = Batches.front();
+ auto InsertBlocks = [&](BatchGroupT *BG, CompactPtrT *Array, u32 Size) {
+ SinglyLinkedList<TransferBatchT> &Batches = BG->Batches;
+ TransferBatchT *CurBatch = Batches.front();
DCHECK_NE(CurBatch, nullptr);
for (u32 I = 0; I < Size;) {
@@ -724,9 +778,8 @@ private:
u16 UnusedSlots =
static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
if (UnusedSlots == 0) {
- CurBatch = C->createBatch(
- ClassId,
- reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+ CurBatch =
+ reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock());
CurBatch->clear();
Batches.push_front(CurBatch);
UnusedSlots = BG->MaxCachedPerBatch;
@@ -741,21 +794,11 @@ private:
};
Region->FreeListInfo.PushedBlocks += Size;
- BatchGroup *Cur = Region->FreeListInfo.BlockList.front();
-
- if (ClassId == SizeClassMap::BatchClassId) {
- if (Cur == nullptr) {
- // Don't need to classify BatchClassId.
- Cur = CreateGroup(/*CompactPtrGroupBase=*/0);
- Region->FreeListInfo.BlockList.push_front(Cur);
- }
- InsertBlocks(Cur, Array, Size);
- return;
- }
+ BatchGroupT *Cur = Region->FreeListInfo.BlockList.front();
// In the following, `Cur` always points to the BatchGroup for blocks that
// will be pushed next. `Prev` is the element right before `Cur`.
- BatchGroup *Prev = nullptr;
+ BatchGroupT *Prev = nullptr;
while (Cur != nullptr &&
compactPtrGroup(Array[0]) > Cur->CompactPtrGroupBase) {
@@ -812,26 +855,96 @@ private:
InsertBlocks(Cur, Array + Size - Count, Count);
}
+ TransferBatchT *popBatchWithCV(CacheT *C, uptr ClassId, RegionInfo *Region,
+ bool &ReportRegionExhausted) {
+ TransferBatchT *B = nullptr;
+
+ while (true) {
+      // We only expect one thread to be refilling the freelist; the other
+      // threads wait for either the completion of
+      // `populateFreeListAndPopBatch()` or a `pushBlocks()` done by another
+      // thread.
+ bool PopulateFreeList = false;
+ {
+ ScopedLock FL(Region->FLLock);
+ if (!Region->isPopulatingFreeList) {
+ Region->isPopulatingFreeList = true;
+ PopulateFreeList = true;
+ }
+ }
+
+ if (PopulateFreeList) {
+ ScopedLock ML(Region->MMLock);
+
+ const bool RegionIsExhausted = Region->Exhausted;
+ if (!RegionIsExhausted)
+ B = populateFreeListAndPopBatch(C, ClassId, Region);
+ ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted;
+
+ {
+        // Before reacquiring the `FLLock`, the freelist may be used up again
+        // and some threads may be waiting for this thread to refill it. It's
+        // important to set `Region->isPopulatingFreeList` to false so that
+        // threads about to sleep will notice the status change.
+ ScopedLock FL(Region->FLLock);
+ Region->isPopulatingFreeList = false;
+ Region->FLLockCV.notifyAll(Region->FLLock);
+ }
+
+ break;
+ }
+
+      // At this point, two preconditions must hold before waiting:
+      // 1. The freelist is empty.
+      // 2. Region->isPopulatingFreeList == true, i.e., someone is still doing
+      //    `populateFreeListAndPopBatch()`.
+      //
+      // Note that the freelist may be empty while
+      // Region->isPopulatingFreeList == false, because all the newly
+      // populated blocks may have been used up right after the refill.
+      // Therefore, we have to check whether someone is still populating the
+      // freelist.
+ ScopedLock FL(Region->FLLock);
+ if (LIKELY(B = popBatchImpl(C, ClassId, Region)))
+ break;
+
+ if (!Region->isPopulatingFreeList)
+ continue;
+
+      // Now the freelist is empty and someone is refilling it. We wait until
+      // either the freelist is refilled or `populateFreeListAndPopBatch()`
+      // finishes. The refill can be done by `populateFreeListAndPopBatch()`,
+      // `pushBlocks()`, `pushBatchClassBlocks()` and
+      // `mergeGroupsToReleaseBack()`.
+ Region->FLLockCV.wait(Region->FLLock);
+
+ if (LIKELY(B = popBatchImpl(C, ClassId, Region)))
+ break;
+ }
+
+ return B;
+ }
+
// Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
// group id will be considered first.
//
// The region mutex needs to be held while calling this method.
- TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region)
+ TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region)
REQUIRES(Region->FLLock) {
if (Region->FreeListInfo.BlockList.empty())
return nullptr;
- SinglyLinkedList<TransferBatch> &Batches =
+ SinglyLinkedList<TransferBatchT> &Batches =
Region->FreeListInfo.BlockList.front()->Batches;
if (Batches.empty()) {
DCHECK_EQ(ClassId, SizeClassMap::BatchClassId);
- BatchGroup *BG = Region->FreeListInfo.BlockList.front();
+ BatchGroupT *BG = Region->FreeListInfo.BlockList.front();
Region->FreeListInfo.BlockList.pop_front();
// Block used by `BatchGroup` is from BatchClassId. Turn the block into
// `TransferBatch` with single block.
- TransferBatch *TB = reinterpret_cast<TransferBatch *>(BG);
+ TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(BG);
TB->clear();
TB->add(
compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB)));
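The `isPopulatingFreeList` flag plus `FLLockCV` implement a single-refiller protocol: exactly one thread maps new pages while the rest sleep on the condition variable and re-check the freelist on wakeup. A self-contained sketch with std::condition_variable (Scudo's CV is futex-based but the shape is the same; `populateFreeList` is a hypothetical stand-in and exhaustion handling is elided):

#include <condition_variable>
#include <mutex>

std::mutex FLLock;
std::condition_variable FLLockCV;
bool IsPopulating = false;
int FreeBlocks = 0;

int populateFreeList(); // hypothetical: maps pages, returns blocks added

int popBlock() {
  std::unique_lock<std::mutex> L(FLLock);
  while (FreeBlocks == 0) {
    if (!IsPopulating) {
      IsPopulating = true;          // become the single refiller
      L.unlock();                   // don't hold the lock while mapping
      const int Populated = populateFreeList();
      L.lock();
      FreeBlocks += Populated;
      IsPopulating = false;         // status change must precede wakeup
      FLLockCV.notify_all();
    } else {
      FLLockCV.wait(L);             // woken by a refill or a push
    }
  }
  return --FreeBlocks;
}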
@@ -839,13 +952,13 @@ private:
return TB;
}
- TransferBatch *B = Batches.front();
+ TransferBatchT *B = Batches.front();
Batches.pop_front();
DCHECK_NE(B, nullptr);
DCHECK_GT(B->getCount(), 0U);
if (Batches.empty()) {
- BatchGroup *BG = Region->FreeListInfo.BlockList.front();
+ BatchGroupT *BG = Region->FreeListInfo.BlockList.front();
Region->FreeListInfo.BlockList.pop_front();
// We don't keep BatchGroup with zero blocks to avoid empty-checking while
@@ -863,11 +976,11 @@ private:
}
// Refill the freelist and return one batch.
- NOINLINE TransferBatch *populateFreeListAndPopBatch(CacheT *C, uptr ClassId,
- RegionInfo *Region)
+ NOINLINE TransferBatchT *populateFreeListAndPopBatch(CacheT *C, uptr ClassId,
+ RegionInfo *Region)
REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
const uptr Size = getSizeByClassId(ClassId);
- const u16 MaxCount = TransferBatch::getMaxCached(Size);
+ const u16 MaxCount = CacheT::getMaxCached(Size);
const uptr RegionBeg = Region->RegionBeg;
const uptr MappedUser = Region->MemMapInfo.MappedUser;
@@ -903,7 +1016,7 @@ private:
DCHECK_GT(NumberOfBlocks, 0);
constexpr u32 ShuffleArraySize =
- MaxNumBatches * TransferBatch::MaxNumCached;
+ MaxNumBatches * TransferBatchT::MaxNumCached;
CompactPtrT ShuffleArray[ShuffleArraySize];
DCHECK_LE(NumberOfBlocks, ShuffleArraySize);
@@ -936,7 +1049,7 @@ private:
pushBatchClassBlocks(Region, ShuffleArray, NumberOfBlocks);
}
- TransferBatch *B = popBatchImpl(C, ClassId, Region);
+ TransferBatchT *B = popBatchImpl(C, ClassId, Region);
DCHECK_NE(B, nullptr);
// Note that `PushedBlocks` and `PoppedBlocks` are supposed to only record
@@ -957,10 +1070,10 @@ private:
if (Region->MemMapInfo.MappedUser == 0)
return;
const uptr BlockSize = getSizeByClassId(ClassId);
- const uptr InUse =
+ const uptr InUseBlocks =
Region->FreeListInfo.PoppedBlocks - Region->FreeListInfo.PushedBlocks;
const uptr BytesInFreeList =
- Region->MemMapInfo.AllocatedUser - InUse * BlockSize;
+ Region->MemMapInfo.AllocatedUser - InUseBlocks * BlockSize;
uptr RegionPushedBytesDelta = 0;
if (BytesInFreeList >=
Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint) {
@@ -972,122 +1085,141 @@ private:
"%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu "
"inuse: %6zu total: %6zu releases: %6zu last "
"released: %6zuK latest pushed bytes: %6zuK region: 0x%zx (0x%zx)\n",
- Region->Exhausted ? "F" : " ", ClassId, getSizeByClassId(ClassId),
+ Region->Exhausted ? "E" : " ", ClassId, getSizeByClassId(ClassId),
Region->MemMapInfo.MappedUser >> 10, Region->FreeListInfo.PoppedBlocks,
- Region->FreeListInfo.PushedBlocks, InUse, TotalChunks,
+ Region->FreeListInfo.PushedBlocks, InUseBlocks, TotalChunks,
Region->ReleaseInfo.RangesReleased,
Region->ReleaseInfo.LastReleasedBytes >> 10,
RegionPushedBytesDelta >> 10, Region->RegionBeg,
getRegionBaseByClassId(ClassId));
}
- NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId,
- ReleaseToOS ReleaseType = ReleaseToOS::Normal)
- REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
- ScopedLock L(Region->FLLock);
-
+ void getRegionFragmentationInfo(RegionInfo *Region, uptr ClassId,
+ ScopedString *Str) REQUIRES(Region->MMLock) {
const uptr BlockSize = getSizeByClassId(ClassId);
- const uptr BytesInFreeList =
- Region->MemMapInfo.AllocatedUser - (Region->FreeListInfo.PoppedBlocks -
- Region->FreeListInfo.PushedBlocks) *
- BlockSize;
- if (UNLIKELY(BytesInFreeList == 0))
- return false;
-
const uptr AllocatedUserEnd =
Region->MemMapInfo.AllocatedUser + Region->RegionBeg;
- const uptr CompactPtrBase = getCompactPtrBaseByClassId(ClassId);
- // ====================================================================== //
- // 1. Check if we have enough free blocks and if it's worth doing a page
- // release.
- // ====================================================================== //
- if (ReleaseType != ReleaseToOS::ForceAll &&
- !hasChanceToReleasePages(Region, BlockSize, BytesInFreeList,
- ReleaseType)) {
- return 0;
- }
-
- // ====================================================================== //
- // 2. Determine which groups can release the pages. Use a heuristic to
- // gather groups that are candidates for doing a release.
- // ====================================================================== //
- SinglyLinkedList<BatchGroup> GroupsToRelease;
- if (ReleaseType == ReleaseToOS::ForceAll) {
+ SinglyLinkedList<BatchGroupT> GroupsToRelease;
+ {
+ ScopedLock L(Region->FLLock);
GroupsToRelease = Region->FreeListInfo.BlockList;
Region->FreeListInfo.BlockList.clear();
- } else {
- GroupsToRelease = collectGroupsToRelease(
- Region, BlockSize, AllocatedUserEnd, CompactPtrBase);
}
- if (GroupsToRelease.empty())
- return 0;
- // Ideally, we should use a class like `ScopedUnlock`. However, this form of
- // unlocking is not supported by the thread-safety analysis. See
- // https://clang.llvm.org/docs/ThreadSafetyAnalysis.html#no-alias-analysis
- // for more details.
- // Put it as local class so that we can mark the ctor/dtor with proper
- // annotations associated to the target lock. Note that accessing the
- // function variable in local class only works in thread-safety annotations.
- // TODO: Implement general `ScopedUnlock` when it's supported.
- class FLLockScopedUnlock {
- public:
- FLLockScopedUnlock(RegionInfo *Region) RELEASE(Region->FLLock)
- : R(Region) {
- R->FLLock.assertHeld();
- R->FLLock.unlock();
- }
- ~FLLockScopedUnlock() ACQUIRE(Region->FLLock) { R->FLLock.lock(); }
+ FragmentationRecorder Recorder;
+ if (!GroupsToRelease.empty()) {
+ PageReleaseContext Context =
+ markFreeBlocks(Region, BlockSize, AllocatedUserEnd,
+ getCompactPtrBaseByClassId(ClassId), GroupsToRelease);
+ auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
+ releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
- private:
- RegionInfo *R;
- };
+ mergeGroupsToReleaseBack(Region, GroupsToRelease);
+ }
- // Note that we have extracted the `GroupsToRelease` from region freelist.
- // It's safe to let pushBlocks()/popBatches() access the remaining region
- // freelist. In the steps 3 and 4, we will temporarily release the FLLock
- // and lock it again before step 5.
+ ScopedLock L(Region->FLLock);
+ const uptr PageSize = getPageSizeCached();
+ const uptr TotalBlocks = Region->MemMapInfo.AllocatedUser / BlockSize;
+ const uptr InUseBlocks =
+ Region->FreeListInfo.PoppedBlocks - Region->FreeListInfo.PushedBlocks;
+ const uptr AllocatedPagesCount =
+ roundUp(Region->MemMapInfo.AllocatedUser, PageSize) / PageSize;
+ DCHECK_GE(AllocatedPagesCount, Recorder.getReleasedPagesCount());
+ const uptr InUsePages =
+ AllocatedPagesCount - Recorder.getReleasedPagesCount();
+ const uptr InUseBytes = InUsePages * PageSize;
+
+ Str->append(" %02zu (%6zu): inuse/total blocks: %6zu/%6zu inuse/total "
+ "pages: %6zu/%6zu inuse bytes: %6zuK\n",
+ ClassId, BlockSize, InUseBlocks, TotalBlocks, InUsePages,
+ AllocatedPagesCount, InUseBytes >> 10);
+ }
+
+ NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId,
+ ReleaseToOS ReleaseType = ReleaseToOS::Normal)
+ REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
+ const uptr BlockSize = getSizeByClassId(ClassId);
+ uptr BytesInFreeList;
+ const uptr AllocatedUserEnd =
+ Region->MemMapInfo.AllocatedUser + Region->RegionBeg;
+ SinglyLinkedList<BatchGroupT> GroupsToRelease;
- uptr ReleasedBytes = 0;
{
- FLLockScopedUnlock UL(Region);
+ ScopedLock L(Region->FLLock);
+
+ BytesInFreeList = Region->MemMapInfo.AllocatedUser -
+ (Region->FreeListInfo.PoppedBlocks -
+ Region->FreeListInfo.PushedBlocks) *
+ BlockSize;
+ if (UNLIKELY(BytesInFreeList == 0))
+ return 0;
+
// ==================================================================== //
- // 3. Mark the free blocks in `GroupsToRelease` in the
- // `PageReleaseContext`. Then we can tell which pages are in-use by
- // querying `PageReleaseContext`.
+ // 1. Check if we have enough free blocks and if it's worth doing a page
+ // release.
// ==================================================================== //
- PageReleaseContext Context = markFreeBlocks(
- Region, BlockSize, AllocatedUserEnd, CompactPtrBase, GroupsToRelease);
- if (UNLIKELY(!Context.hasBlockMarked())) {
- ScopedLock L(Region->FLLock);
- mergeGroupsToReleaseBack(Region, GroupsToRelease);
+ if (ReleaseType != ReleaseToOS::ForceAll &&
+ !hasChanceToReleasePages(Region, BlockSize, BytesInFreeList,
+ ReleaseType)) {
return 0;
}
// ==================================================================== //
- // 4. Release the unused physical pages back to the OS.
+ // 2. Determine which groups can release the pages. Use a heuristic to
+ // gather groups that are candidates for doing a release.
// ==================================================================== //
- RegionReleaseRecorder<MemMapT> Recorder(&Region->MemMapInfo.MemMap,
- Region->RegionBeg,
- Context.getReleaseOffset());
- auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
- releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
- if (Recorder.getReleasedRangesCount() > 0) {
- Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
- Region->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
- Region->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
+ if (ReleaseType == ReleaseToOS::ForceAll) {
+ GroupsToRelease = Region->FreeListInfo.BlockList;
+ Region->FreeListInfo.BlockList.clear();
+ } else {
+ GroupsToRelease =
+ collectGroupsToRelease(Region, BlockSize, AllocatedUserEnd,
+ getCompactPtrBaseByClassId(ClassId));
}
- Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast();
- ReleasedBytes = Recorder.getReleasedBytes();
+ if (GroupsToRelease.empty())
+ return 0;
}
+ // Note that we have extracted the `GroupsToRelease` from the region
+ // freelist. It's safe to let pushBlocks()/popBatches() access the remaining
+ // region freelist. In steps 3 and 4, we will temporarily release the FLLock
+ // and lock it again before step 5.
+
+ // ==================================================================== //
+ // 3. Mark the free blocks in `GroupsToRelease` in the `PageReleaseContext`.
+ // Then we can tell which pages are in use by querying
+ // `PageReleaseContext`.
+ // ==================================================================== //
+ PageReleaseContext Context =
+ markFreeBlocks(Region, BlockSize, AllocatedUserEnd,
+ getCompactPtrBaseByClassId(ClassId), GroupsToRelease);
+ if (UNLIKELY(!Context.hasBlockMarked())) {
+ mergeGroupsToReleaseBack(Region, GroupsToRelease);
+ return 0;
+ }
+
+ // ==================================================================== //
+ // 4. Release the unused physical pages back to the OS.
+ // ==================================================================== //
+ RegionReleaseRecorder<MemMapT> Recorder(&Region->MemMapInfo.MemMap,
+ Region->RegionBeg,
+ Context.getReleaseOffset());
+ auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
+ releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+ if (Recorder.getReleasedRangesCount() > 0) {
+ Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
+ Region->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
+ Region->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
+ }
+ Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast();
+
// ====================================================================== //
// 5. Merge the `GroupsToRelease` back to the freelist.
// ====================================================================== //
mergeGroupsToReleaseBack(Region, GroupsToRelease);
- return ReleasedBytes;
+ return Recorder.getReleasedBytes();
}
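
The restructured releaseToOSMaybe() above tightens the lock discipline: FLLock is held only for the bookkeeping phases (steps 1-2), dropped for the marking and release work (steps 3-4), and re-acquired inside mergeGroupsToReleaseBack() for step 5. A schematic sketch of that phase structure, with std::mutex standing in for scudo's HybridMutex and all names illustrative:

#include <mutex>

std::mutex MMLock, FLLock;

unsigned long releaseToOSMaybeSketch() {
  // In the real code the caller already holds MMLock (REQUIRES annotation);
  // the sketch takes it itself for self-containment.
  std::lock_guard<std::mutex> MM(MMLock);
  {
    std::lock_guard<std::mutex> FL(FLLock);
    // Steps 1-2: check release thresholds and detach candidate groups from
    // the freelist into GroupsToRelease.
  }
  // Steps 3-4: mark free blocks and release pages with FLLock dropped, so
  // concurrent pushBlocks()/popBatches() can keep using the freelist.
  {
    std::lock_guard<std::mutex> FL(FLLock);
    // Step 5: merge the detached groups back; in the patch this lock is
    // taken by mergeGroupsToReleaseBack() itself.
  }
  return 0; // the real function returns Recorder.getReleasedBytes()
}
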
bool hasChanceToReleasePages(RegionInfo *Region, uptr BlockSize,
@@ -1154,13 +1286,13 @@ private:
return true;
}
- SinglyLinkedList<BatchGroup>
+ SinglyLinkedList<BatchGroupT>
collectGroupsToRelease(RegionInfo *Region, const uptr BlockSize,
const uptr AllocatedUserEnd, const uptr CompactPtrBase)
REQUIRES(Region->MMLock, Region->FLLock) {
- const uptr GroupSize = (1U << GroupSizeLog);
+ const uptr GroupSize = (1UL << GroupSizeLog);
const uptr PageSize = getPageSizeCached();
- SinglyLinkedList<BatchGroup> GroupsToRelease;
+ SinglyLinkedList<BatchGroupT> GroupsToRelease;
// We are examining each group and will take the minimum distance to the
// release threshold as the next Region::TryReleaseThreshold(). Note that if
@@ -1169,8 +1301,8 @@ private:
// the comment on `SmallerBlockReleasePageDelta` for more details.
uptr MinDistToThreshold = GroupSize;
- for (BatchGroup *BG = Region->FreeListInfo.BlockList.front(),
- *Prev = nullptr;
+ for (BatchGroupT *BG = Region->FreeListInfo.BlockList.front(),
+ *Prev = nullptr;
BG != nullptr;) {
// Group boundary is always GroupSize-aligned from CompactPtr base. The
// layout of memory groups is like,
@@ -1254,7 +1386,7 @@ private:
}
}
- // If `BG` is the first BatchGroup in the list, we only need to advance
+ // If `BG` is the first BatchGroupT in the list, we only need to advance
// `BG` and call FreeListInfo.BlockList::pop_front(). No update is needed
// for `Prev`.
//
@@ -1290,7 +1422,7 @@ private:
// Note that we need to advance before pushing this BatchGroup to
// GroupsToRelease because it's a destructive operation.
- BatchGroup *Cur = BG;
+ BatchGroupT *Cur = BG;
BG = BG->Next;
// Ideally, we may want to update this only after successful release.
@@ -1323,9 +1455,9 @@ private:
PageReleaseContext
markFreeBlocks(RegionInfo *Region, const uptr BlockSize,
const uptr AllocatedUserEnd, const uptr CompactPtrBase,
- SinglyLinkedList<BatchGroup> &GroupsToRelease)
+ SinglyLinkedList<BatchGroupT> &GroupsToRelease)
REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
- const uptr GroupSize = (1U << GroupSizeLog);
+ const uptr GroupSize = (1UL << GroupSizeLog);
auto DecompactPtr = [CompactPtrBase](CompactPtrT CompactPtr) {
return decompactPtrInternal(CompactPtrBase, CompactPtr);
};
@@ -1352,7 +1484,7 @@ private:
if (UNLIKELY(!Context.ensurePageMapAllocated()))
return Context;
- for (BatchGroup &BG : GroupsToRelease) {
+ for (BatchGroupT &BG : GroupsToRelease) {
const uptr BatchGroupBase =
decompactGroupBase(CompactPtrBase, BG.CompactPtrGroupBase);
const uptr BatchGroupEnd = BatchGroupBase + GroupSize;
@@ -1400,8 +1532,10 @@ private:
}
void mergeGroupsToReleaseBack(RegionInfo *Region,
- SinglyLinkedList<BatchGroup> &GroupsToRelease)
- REQUIRES(Region->MMLock, Region->FLLock) {
+ SinglyLinkedList<BatchGroupT> &GroupsToRelease)
+ REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
+ ScopedLock L(Region->FLLock);
+
// After merging two freelists, we may have redundant `BatchGroup`s that
// need to be recycled. The number of unused `BatchGroup`s is expected to be
// small. Pick a constant which is inferred from real programs.
@@ -1419,8 +1553,8 @@ private:
// Merge GroupsToRelease back to the Region::FreeListInfo.BlockList. Note
// that both `Region->FreeListInfo.BlockList` and `GroupsToRelease` are
// sorted.
- for (BatchGroup *BG = Region->FreeListInfo.BlockList.front(),
- *Prev = nullptr;
+ for (BatchGroupT *BG = Region->FreeListInfo.BlockList.front(),
+ *Prev = nullptr;
;) {
if (BG == nullptr || GroupsToRelease.empty()) {
if (!GroupsToRelease.empty())
@@ -1437,8 +1571,8 @@ private:
continue;
}
- BatchGroup *Cur = GroupsToRelease.front();
- TransferBatch *UnusedTransferBatch = nullptr;
+ BatchGroupT *Cur = GroupsToRelease.front();
+ TransferBatchT *UnusedTransferBatch = nullptr;
GroupsToRelease.pop_front();
if (BG->CompactPtrGroupBase == Cur->CompactPtrGroupBase) {
@@ -1454,7 +1588,7 @@ private:
if (Cur->Batches.front()->getCount() == MaxCachedPerBatch) {
BG->Batches.append_back(&Cur->Batches);
} else {
- TransferBatch *NonFullBatch = Cur->Batches.front();
+ TransferBatchT *NonFullBatch = Cur->Batches.front();
Cur->Batches.pop_front();
const u16 NonFullBatchCount = NonFullBatch->getCount();
// The remaining Batches in `Cur` are full.
@@ -1481,6 +1615,8 @@ private:
if (UNLIKELY(Idx + NeededSlots > MaxUnusedSize)) {
ScopedLock L(BatchClassRegion->FLLock);
pushBatchClassBlocks(BatchClassRegion, Blocks, Idx);
+ if (conditionVariableEnabled())
+ BatchClassRegion->FLLockCV.notifyAll(BatchClassRegion->FLLock);
Idx = 0;
}
Blocks[Idx++] =
@@ -1516,15 +1652,20 @@ private:
if (Idx != 0) {
ScopedLock L(BatchClassRegion->FLLock);
pushBatchClassBlocks(BatchClassRegion, Blocks, Idx);
+ if (conditionVariableEnabled())
+ BatchClassRegion->FLLockCV.notifyAll(BatchClassRegion->FLLock);
}
if (SCUDO_DEBUG) {
- BatchGroup *Prev = Region->FreeListInfo.BlockList.front();
- for (BatchGroup *Cur = Prev->Next; Cur != nullptr;
+ BatchGroupT *Prev = Region->FreeListInfo.BlockList.front();
+ for (BatchGroupT *Cur = Prev->Next; Cur != nullptr;
Prev = Cur, Cur = Cur->Next) {
CHECK_LT(Prev->CompactPtrGroupBase, Cur->CompactPtrGroupBase);
}
}
+
+ if (conditionVariableEnabled())
+ Region->FLLockCV.notifyAll(Region->FLLock);
}
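
The notifyAll() calls added above pair with threads that can block on a freelist that is temporarily empty while a release is in flight (see the condition_variable files in this patch's diffstat). A rough standalone analogy of the FLLock/FLLockCV pairing, using standard-library primitives rather than scudo's:

#include <condition_variable>
#include <deque>
#include <mutex>

std::mutex FLLock;
std::condition_variable FLLockCV;
std::deque<int> FreeList;

// Producer: push under the lock, then wake every waiter.
void pushBlocks(int Block) {
  {
    std::lock_guard<std::mutex> L(FLLock);
    FreeList.push_back(Block);
  }
  FLLockCV.notify_all();
}

// Consumer: re-check the freelist after each wakeup before popping.
int popBlockBlocking() {
  std::unique_lock<std::mutex> L(FLLock);
  FLLockCV.wait(L, [] { return !FreeList.empty(); });
  int Block = FreeList.front();
  FreeList.pop_front();
  return Block;
}
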
// TODO: `PrimaryBase` can be obtained from ReservedMemory. This needs to be
diff --git a/compiler-rt/lib/scudo/standalone/release.cpp b/compiler-rt/lib/scudo/standalone/release.cpp
index 938bb41faf69..875a2b0c1c57 100644
--- a/compiler-rt/lib/scudo/standalone/release.cpp
+++ b/compiler-rt/lib/scudo/standalone/release.cpp
@@ -10,7 +10,8 @@
namespace scudo {
-BufferPool<RegionPageMap::StaticBufferCount, RegionPageMap::StaticBufferSize>
+BufferPool<RegionPageMap::StaticBufferCount,
+ RegionPageMap::StaticBufferNumElements>
RegionPageMap::Buffers;
} // namespace scudo
diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h
index 5bf963d0f26f..b6f76a4d2058 100644
--- a/compiler-rt/lib/scudo/standalone/release.h
+++ b/compiler-rt/lib/scudo/standalone/release.h
@@ -80,20 +80,53 @@ private:
MapPlatformData *Data = nullptr;
};
-// A buffer pool which holds a fixed number of static buffers for fast buffer
-// allocation. If the request size is greater than `StaticBufferSize`, it'll
+class FragmentationRecorder {
+public:
+ FragmentationRecorder() = default;
+
+ uptr getReleasedPagesCount() const { return ReleasedPagesCount; }
+
+ void releasePageRangeToOS(uptr From, uptr To) {
+ DCHECK_EQ((To - From) % getPageSizeCached(), 0U);
+ ReleasedPagesCount += (To - From) / getPageSizeCached();
+ }
+
+private:
+ uptr ReleasedPagesCount = 0;
+};
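
FragmentationRecorder turns the release machinery into a dry run: it exposes the same releasePageRangeToOS() callback as the real recorders but only counts pages instead of unmapping anything, which is what the per-class fragmentation stats above are built on. A minimal sketch of that recorder shape (the type name and fixed PageSize are illustrative):

#include <cassert>
#include <cstdint>

constexpr uint64_t PageSize = 4096; // stand-in for getPageSizeCached()

struct CountingRecorder {
  uint64_t ReleasedPages = 0;
  void releasePageRangeToOS(uint64_t From, uint64_t To) {
    assert((To - From) % PageSize == 0);
    ReleasedPages += (To - From) / PageSize; // count only, nothing is freed
  }
};

int main() {
  CountingRecorder Recorder;
  Recorder.releasePageRangeToOS(0, 3 * PageSize);
  Recorder.releasePageRangeToOS(8 * PageSize, 10 * PageSize);
  assert(Recorder.ReleasedPages == 5);
}
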
+
+// A buffer pool which holds a fixed number of static buffers of `uptr` elements
+// for fast buffer allocation. If the request size is greater than
+// `StaticBufferNumElements` or if all the static buffers are in use, it'll
// delegate the allocation to map().
-template <uptr StaticBufferCount, uptr StaticBufferSize> class BufferPool {
+template <uptr StaticBufferCount, uptr StaticBufferNumElements>
+class BufferPool {
public:
// Preserve 1 bit in the `Mask` so that we don't need to do zero-check while
// extracting the least significant bit from the `Mask`.
static_assert(StaticBufferCount < SCUDO_WORDSIZE, "");
- static_assert(isAligned(StaticBufferSize, SCUDO_CACHE_LINE_SIZE), "");
+ static_assert(isAligned(StaticBufferNumElements * sizeof(uptr),
+ SCUDO_CACHE_LINE_SIZE),
+ "");
+
+ struct Buffer {
+ // Pointer to the buffer's memory, or nullptr if no buffer was allocated.
+ uptr *Data = nullptr;
+
+ // The index of the underlying static buffer, or StaticBufferCount if this
+ // buffer was dynamically allocated. This value is initially set to a poison
+ // value to aid debugging.
+ uptr BufferIndex = ~static_cast<uptr>(0);
- // Return a buffer which is at least `BufferSize`.
- uptr *getBuffer(const uptr BufferSize) {
- if (UNLIKELY(BufferSize > StaticBufferSize))
- return getDynamicBuffer(BufferSize);
+ // Only valid if BufferIndex == StaticBufferCount.
+ MemMapT MemMap = {};
+ };
+
+ // Return a zero-initialized buffer which can contain at least the given
+ // number of elements, or a buffer whose Data is null on failure.
+ Buffer getBuffer(const uptr NumElements) {
+ if (UNLIKELY(NumElements > StaticBufferNumElements))
+ return getDynamicBuffer(NumElements);
uptr index;
{
@@ -108,69 +141,55 @@ public:
}
if (index >= StaticBufferCount)
- return getDynamicBuffer(BufferSize);
+ return getDynamicBuffer(NumElements);
- const uptr Offset = index * StaticBufferSize;
- memset(&RawBuffer[Offset], 0, StaticBufferSize);
- return &RawBuffer[Offset];
+ Buffer Buf;
+ Buf.Data = &RawBuffer[index * StaticBufferNumElements];
+ Buf.BufferIndex = index;
+ memset(Buf.Data, 0, StaticBufferNumElements * sizeof(uptr));
+ return Buf;
}
- void releaseBuffer(uptr *Buffer, const uptr BufferSize) {
- const uptr index = getStaticBufferIndex(Buffer, BufferSize);
- if (index < StaticBufferCount) {
+ void releaseBuffer(Buffer Buf) {
+ DCHECK_NE(Buf.Data, nullptr);
+ DCHECK_LE(Buf.BufferIndex, StaticBufferCount);
+ if (Buf.BufferIndex != StaticBufferCount) {
ScopedLock L(Mutex);
- DCHECK_EQ((Mask & (static_cast<uptr>(1) << index)), 0U);
- Mask |= static_cast<uptr>(1) << index;
+ DCHECK_EQ((Mask & (static_cast<uptr>(1) << Buf.BufferIndex)), 0U);
+ Mask |= static_cast<uptr>(1) << Buf.BufferIndex;
} else {
- unmap(reinterpret_cast<void *>(Buffer),
- roundUp(BufferSize, getPageSizeCached()));
+ Buf.MemMap.unmap(Buf.MemMap.getBase(), Buf.MemMap.getCapacity());
}
}
- bool isStaticBufferTestOnly(uptr *Buffer, uptr BufferSize) {
- return getStaticBufferIndex(Buffer, BufferSize) < StaticBufferCount;
+ bool isStaticBufferTestOnly(const Buffer &Buf) {
+ DCHECK_NE(Buf.Data, nullptr);
+ DCHECK_LE(Buf.BufferIndex, StaticBufferCount);
+ return Buf.BufferIndex != StaticBufferCount;
}
private:
- uptr getStaticBufferIndex(uptr *Buffer, uptr BufferSize) {
- if (UNLIKELY(BufferSize > StaticBufferSize))
- return StaticBufferCount;
-
- const uptr BufferBase = reinterpret_cast<uptr>(Buffer);
- const uptr RawBufferBase = reinterpret_cast<uptr>(RawBuffer);
-
- if (BufferBase < RawBufferBase ||
- BufferBase >= RawBufferBase + sizeof(RawBuffer)) {
- return StaticBufferCount;
- }
-
- DCHECK_LE(BufferSize, StaticBufferSize);
- DCHECK_LE(BufferBase + BufferSize, RawBufferBase + sizeof(RawBuffer));
- DCHECK_EQ((BufferBase - RawBufferBase) % StaticBufferSize, 0U);
-
- const uptr index =
- (BufferBase - RawBufferBase) / (StaticBufferSize * sizeof(uptr));
- DCHECK_LT(index, StaticBufferCount);
- return index;
- }
-
- uptr *getDynamicBuffer(const uptr BufferSize) {
+ Buffer getDynamicBuffer(const uptr NumElements) {
// When using a heap-based buffer, precommit the pages backing the
// Vmar by passing |MAP_PRECOMMIT| flag. This allows an optimization
// where page fault exceptions are skipped as the allocated memory
// is accessed. So far, this is only enabled on Fuchsia. It hasn't proven a
// performance benefit on other platforms.
const uptr MmapFlags = MAP_ALLOWNOMEM | (SCUDO_FUCHSIA ? MAP_PRECOMMIT : 0);
- return reinterpret_cast<uptr *>(
- map(nullptr, roundUp(BufferSize, getPageSizeCached()), "scudo:counters",
- MmapFlags, &MapData));
+ const uptr MappedSize =
+ roundUp(NumElements * sizeof(uptr), getPageSizeCached());
+ Buffer Buf;
+ if (Buf.MemMap.map(/*Addr=*/0, MappedSize, "scudo:counters", MmapFlags)) {
+ Buf.Data = reinterpret_cast<uptr *>(Buf.MemMap.getBase());
+ Buf.BufferIndex = StaticBufferCount;
+ }
+ return Buf;
}
HybridMutex Mutex;
// '1' means that buffer index is not used. '0' means the buffer is in use.
uptr Mask GUARDED_BY(Mutex) = ~static_cast<uptr>(0);
- uptr RawBuffer[StaticBufferCount * StaticBufferSize] GUARDED_BY(Mutex);
- [[no_unique_address]] MapPlatformData MapData = {};
+ uptr RawBuffer[StaticBufferCount * StaticBufferNumElements] GUARDED_BY(Mutex);
};
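
The pool's free-slot bookkeeping is a single word: a '1' bit marks a free buffer, and because StaticBufferCount < SCUDO_WORDSIZE at least one high bit always stays set, so the least-significant-bit scan never runs on an all-zero mask (the "preserve 1 bit" note above). A standalone sketch of that trick, with __builtin_ctzll standing in for the bit-scan helper the pool uses:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Mask = ~uint64_t(0); // every slot starts out free

  auto claim = [&Mask] {
    unsigned Index = static_cast<unsigned>(__builtin_ctzll(Mask));
    Mask &= Mask - 1; // clear the claimed bit
    return Index;
  };

  assert(claim() == 0);
  assert(claim() == 1);     // next lowest free slot

  Mask |= uint64_t(1) << 0; // release slot 0
  assert(claim() == 0);     // it is handed out again first
}
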
// A Region page map is used to record the usage of pages in the regions. It
@@ -185,23 +204,17 @@ private:
class RegionPageMap {
public:
RegionPageMap()
- : Regions(0),
- NumCounters(0),
- CounterSizeBitsLog(0),
- CounterMask(0),
- PackingRatioLog(0),
- BitOffsetMask(0),
- SizePerRegion(0),
- BufferSize(0),
- Buffer(nullptr) {}
+ : Regions(0), NumCounters(0), CounterSizeBitsLog(0), CounterMask(0),
+ PackingRatioLog(0), BitOffsetMask(0), SizePerRegion(0),
+ BufferNumElements(0) {}
RegionPageMap(uptr NumberOfRegions, uptr CountersPerRegion, uptr MaxValue) {
reset(NumberOfRegions, CountersPerRegion, MaxValue);
}
~RegionPageMap() {
if (!isAllocated())
return;
- Buffers.releaseBuffer(Buffer, BufferSize);
- Buffer = nullptr;
+ Buffers.releaseBuffer(Buffer);
+ Buffer = {};
}
// Lock of `StaticBuffer` is acquired conditionally and there's no easy way to
@@ -216,7 +229,7 @@ public:
Regions = NumberOfRegion;
NumCounters = CountersPerRegion;
- constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL;
+ constexpr uptr MaxCounterBits = sizeof(*Buffer.Data) * 8UL;
// Rounding counter storage size up to the power of two allows for using
// bit shifts calculating particular counter's Index and offset.
const uptr CounterSizeBits =
@@ -233,11 +246,11 @@ public:
SizePerRegion =
roundUp(NumCounters, static_cast<uptr>(1U) << PackingRatioLog) >>
PackingRatioLog;
- BufferSize = SizePerRegion * sizeof(*Buffer) * Regions;
- Buffer = Buffers.getBuffer(BufferSize);
+ BufferNumElements = SizePerRegion * Regions;
+ Buffer = Buffers.getBuffer(BufferNumElements);
}
- bool isAllocated() const { return !!Buffer; }
+ bool isAllocated() const { return Buffer.Data != nullptr; }
uptr getCount() const { return NumCounters; }
@@ -246,7 +259,8 @@ public:
DCHECK_LT(I, NumCounters);
const uptr Index = I >> PackingRatioLog;
const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
- return (Buffer[Region * SizePerRegion + Index] >> BitOffset) & CounterMask;
+ return (Buffer.Data[Region * SizePerRegion + Index] >> BitOffset) &
+ CounterMask;
}
void inc(uptr Region, uptr I) const {
@@ -255,8 +269,8 @@ public:
const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
DCHECK_EQ(isAllCounted(Region, I), false);
- Buffer[Region * SizePerRegion + Index] += static_cast<uptr>(1U)
- << BitOffset;
+ Buffer.Data[Region * SizePerRegion + Index] += static_cast<uptr>(1U)
+ << BitOffset;
}
void incN(uptr Region, uptr I, uptr N) const {
@@ -267,7 +281,7 @@ public:
const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
DCHECK_EQ(isAllCounted(Region, I), false);
- Buffer[Region * SizePerRegion + Index] += N << BitOffset;
+ Buffer.Data[Region * SizePerRegion + Index] += N << BitOffset;
}
void incRange(uptr Region, uptr From, uptr To) const {
@@ -286,7 +300,7 @@ public:
const uptr Index = I >> PackingRatioLog;
const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
- Buffer[Region * SizePerRegion + Index] |= CounterMask << BitOffset;
+ Buffer.Data[Region * SizePerRegion + Index] |= CounterMask << BitOffset;
}
void setAsAllCountedRange(uptr Region, uptr From, uptr To) const {
DCHECK_LE(From, To);
@@ -309,9 +323,16 @@ public:
return get(Region, I) == CounterMask;
}
- uptr getBufferSize() const { return BufferSize; }
+ uptr getBufferNumElements() const { return BufferNumElements; }
private:
+ // We may consider making this configurable if there are cases which may
+ // benefit from this.
+ static const uptr StaticBufferCount = 2U;
+ static const uptr StaticBufferNumElements = 512U;
+ using BufferPoolT = BufferPool<StaticBufferCount, StaticBufferNumElements>;
+ static BufferPoolT Buffers;
+
uptr Regions;
uptr NumCounters;
uptr CounterSizeBitsLog;
@@ -320,14 +341,8 @@ private:
uptr BitOffsetMask;
uptr SizePerRegion;
- uptr BufferSize;
- uptr *Buffer;
-
- // We may consider making this configurable if there are cases which may
- // benefit from this.
- static const uptr StaticBufferCount = 2U;
- static const uptr StaticBufferSize = 512U;
- static BufferPool<StaticBufferCount, StaticBufferSize> Buffers;
+ uptr BufferNumElements;
+ BufferPoolT::Buffer Buffer;
};
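
A worked example of the packed-counter arithmetic behind get()/inc() above: with MaxValue = 3 each counter needs 2 bits, so one 64-bit word packs 32 counters, PackingRatioLog is 5, and the in-word bit offset is the low index bits shifted by CounterSizeBitsLog. All constants below are illustrative:

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint64_t CounterSizeBitsLog = 1;                   // 2-bit counters
  constexpr uint64_t CounterMask = 3;                          // 2^2 - 1
  constexpr uint64_t PackingRatioLog = 6 - CounterSizeBitsLog; // log2(64/2)
  constexpr uint64_t BitOffsetMask = (uint64_t(1) << PackingRatioLog) - 1;

  uint64_t Buffer[1] = {}; // one word holds counters 0..31
  auto inc = [&](uint64_t I) {
    const uint64_t Index = I >> PackingRatioLog; // which word
    const uint64_t BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
    Buffer[Index] += uint64_t(1) << BitOffset;
  };
  auto get = [&](uint64_t I) {
    const uint64_t Index = I >> PackingRatioLog;
    const uint64_t BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
    return (Buffer[Index] >> BitOffset) & CounterMask;
  };

  inc(7);
  inc(7);
  assert(get(7) == 2 && get(6) == 0);
}
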
template <class ReleaseRecorderT> class FreePagesRangeTracker {
diff --git a/compiler-rt/lib/scudo/standalone/report.cpp b/compiler-rt/lib/scudo/standalone/report.cpp
index 81b3dce4e02c..9cef0adc0bb3 100644
--- a/compiler-rt/lib/scudo/standalone/report.cpp
+++ b/compiler-rt/lib/scudo/standalone/report.cpp
@@ -24,11 +24,7 @@ public:
Message.vappend(Format, Args);
va_end(Args);
}
- NORETURN ~ScopedErrorReport() {
- outputRaw(Message.data());
- setAbortMessage(Message.data());
- die();
- }
+ NORETURN ~ScopedErrorReport() { reportRawError(Message.data()); }
private:
ScopedString Message;
@@ -36,18 +32,6 @@ private:
inline void NORETURN trap() { __builtin_trap(); }
-void NORETURN reportSoftRSSLimit(uptr RssLimitMb) {
- ScopedErrorReport Report;
- Report.append("Soft RSS limit of %zu MB exhausted, current RSS is %zu MB\n",
- RssLimitMb, GetRSS() >> 20);
-}
-
-void NORETURN reportHardRSSLimit(uptr RssLimitMb) {
- ScopedErrorReport Report;
- Report.append("Hard RSS limit of %zu MB exhausted, current RSS is %zu MB\n",
- RssLimitMb, GetRSS() >> 20);
-}
-
// This could potentially be called recursively if a CHECK fails in the reports.
void NORETURN reportCheckFailed(const char *File, int Line,
const char *Condition, u64 Value1, u64 Value2) {
@@ -67,6 +51,13 @@ void NORETURN reportError(const char *Message) {
Report.append("%s\n", Message);
}
+// Generic fatal error message without ScopedString.
+void NORETURN reportRawError(const char *Message) {
+ outputRaw(Message);
+ setAbortMessage(Message);
+ die();
+}
+
void NORETURN reportInvalidFlag(const char *FlagType, const char *Value) {
ScopedErrorReport Report;
Report.append("invalid value for %s option: '%s'\n", FlagType, Value);
@@ -79,14 +70,6 @@ void NORETURN reportHeaderCorruption(void *Ptr) {
Report.append("corrupted chunk header at address %p\n", Ptr);
}
-// Two threads have attempted to modify a chunk header at the same time. This is
-// symptomatic of a race-condition in the application code, or general lack of
-// proper locking.
-void NORETURN reportHeaderRace(void *Ptr) {
- ScopedErrorReport Report;
- Report.append("race on chunk header at address %p\n", Ptr);
-}
-
// The allocator was compiled with parameters that conflict with field size
// requirements.
void NORETURN reportSanityCheckError(const char *Field) {
diff --git a/compiler-rt/lib/scudo/standalone/report.h b/compiler-rt/lib/scudo/standalone/report.h
index 3a78ab64b13f..a510fdaebb6d 100644
--- a/compiler-rt/lib/scudo/standalone/report.h
+++ b/compiler-rt/lib/scudo/standalone/report.h
@@ -15,15 +15,17 @@ namespace scudo {
// Reports are *fatal* unless stated otherwise.
-// Generic error.
+// Generic error, adds newline to end of message.
void NORETURN reportError(const char *Message);
+// Generic error, but the message is not modified.
+void NORETURN reportRawError(const char *Message);
+
// Flags related errors.
void NORETURN reportInvalidFlag(const char *FlagType, const char *Value);
// Chunk header related errors.
void NORETURN reportHeaderCorruption(void *Ptr);
-void NORETURN reportHeaderRace(void *Ptr);
// Sanity checks related error.
void NORETURN reportSanityCheckError(const char *Field);
@@ -34,8 +36,6 @@ void NORETURN reportAllocationSizeTooBig(uptr UserSize, uptr TotalSize,
uptr MaxSize);
void NORETURN reportOutOfBatchClass();
void NORETURN reportOutOfMemory(uptr RequestedSize);
-void NORETURN reportSoftRSSLimit(uptr RssLimitMb);
-void NORETURN reportHardRSSLimit(uptr RssLimitMb);
enum class AllocatorAction : u8 {
Recycling,
Deallocating,
diff --git a/compiler-rt/lib/scudo/standalone/report_linux.cpp b/compiler-rt/lib/scudo/standalone/report_linux.cpp
new file mode 100644
index 000000000000..6a983036e6cd
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/report_linux.cpp
@@ -0,0 +1,58 @@
+//===-- report_linux.cpp ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "platform.h"
+
+#if SCUDO_LINUX || SCUDO_TRUSTY
+
+#include "common.h"
+#include "internal_defs.h"
+#include "report.h"
+#include "report_linux.h"
+#include "string_utils.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace scudo {
+
+// Fatal internal map() error (potentially OOM related).
+void NORETURN reportMapError(uptr SizeIfOOM) {
+ char Error[128] = "Scudo ERROR: internal map failure\n";
+ if (SizeIfOOM) {
+ formatString(
+ Error, sizeof(Error),
+ "Scudo ERROR: internal map failure (NO MEMORY) requesting %zuKB\n",
+ SizeIfOOM >> 10);
+ }
+ reportRawError(Error);
+}
+
+void NORETURN reportUnmapError(uptr Addr, uptr Size) {
+ char Error[128];
+ formatString(Error, sizeof(Error),
+ "Scudo ERROR: internal unmap failure (error desc=%s) Addr 0x%zx "
+ "Size %zu\n",
+ strerror(errno), Addr, Size);
+ reportRawError(Error);
+}
+
+void NORETURN reportProtectError(uptr Addr, uptr Size, int Prot) {
+ char Error[128];
+ formatString(
+ Error, sizeof(Error),
+ "Scudo ERROR: internal protect failure (error desc=%s) Addr 0x%zx "
+ "Size %zu Prot %x\n",
+ strerror(errno), Addr, Size, Prot);
+ reportRawError(Error);
+}
+
+} // namespace scudo
+
+#endif // SCUDO_LINUX || SCUDO_TRUSTY
diff --git a/compiler-rt/lib/scudo/standalone/report_linux.h b/compiler-rt/lib/scudo/standalone/report_linux.h
new file mode 100644
index 000000000000..aa0bb247e672
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/report_linux.h
@@ -0,0 +1,34 @@
+//===-- report_linux.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_REPORT_LINUX_H_
+#define SCUDO_REPORT_LINUX_H_
+
+#include "platform.h"
+
+#if SCUDO_LINUX || SCUDO_TRUSTY
+
+#include "internal_defs.h"
+
+namespace scudo {
+
+// Report a fatal error when a map call fails. SizeIfOOM shall
+// hold the requested size on an out-of-memory error, 0 otherwise.
+void NORETURN reportMapError(uptr SizeIfOOM = 0);
+
+// Report a fatal error when an unmap call fails.
+void NORETURN reportUnmapError(uptr Addr, uptr Size);
+
+// Report a fatal error when an mprotect call fails.
+void NORETURN reportProtectError(uptr Addr, uptr Size, int Prot);
+
+} // namespace scudo
+
+#endif // SCUDO_LINUX || SCUDO_TRUSTY
+
+#endif // SCUDO_REPORT_LINUX_H_
diff --git a/compiler-rt/lib/scudo/standalone/rss_limit_checker.cpp b/compiler-rt/lib/scudo/standalone/rss_limit_checker.cpp
deleted file mode 100644
index f428386b755c..000000000000
--- a/compiler-rt/lib/scudo/standalone/rss_limit_checker.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- common.cpp ----------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "rss_limit_checker.h"
-#include "atomic_helpers.h"
-#include "string_utils.h"
-
-namespace scudo {
-
-void RssLimitChecker::check(u64 NextCheck) {
- // The interval for the checks is 250ms.
- static constexpr u64 CheckInterval = 250 * 1000000;
-
- // Early return in case another thread already did the calculation.
- if (!atomic_compare_exchange_strong(&RssNextCheckAtNS, &NextCheck,
- getMonotonicTime() + CheckInterval,
- memory_order_relaxed)) {
- return;
- }
-
- const uptr CurrentRssMb = GetRSS() >> 20;
-
- RssLimitExceeded Result = RssLimitExceeded::Neither;
- if (UNLIKELY(HardRssLimitMb && HardRssLimitMb < CurrentRssMb))
- Result = RssLimitExceeded::Hard;
- else if (UNLIKELY(SoftRssLimitMb && SoftRssLimitMb < CurrentRssMb))
- Result = RssLimitExceeded::Soft;
-
- atomic_store_relaxed(&RssLimitStatus, static_cast<u8>(Result));
-}
-
-} // namespace scudo
diff --git a/compiler-rt/lib/scudo/standalone/rss_limit_checker.h b/compiler-rt/lib/scudo/standalone/rss_limit_checker.h
deleted file mode 100644
index 29dc063f3fc4..000000000000
--- a/compiler-rt/lib/scudo/standalone/rss_limit_checker.h
+++ /dev/null
@@ -1,63 +0,0 @@
-//===-- common.h ------------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SCUDO_RSS_LIMIT_CHECKER_H_
-#define SCUDO_RSS_LIMIT_CHECKER_H_
-
-#include "atomic_helpers.h"
-#include "common.h"
-#include "internal_defs.h"
-
-namespace scudo {
-
-class RssLimitChecker {
-public:
- enum RssLimitExceeded {
- Neither,
- Soft,
- Hard,
- };
-
- void init(int SoftRssLimitMb, int HardRssLimitMb) {
- CHECK_GE(SoftRssLimitMb, 0);
- CHECK_GE(HardRssLimitMb, 0);
- this->SoftRssLimitMb = static_cast<uptr>(SoftRssLimitMb);
- this->HardRssLimitMb = static_cast<uptr>(HardRssLimitMb);
- }
-
- // Opportunistic RSS limit check. This will update the RSS limit status, if
- // it can, every 250ms, otherwise it will just return the current one.
- RssLimitExceeded getRssLimitExceeded() {
- if (!HardRssLimitMb && !SoftRssLimitMb)
- return RssLimitExceeded::Neither;
-
- u64 NextCheck = atomic_load_relaxed(&RssNextCheckAtNS);
- u64 Now = getMonotonicTime();
-
- if (UNLIKELY(Now >= NextCheck))
- check(NextCheck);
-
- return static_cast<RssLimitExceeded>(atomic_load_relaxed(&RssLimitStatus));
- }
-
- uptr getSoftRssLimit() const { return SoftRssLimitMb; }
- uptr getHardRssLimit() const { return HardRssLimitMb; }
-
-private:
- void check(u64 NextCheck);
-
- uptr SoftRssLimitMb = 0;
- uptr HardRssLimitMb = 0;
-
- atomic_u64 RssNextCheckAtNS = {};
- atomic_u8 RssLimitStatus = {};
-};
-
-} // namespace scudo
-
-#endif // SCUDO_RSS_LIMIT_CHECKER_H_
diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 105b154b5de2..8dc4c87fa7c6 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -72,11 +72,26 @@ static inline void unmap(LargeBlock::Header *H) {
MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
}
+namespace {
+struct CachedBlock {
+ uptr CommitBase = 0;
+ uptr CommitSize = 0;
+ uptr BlockBegin = 0;
+ MemMapT MemMap = {};
+ u64 Time = 0;
+
+ bool isValid() { return CommitBase != 0; }
+
+ void invalidate() { CommitBase = 0; }
+};
+} // namespace
+
template <typename Config> class MapAllocatorNoCache {
public:
void init(UNUSED s32 ReleaseToOsInterval) {}
bool retrieve(UNUSED Options Options, UNUSED uptr Size, UNUSED uptr Alignment,
- UNUSED LargeBlock::Header **H, UNUSED bool *Zeroed) {
+ UNUSED uptr HeadersSize, UNUSED LargeBlock::Header **H,
+ UNUSED bool *Zeroed) {
return false;
}
void store(UNUSED Options Options, LargeBlock::Header *H) { unmap(H); }
@@ -102,20 +117,22 @@ public:
static const uptr MaxUnusedCachePages = 4U;
template <typename Config>
-void mapSecondary(Options Options, uptr CommitBase, uptr CommitSize,
+bool mapSecondary(const Options &Options, uptr CommitBase, uptr CommitSize,
uptr AllocPos, uptr Flags, MemMapT &MemMap) {
+ Flags |= MAP_RESIZABLE;
+ Flags |= MAP_ALLOWNOMEM;
+
const uptr MaxUnusedCacheBytes = MaxUnusedCachePages * getPageSizeCached();
if (useMemoryTagging<Config>(Options) && CommitSize > MaxUnusedCacheBytes) {
const uptr UntaggedPos = Max(AllocPos, CommitBase + MaxUnusedCacheBytes);
- MemMap.remap(CommitBase, UntaggedPos - CommitBase, "scudo:secondary",
- MAP_RESIZABLE | MAP_MEMTAG | Flags);
- MemMap.remap(UntaggedPos, CommitBase + CommitSize - UntaggedPos,
- "scudo:secondary", MAP_RESIZABLE | Flags);
+ return MemMap.remap(CommitBase, UntaggedPos - CommitBase, "scudo:secondary",
+ MAP_MEMTAG | Flags) &&
+ MemMap.remap(UntaggedPos, CommitBase + CommitSize - UntaggedPos,
+ "scudo:secondary", Flags);
} else {
const uptr RemapFlags =
- MAP_RESIZABLE | (useMemoryTagging<Config>(Options) ? MAP_MEMTAG : 0) |
- Flags;
- MemMap.remap(CommitBase, CommitSize, "scudo:secondary", RemapFlags);
+ (useMemoryTagging<Config>(Options) ? MAP_MEMTAG : 0) | Flags;
+ return MemMap.remap(CommitBase, CommitSize, "scudo:secondary", RemapFlags);
}
}
@@ -138,17 +155,28 @@ public:
void getStats(ScopedString *Str) {
ScopedLock L(Mutex);
+ u32 Integral = 0;
+ u32 Fractional = 0;
+ if (CallsToRetrieve != 0) {
+ Integral = SuccessfulRetrieves * 100 / CallsToRetrieve;
+ Fractional = (((SuccessfulRetrieves * 100) % CallsToRetrieve) * 100 +
+ CallsToRetrieve / 2) /
+ CallsToRetrieve;
+ }
Str->append("Stats: MapAllocatorCache: EntriesCount: %d, "
"MaxEntriesCount: %u, MaxEntrySize: %zu\n",
EntriesCount, atomic_load_relaxed(&MaxEntriesCount),
atomic_load_relaxed(&MaxEntrySize));
+ Str->append("Stats: CacheRetrievalStats: SuccessRate: %u/%u "
+ "(%u.%02u%%)\n",
+ SuccessfulRetrieves, CallsToRetrieve, Integral, Fractional);
for (CachedBlock Entry : Entries) {
- if (!Entry.CommitBase)
+ if (!Entry.isValid())
continue;
Str->append("StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, "
- "BlockSize: %zu\n",
+ "BlockSize: %zu %s\n",
Entry.CommitBase, Entry.CommitBase + Entry.CommitSize,
- Entry.CommitSize);
+ Entry.CommitSize, Entry.Time == 0 ? "[R]" : "");
}
}
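
The Integral/Fractional pair in the new cache-retrieval stats is a fixed-point percentage with two digits: the `+ CallsToRetrieve / 2` term makes the final division round to nearest instead of truncating. A worked example:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t SuccessfulRetrieves = 7, CallsToRetrieve = 30; // 23.333...%
  const uint32_t Integral = SuccessfulRetrieves * 100 / CallsToRetrieve;
  const uint32_t Fractional =
      (((SuccessfulRetrieves * 100) % CallsToRetrieve) * 100 +
       CallsToRetrieve / 2) /
      CallsToRetrieve;
  assert(Integral == 23 && Fractional == 33); // printed as "7/30 (23.33%)"
}
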
@@ -166,7 +194,7 @@ public:
setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
}
- void store(Options Options, LargeBlock::Header *H) EXCLUDES(Mutex) {
+ void store(const Options &Options, LargeBlock::Header *H) EXCLUDES(Mutex) {
if (!canCache(H->CommitSize))
return unmap(H);
@@ -195,7 +223,7 @@ public:
MAP_NOACCESS);
}
} else if (Interval == 0) {
- Entry.MemMap.releasePagesToOS(Entry.CommitBase, Entry.CommitSize);
+ Entry.MemMap.releaseAndZeroPagesToOS(Entry.CommitBase, Entry.CommitSize);
Entry.Time = 0;
}
do {
@@ -210,7 +238,7 @@ public:
if (CacheConfig::QuarantineSize && useMemoryTagging<Config>(Options)) {
QuarantinePos =
(QuarantinePos + 1) % Max(CacheConfig::QuarantineSize, 1u);
- if (!Quarantine[QuarantinePos].CommitBase) {
+ if (!Quarantine[QuarantinePos].isValid()) {
Quarantine[QuarantinePos] = Entry;
return;
}
@@ -225,7 +253,7 @@ public:
EmptyCache = true;
} else {
for (u32 I = 0; I < MaxCount; I++) {
- if (Entries[I].CommitBase)
+ if (Entries[I].isValid())
continue;
if (I != 0)
Entries[I] = Entries[0];
@@ -246,26 +274,31 @@ public:
Entry.MemMap.unmap(Entry.MemMap.getBase(), Entry.MemMap.getCapacity());
}
- bool retrieve(Options Options, uptr Size, uptr Alignment,
+ bool retrieve(Options Options, uptr Size, uptr Alignment, uptr HeadersSize,
LargeBlock::Header **H, bool *Zeroed) EXCLUDES(Mutex) {
const uptr PageSize = getPageSizeCached();
const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
+ // 10% of the requested size proved to be the optimal choice for
+ // retrieving cached blocks after testing several options.
+ constexpr u32 FragmentedBytesDivisor = 10;
bool Found = false;
CachedBlock Entry;
- uptr HeaderPos = 0;
+ uptr EntryHeaderPos = 0;
{
ScopedLock L(Mutex);
+ CallsToRetrieve++;
if (EntriesCount == 0)
return false;
+ u32 OptimalFitIndex = 0;
+ uptr MinDiff = UINTPTR_MAX;
for (u32 I = 0; I < MaxCount; I++) {
- const uptr CommitBase = Entries[I].CommitBase;
- if (!CommitBase)
+ if (!Entries[I].isValid())
continue;
+ const uptr CommitBase = Entries[I].CommitBase;
const uptr CommitSize = Entries[I].CommitSize;
const uptr AllocPos =
roundDown(CommitBase + CommitSize - Size, Alignment);
- HeaderPos =
- AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize();
+ const uptr HeaderPos = AllocPos - HeadersSize;
if (HeaderPos > CommitBase + CommitSize)
continue;
if (HeaderPos < CommitBase ||
@@ -273,17 +306,36 @@ public:
continue;
}
Found = true;
- Entry = Entries[I];
- Entries[I].CommitBase = 0;
+ const uptr Diff = HeaderPos - CommitBase;
+ // Immediately use a cached block if its size is close enough to the
+ // requested size.
+ const uptr MaxAllowedFragmentedBytes =
+ (CommitBase + CommitSize - HeaderPos) / FragmentedBytesDivisor;
+ if (Diff <= MaxAllowedFragmentedBytes) {
+ OptimalFitIndex = I;
+ EntryHeaderPos = HeaderPos;
+ break;
+ }
+ // Keep track of the smallest cached block
+ // that is greater than (AllocSize + HeaderSize).
+ if (Diff > MinDiff)
+ continue;
+ OptimalFitIndex = I;
+ MinDiff = Diff;
+ EntryHeaderPos = HeaderPos;
+ }
+ if (Found) {
+ Entry = Entries[OptimalFitIndex];
+ Entries[OptimalFitIndex].invalidate();
EntriesCount--;
- break;
+ SuccessfulRetrieves++;
}
}
if (!Found)
return false;
*H = reinterpret_cast<LargeBlock::Header *>(
- LargeBlock::addHeaderTag<Config>(HeaderPos));
+ LargeBlock::addHeaderTag<Config>(EntryHeaderPos));
*Zeroed = Entry.Time == 0;
if (useMemoryTagging<Config>(Options))
Entry.MemMap.setMemoryPermission(Entry.CommitBase, Entry.CommitSize, 0);
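
The retrieval loop above implements a bounded best fit: a cached block is taken on the spot when the bytes wasted in front of the header are at most a tenth of the usable range (FragmentedBytesDivisor = 10); otherwise the candidate with the smallest waste wins at the end of the scan. A standalone sketch of that selection with simplified bounds checks:

#include <cassert>
#include <cstdint>
#include <vector>

struct Block { uint64_t Base, Size; };

int pickBlock(const std::vector<Block> &Blocks, uint64_t Need) {
  constexpr uint64_t FragmentedBytesDivisor = 10;
  int Best = -1;
  uint64_t MinDiff = UINT64_MAX;
  for (int I = 0; I < static_cast<int>(Blocks.size()); ++I) {
    if (Blocks[I].Size < Need)
      continue;
    const uint64_t HeaderPos = Blocks[I].Base + Blocks[I].Size - Need;
    const uint64_t Diff = HeaderPos - Blocks[I].Base; // wasted bytes in front
    const uint64_t MaxAllowedFragmentedBytes =
        (Blocks[I].Base + Blocks[I].Size - HeaderPos) / FragmentedBytesDivisor;
    if (Diff <= MaxAllowedFragmentedBytes)
      return I; // close enough, stop scanning
    if (Diff < MinDiff) {
      MinDiff = Diff;
      Best = I;
    }
  }
  return Best;
}

int main() {
  // A 4 KB request: the 32 KB entry wastes 28 KB, the 4 KB entry fits exactly.
  std::vector<Block> Entries = {{0x10000, 32768}, {0x30000, 4096}};
  assert(pickBlock(Entries, 4096) == 1);
}
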
@@ -295,8 +347,7 @@ public:
} else if (Entry.BlockBegin < NewBlockBegin) {
storeTags(Entry.BlockBegin, NewBlockBegin);
} else {
- storeTags(untagPointer(NewBlockBegin),
- untagPointer(Entry.BlockBegin));
+ storeTags(untagPointer(NewBlockBegin), untagPointer(Entry.BlockBegin));
}
}
(*H)->CommitBase = Entry.CommitBase;
@@ -338,15 +389,15 @@ public:
void disableMemoryTagging() EXCLUDES(Mutex) {
ScopedLock L(Mutex);
for (u32 I = 0; I != CacheConfig::QuarantineSize; ++I) {
- if (Quarantine[I].CommitBase) {
+ if (Quarantine[I].isValid()) {
MemMapT &MemMap = Quarantine[I].MemMap;
MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
- Quarantine[I].CommitBase = 0;
+ Quarantine[I].invalidate();
}
}
const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
for (u32 I = 0; I < MaxCount; I++) {
- if (Entries[I].CommitBase) {
+ if (Entries[I].isValid()) {
Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase,
Entries[I].CommitSize, 0);
}
@@ -367,10 +418,10 @@ private:
{
ScopedLock L(Mutex);
for (uptr I = 0; I < CacheConfig::EntriesArraySize; I++) {
- if (!Entries[I].CommitBase)
+ if (!Entries[I].isValid())
continue;
MapInfo[N] = Entries[I].MemMap;
- Entries[I].CommitBase = 0;
+ Entries[I].invalidate();
N++;
}
EntriesCount = 0;
@@ -382,23 +433,15 @@ private:
}
}
- struct CachedBlock {
- uptr CommitBase = 0;
- uptr CommitSize = 0;
- uptr BlockBegin = 0;
- MemMapT MemMap = {};
- u64 Time = 0;
- };
-
void releaseIfOlderThan(CachedBlock &Entry, u64 Time) REQUIRES(Mutex) {
- if (!Entry.CommitBase || !Entry.Time)
+ if (!Entry.isValid() || !Entry.Time)
return;
if (Entry.Time > Time) {
if (OldestTime == 0 || Entry.Time < OldestTime)
OldestTime = Entry.Time;
return;
}
- Entry.MemMap.releasePagesToOS(Entry.CommitBase, Entry.CommitSize);
+ Entry.MemMap.releaseAndZeroPagesToOS(Entry.CommitBase, Entry.CommitSize);
Entry.Time = 0;
}
@@ -421,6 +464,8 @@ private:
u64 OldestTime GUARDED_BY(Mutex) = 0;
u32 IsFullEvents GUARDED_BY(Mutex) = 0;
atomic_s32 ReleaseToOsIntervalMs = {};
+ u32 CallsToRetrieve GUARDED_BY(Mutex) = 0;
+ u32 SuccessfulRetrieves GUARDED_BY(Mutex) = 0;
CachedBlock Entries[CacheConfig::EntriesArraySize] GUARDED_BY(Mutex) = {};
NonZeroLengthArray<CachedBlock, CacheConfig::QuarantineSize>
@@ -439,11 +484,11 @@ public:
S->link(&Stats);
}
- void *allocate(Options Options, uptr Size, uptr AlignmentHint = 0,
+ void *allocate(const Options &Options, uptr Size, uptr AlignmentHint = 0,
uptr *BlockEnd = nullptr,
FillContentsMode FillContents = NoFill);
- void deallocate(Options Options, void *Ptr);
+ void deallocate(const Options &Options, void *Ptr);
static uptr getBlockEnd(void *Ptr) {
auto *B = LargeBlock::getHeader<Config>(Ptr);
@@ -454,6 +499,10 @@ public:
return getBlockEnd(Ptr) - reinterpret_cast<uptr>(Ptr);
}
+ static constexpr uptr getHeadersSize() {
+ return Chunk::getHeaderSize() + LargeBlock::getHeaderSize();
+ }
+
void disable() NO_THREAD_SAFETY_ANALYSIS {
Mutex.lock();
Cache.disable();
@@ -494,6 +543,7 @@ private:
DoublyLinkedList<LargeBlock::Header> InUseBlocks GUARDED_BY(Mutex);
uptr AllocatedBytes GUARDED_BY(Mutex) = 0;
uptr FreedBytes GUARDED_BY(Mutex) = 0;
+ uptr FragmentedBytes GUARDED_BY(Mutex) = 0;
uptr LargestSize GUARDED_BY(Mutex) = 0;
u32 NumberOfAllocs GUARDED_BY(Mutex) = 0;
u32 NumberOfFrees GUARDED_BY(Mutex) = 0;
@@ -512,24 +562,23 @@ private:
// the committed memory will amount to something close to Size - AlignmentHint
// (pending rounding and headers).
template <typename Config>
-void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment,
- uptr *BlockEndPtr,
+void *MapAllocator<Config>::allocate(const Options &Options, uptr Size,
+ uptr Alignment, uptr *BlockEndPtr,
FillContentsMode FillContents) {
if (Options.get(OptionBit::AddLargeAllocationSlack))
Size += 1UL << SCUDO_MIN_ALIGNMENT_LOG;
Alignment = Max(Alignment, uptr(1U) << SCUDO_MIN_ALIGNMENT_LOG);
const uptr PageSize = getPageSizeCached();
- uptr RoundedSize =
- roundUp(roundUp(Size, Alignment) + LargeBlock::getHeaderSize() +
- Chunk::getHeaderSize(),
- PageSize);
- if (Alignment > PageSize)
- RoundedSize += Alignment - PageSize;
- if (Alignment < PageSize && Cache.canCache(RoundedSize)) {
+ // Note that cached blocks may already have an aligned address. Thus we
+ // simply pass the required size (`Size` + `getHeadersSize()`) for the
+ // cache lookup.
+ const uptr MinNeededSizeForCache = roundUp(Size + getHeadersSize(), PageSize);
+
+ if (Alignment < PageSize && Cache.canCache(MinNeededSizeForCache)) {
LargeBlock::Header *H;
bool Zeroed;
- if (Cache.retrieve(Options, Size, Alignment, &H, &Zeroed)) {
+ if (Cache.retrieve(Options, Size, Alignment, getHeadersSize(), &H,
+ &Zeroed)) {
const uptr BlockEnd = H->CommitBase + H->CommitSize;
if (BlockEndPtr)
*BlockEndPtr = BlockEnd;
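
A worked example of the new cache-lookup size in allocate(): the key is just the page-rounded request plus header bytes, with no alignment slack, because a cached block already carries a usable aligned address. The header size below is a made-up stand-in for getHeadersSize():

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint64_t PageSize = 4096;
  constexpr uint64_t HeadersSize = 48; // hypothetical header bytes
  auto roundUp = [](uint64_t X, uint64_t Boundary) {
    return (X + Boundary - 1) / Boundary * Boundary;
  };
  const uint64_t Size = 10000;
  const uint64_t MinNeededSizeForCache = roundUp(Size + HeadersSize, PageSize);
  assert(MinNeededSizeForCache == 3 * PageSize); // 10048 rounds up to 12288
}
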
@@ -545,6 +594,7 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment,
ScopedLock L(Mutex);
InUseBlocks.push_back(H);
AllocatedBytes += H->CommitSize;
+ FragmentedBytes += H->MemMap.getCapacity() - H->CommitSize;
NumberOfAllocs++;
Stats.add(StatAllocated, H->CommitSize);
Stats.add(StatMapped, H->MemMap.getCapacity());
@@ -553,16 +603,22 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment,
}
}
+ uptr RoundedSize =
+ roundUp(roundUp(Size, Alignment) + getHeadersSize(), PageSize);
+ if (Alignment > PageSize)
+ RoundedSize += Alignment - PageSize;
+
ReservedMemoryT ReservedMemory;
const uptr MapSize = RoundedSize + 2 * PageSize;
- ReservedMemory.create(/*Addr=*/0U, MapSize, nullptr, MAP_ALLOWNOMEM);
+ if (UNLIKELY(!ReservedMemory.create(/*Addr=*/0U, MapSize, nullptr,
+ MAP_ALLOWNOMEM))) {
+ return nullptr;
+ }
// Take the entire ownership of reserved region.
MemMapT MemMap = ReservedMemory.dispatch(ReservedMemory.getBase(),
ReservedMemory.getCapacity());
uptr MapBase = MemMap.getBase();
- if (UNLIKELY(!MapBase))
- return nullptr;
uptr CommitBase = MapBase + PageSize;
uptr MapEnd = MapBase + MapSize;
@@ -592,9 +648,12 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment,
const uptr CommitSize = MapEnd - PageSize - CommitBase;
const uptr AllocPos = roundDown(CommitBase + CommitSize - Size, Alignment);
- mapSecondary<Config>(Options, CommitBase, CommitSize, AllocPos, 0, MemMap);
- const uptr HeaderPos =
- AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize();
+ if (!mapSecondary<Config>(Options, CommitBase, CommitSize, AllocPos, 0,
+ MemMap)) {
+ MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
+ return nullptr;
+ }
+ const uptr HeaderPos = AllocPos - getHeadersSize();
LargeBlock::Header *H = reinterpret_cast<LargeBlock::Header *>(
LargeBlock::addHeaderTag<Config>(HeaderPos));
if (useMemoryTagging<Config>(Options))
@@ -609,6 +668,7 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment,
ScopedLock L(Mutex);
InUseBlocks.push_back(H);
AllocatedBytes += CommitSize;
+ FragmentedBytes += H->MemMap.getCapacity() - CommitSize;
if (LargestSize < CommitSize)
LargestSize = CommitSize;
NumberOfAllocs++;
@@ -619,7 +679,7 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment,
}
template <typename Config>
-void MapAllocator<Config>::deallocate(Options Options, void *Ptr)
+void MapAllocator<Config>::deallocate(const Options &Options, void *Ptr)
EXCLUDES(Mutex) {
LargeBlock::Header *H = LargeBlock::getHeader<Config>(Ptr);
const uptr CommitSize = H->CommitSize;
@@ -627,6 +687,7 @@ void MapAllocator<Config>::deallocate(Options Options, void *Ptr)
ScopedLock L(Mutex);
InUseBlocks.remove(H);
FreedBytes += CommitSize;
+ FragmentedBytes -= H->MemMap.getCapacity() - CommitSize;
NumberOfFrees++;
Stats.sub(StatAllocated, CommitSize);
Stats.sub(StatMapped, H->MemMap.getCapacity());
@@ -638,10 +699,11 @@ template <typename Config>
void MapAllocator<Config>::getStats(ScopedString *Str) EXCLUDES(Mutex) {
ScopedLock L(Mutex);
Str->append("Stats: MapAllocator: allocated %u times (%zuK), freed %u times "
- "(%zuK), remains %u (%zuK) max %zuM\n",
+ "(%zuK), remains %u (%zuK) max %zuM, Fragmented %zuK\n",
NumberOfAllocs, AllocatedBytes >> 10, NumberOfFrees,
FreedBytes >> 10, NumberOfAllocs - NumberOfFrees,
- (AllocatedBytes - FreedBytes) >> 10, LargestSize >> 20);
+ (AllocatedBytes - FreedBytes) >> 10, LargestSize >> 20,
+ FragmentedBytes >> 10);
Cache.getStats(Str);
}
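
The new FragmentedBytes gauge counts reserved-but-uncommitted bytes: each live secondary block contributes its map capacity minus its commit size, added on allocate and subtracted on deallocate. A worked example with illustrative numbers:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Capacity = 16 * 4096;   // mapped reservation for the block
  const uint64_t CommitSize = 10 * 4096; // committed portion
  uint64_t FragmentedBytes = 0;
  FragmentedBytes += Capacity - CommitSize; // on allocate
  assert(FragmentedBytes == 6 * 4096);
  FragmentedBytes -= Capacity - CommitSize; // on deallocate
  assert(FragmentedBytes == 0);
}
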
diff --git a/compiler-rt/lib/scudo/standalone/size_class_map.h b/compiler-rt/lib/scudo/standalone/size_class_map.h
index 2a6e298f9366..4138885de338 100644
--- a/compiler-rt/lib/scudo/standalone/size_class_map.h
+++ b/compiler-rt/lib/scudo/standalone/size_class_map.h
@@ -254,7 +254,7 @@ struct AndroidSizeClassConfig {
static const u16 MaxNumCachedHint = 13;
static const uptr MaxBytesCachedLog = 13;
- static constexpr u32 Classes[] = {
+ static constexpr uptr Classes[] = {
0x00020, 0x00030, 0x00040, 0x00050, 0x00060, 0x00070, 0x00090, 0x000b0,
0x000c0, 0x000e0, 0x00120, 0x00160, 0x001c0, 0x00250, 0x00320, 0x00450,
0x00670, 0x00830, 0x00a10, 0x00c30, 0x01010, 0x01210, 0x01bd0, 0x02210,
@@ -269,7 +269,7 @@ struct AndroidSizeClassConfig {
static const u16 MaxNumCachedHint = 14;
static const uptr MaxBytesCachedLog = 13;
- static constexpr u32 Classes[] = {
+ static constexpr uptr Classes[] = {
0x00020, 0x00030, 0x00040, 0x00050, 0x00060, 0x00070, 0x00080, 0x00090,
0x000a0, 0x000b0, 0x000c0, 0x000e0, 0x000f0, 0x00110, 0x00120, 0x00130,
0x00150, 0x00160, 0x00170, 0x00190, 0x001d0, 0x00210, 0x00240, 0x002a0,
@@ -289,28 +289,6 @@ typedef TableSizeClassMap<AndroidSizeClassConfig> AndroidSizeClassMap;
static_assert(AndroidSizeClassMap::usesCompressedLSBFormat(), "");
#endif
-struct SvelteSizeClassConfig {
-#if SCUDO_WORDSIZE == 64U
- static const uptr NumBits = 4;
- static const uptr MinSizeLog = 4;
- static const uptr MidSizeLog = 8;
- static const uptr MaxSizeLog = 14;
- static const u16 MaxNumCachedHint = 13;
- static const uptr MaxBytesCachedLog = 10;
- static const uptr SizeDelta = Chunk::getHeaderSize();
-#else
- static const uptr NumBits = 4;
- static const uptr MinSizeLog = 3;
- static const uptr MidSizeLog = 7;
- static const uptr MaxSizeLog = 14;
- static const u16 MaxNumCachedHint = 14;
- static const uptr MaxBytesCachedLog = 10;
- static const uptr SizeDelta = Chunk::getHeaderSize();
-#endif
-};
-
-typedef FixedSizeClassMap<SvelteSizeClassConfig> SvelteSizeClassMap;
-
struct TrustySizeClassConfig {
static const uptr NumBits = 1;
static const uptr MinSizeLog = 5;
diff --git a/compiler-rt/lib/scudo/standalone/stack_depot.h b/compiler-rt/lib/scudo/standalone/stack_depot.h
index 458198fcb7aa..12c35eb2a4f3 100644
--- a/compiler-rt/lib/scudo/standalone/stack_depot.h
+++ b/compiler-rt/lib/scudo/standalone/stack_depot.h
@@ -62,8 +62,7 @@ class StackDepot {
// This is achieved by re-checking the hash of the stack trace before
// returning the trace.
-#ifdef SCUDO_FUZZ
- // Use smaller table sizes for fuzzing in order to reduce input size.
+#if SCUDO_SMALL_STACK_DEPOT
static const uptr TabBits = 4;
#else
static const uptr TabBits = 16;
@@ -72,7 +71,7 @@ class StackDepot {
static const uptr TabMask = TabSize - 1;
atomic_u32 Tab[TabSize] = {};
-#ifdef SCUDO_FUZZ
+#if SCUDO_SMALL_STACK_DEPOT
static const uptr RingBits = 4;
#else
static const uptr RingBits = 19;
diff --git a/compiler-rt/lib/scudo/standalone/trusty.cpp b/compiler-rt/lib/scudo/standalone/trusty.cpp
index 3191091e1b96..26b349c6e506 100644
--- a/compiler-rt/lib/scudo/standalone/trusty.cpp
+++ b/compiler-rt/lib/scudo/standalone/trusty.cpp
@@ -12,6 +12,7 @@
#include "common.h"
#include "mutex.h"
+#include "report_linux.h"
#include "trusty.h"
#include <errno.h> // for errno
@@ -50,7 +51,8 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags,
if (IS_ERR(P)) {
errno = lk_err_to_errno(PTR_ERR(P));
- dieOnMapUnmapError(Size);
+ if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM)
+ reportMapError(Size);
return nullptr;
}
@@ -60,7 +62,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags,
void unmap(UNUSED void *Addr, UNUSED uptr Size, UNUSED uptr Flags,
UNUSED MapPlatformData *Data) {
if (_trusty_munmap(Addr, Size) != 0)
- dieOnMapUnmapError();
+ reportUnmapError(reinterpret_cast<uptr>(Addr), Size);
}
void setMemoryPermission(UNUSED uptr Addr, UNUSED uptr Size, UNUSED uptr Flags,
diff --git a/compiler-rt/lib/scudo/standalone/tsd.h b/compiler-rt/lib/scudo/standalone/tsd.h
index f4fa545de5e0..b2108a01900b 100644
--- a/compiler-rt/lib/scudo/standalone/tsd.h
+++ b/compiler-rt/lib/scudo/standalone/tsd.h
@@ -53,8 +53,14 @@ template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD {
inline void unlock() NO_THREAD_SAFETY_ANALYSIS { Mutex.unlock(); }
inline uptr getPrecedence() { return atomic_load_relaxed(&Precedence); }
- void commitBack(Allocator *Instance) ASSERT_CAPABILITY(Mutex) {
- Instance->commitBack(this);
+ void commitBack(Allocator *Instance) { Instance->commitBack(this); }
+
+ // As noted in the comments attached to `getCache()`, the TSD doesn't always
+ // need to be locked. In that case, we only skip the check until the TSD is
+ // locked on all paths.
+ void assertLocked(bool BypassCheck) ASSERT_CAPABILITY(Mutex) {
+ if (SCUDO_DEBUG && !BypassCheck)
+ Mutex.assertHeld();
}
// Ideally, we may want to assert that all the operations on
@@ -66,11 +72,8 @@ template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD {
// TODO(chiahungduan): Ideally, we want to do `Mutex.assertHeld` but acquiring
// TSD doesn't always require holding the lock. Add this assertion while the
// lock is always acquired.
- typename Allocator::CacheT &getCache() ASSERT_CAPABILITY(Mutex) {
- return Cache;
- }
- typename Allocator::QuarantineCacheT &getQuarantineCache()
- ASSERT_CAPABILITY(Mutex) {
+ typename Allocator::CacheT &getCache() REQUIRES(Mutex) { return Cache; }
+ typename Allocator::QuarantineCacheT &getQuarantineCache() REQUIRES(Mutex) {
return QuarantineCache;
}
diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h
index dcb0948ad78f..1bca578ee14b 100644
--- a/compiler-rt/lib/scudo/standalone/tsd_shared.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h
@@ -120,6 +120,11 @@ struct TSDRegistrySharedT {
TSDsArraySize);
for (uptr I = 0; I < NumberOfTSDs; ++I) {
TSDs[I].lock();
+ // Ideally, we would mark TSD::lock()/TSD::unlock() with proper thread
+ // annotations. However, given that the TSD is only locked on this shared
+ // path, do the assertion in a separate function to avoid confusing the
+ // analyzer.
+ TSDs[I].assertLocked(/*BypassCheck=*/true);
Str->append(" Shared TSD[%zu]:\n", I);
TSDs[I].getCache().getStats(Str);
TSDs[I].unlock();
diff --git a/compiler-rt/lib/scudo/standalone/vector.h b/compiler-rt/lib/scudo/standalone/vector.h
index 9f2c200958fe..c0f1ba0eddfa 100644
--- a/compiler-rt/lib/scudo/standalone/vector.h
+++ b/compiler-rt/lib/scudo/standalone/vector.h
@@ -9,26 +9,20 @@
#ifndef SCUDO_VECTOR_H_
#define SCUDO_VECTOR_H_
-#include "common.h"
+#include "mem_map.h"
#include <string.h>
namespace scudo {
-// A low-level vector based on map. May incur a significant memory overhead for
-// small vectors. The current implementation supports only POD types.
+// A low-level vector based on map. It stores the contents inline up to a fixed
+// capacity, or in an external memory buffer if it grows bigger than that. May
+// incur a significant memory overhead for small vectors. The current
+// implementation supports only POD types.
+//
+// NOTE: This class is not meant to be used directly, use Vector<T> instead.
template <typename T> class VectorNoCtor {
public:
- constexpr void init(uptr InitialCapacity = 0) {
- Data = &LocalData[0];
- CapacityBytes = sizeof(LocalData);
- if (InitialCapacity > capacity())
- reserve(InitialCapacity);
- }
- void destroy() {
- if (Data != &LocalData[0])
- unmap(Data, CapacityBytes, 0, &MapData);
- }
T &operator[](uptr I) {
DCHECK_LT(I, Size);
return Data[I];
@@ -78,24 +72,43 @@ public:
const T *end() const { return data() + size(); }
T *end() { return data() + size(); }
+protected:
+ constexpr void init(uptr InitialCapacity = 0) {
+ Data = &LocalData[0];
+ CapacityBytes = sizeof(LocalData);
+ if (InitialCapacity > capacity())
+ reserve(InitialCapacity);
+ }
+ void destroy() {
+ if (Data != &LocalData[0])
+ ExternalBuffer.unmap(ExternalBuffer.getBase(),
+ ExternalBuffer.getCapacity());
+ }
+
private:
void reallocate(uptr NewCapacity) {
DCHECK_GT(NewCapacity, 0);
DCHECK_LE(Size, NewCapacity);
+
+ MemMapT NewExternalBuffer;
NewCapacity = roundUp(NewCapacity * sizeof(T), getPageSizeCached());
- T *NewData = reinterpret_cast<T *>(
- map(nullptr, NewCapacity, "scudo:vector", 0, &MapData));
- memcpy(NewData, Data, Size * sizeof(T));
+ NewExternalBuffer.map(/*Addr=*/0U, NewCapacity, "scudo:vector");
+ T *NewExternalData = reinterpret_cast<T *>(NewExternalBuffer.getBase());
+
+ memcpy(NewExternalData, Data, Size * sizeof(T));
destroy();
- Data = NewData;
+
+ Data = NewExternalData;
CapacityBytes = NewCapacity;
+ ExternalBuffer = NewExternalBuffer;
}
T *Data = nullptr;
- T LocalData[256 / sizeof(T)] = {};
uptr CapacityBytes = 0;
uptr Size = 0;
- [[no_unique_address]] MapPlatformData MapData = {};
+
+ T LocalData[256 / sizeof(T)] = {};
+ MemMapT ExternalBuffer;
};
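
The growth strategy above in miniature: elements live in the inline LocalData array until the vector outgrows it, after which they move to an external buffer, the only storage destroy() ever releases. A self-contained sketch that uses malloc/free in place of MemMapT:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct MiniVec {
  uint32_t Local[8] = {}; // inline storage, analogous to LocalData
  uint32_t *Data = Local;
  size_t Capacity = 8, Size = 0;

  void push_back(uint32_t V) {
    if (Size == Capacity) {
      const size_t NewCapacity = Capacity * 2;
      auto *NewData =
          static_cast<uint32_t *>(malloc(NewCapacity * sizeof(uint32_t)));
      memcpy(NewData, Data, Size * sizeof(uint32_t));
      if (Data != Local)
        free(Data); // only the external buffer is ever released
      Data = NewData;
      Capacity = NewCapacity;
    }
    Data[Size++] = V;
  }

  ~MiniVec() {
    if (Data != Local)
      free(Data);
  }
};

int main() {
  MiniVec V;
  for (uint32_t I = 0; I < 20; ++I)
    V.push_back(I);
  assert(V.Size == 20 && V.Data != V.Local); // spilled to the external buffer
}
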
template <typename T> class Vector : public VectorNoCtor<T> {
diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.cpp b/compiler-rt/lib/scudo/standalone/wrappers_c.cpp
index b4d51be716cc..60014a0f66bf 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c.cpp
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c.cpp
@@ -12,6 +12,9 @@
#if !SCUDO_ANDROID || !_BIONIC
#include "allocator_config.h"
+#include "internal_defs.h"
+#include "platform.h"
+#include "scudo/interface.h"
#include "wrappers_c.h"
#include "wrappers_c_checks.h"
diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
index 2c8e382dba0b..56d8ef20156e 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
@@ -17,6 +17,35 @@
#define SCUDO_MALLOC_ALIGNMENT FIRST_32_SECOND_64(8U, 16U)
#endif
+static void reportAllocation(void *ptr, size_t size) {
+ if (SCUDO_ENABLE_HOOKS)
+ if (__scudo_allocate_hook && ptr)
+ __scudo_allocate_hook(ptr, size);
+}
+static void reportDeallocation(void *ptr) {
+ if (SCUDO_ENABLE_HOOKS)
+ if (__scudo_deallocate_hook)
+ __scudo_deallocate_hook(ptr);
+}
+static void reportReallocAllocation(void *old_ptr, void *new_ptr, size_t size) {
+ DCHECK_NE(new_ptr, nullptr);
+
+ if (SCUDO_ENABLE_HOOKS) {
+ if (__scudo_realloc_allocate_hook)
+ __scudo_realloc_allocate_hook(old_ptr, new_ptr, size);
+ else if (__scudo_allocate_hook)
+ __scudo_allocate_hook(new_ptr, size);
+ }
+}
+static void reportReallocDeallocation(void *old_ptr) {
+ if (SCUDO_ENABLE_HOOKS) {
+ if (__scudo_realloc_deallocate_hook)
+ __scudo_realloc_deallocate_hook(old_ptr);
+ else if (__scudo_deallocate_hook)
+ __scudo_deallocate_hook(old_ptr);
+ }
+}
+
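
These helpers drive the weak hook symbols declared in scudo/interface.h; an embedder opts in by providing strong definitions. A sketch of such definitions (the logging body is illustrative):

#include <cstddef>
#include <cstdio>

extern "C" {
void __scudo_allocate_hook(void *Ptr, size_t Size) {
  fprintf(stderr, "alloc %p (%zu)\n", Ptr, Size);
}
void __scudo_deallocate_hook(void *Ptr) { fprintf(stderr, "free %p\n", Ptr); }
// If the realloc-specific hooks are left undefined, the helpers above fall
// back to this plain allocate/deallocate pair.
}
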
extern "C" {
INTERFACE WEAK void *SCUDO_PREFIX(calloc)(size_t nmemb, size_t size) {
@@ -28,11 +57,14 @@ INTERFACE WEAK void *SCUDO_PREFIX(calloc)(size_t nmemb, size_t size) {
}
scudo::reportCallocOverflow(nmemb, size);
}
- return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate(
- Product, scudo::Chunk::Origin::Malloc, SCUDO_MALLOC_ALIGNMENT, true));
+ void *Ptr = SCUDO_ALLOCATOR.allocate(Product, scudo::Chunk::Origin::Malloc,
+ SCUDO_MALLOC_ALIGNMENT, true);
+ reportAllocation(Ptr, Product);
+ return scudo::setErrnoOnNull(Ptr);
}
INTERFACE WEAK void SCUDO_PREFIX(free)(void *ptr) {
+ reportDeallocation(ptr);
SCUDO_ALLOCATOR.deallocate(ptr, scudo::Chunk::Origin::Malloc);
}
@@ -75,8 +107,10 @@ INTERFACE WEAK struct __scudo_mallinfo2 SCUDO_PREFIX(mallinfo2)(void) {
#endif
INTERFACE WEAK void *SCUDO_PREFIX(malloc)(size_t size) {
- return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate(
- size, scudo::Chunk::Origin::Malloc, SCUDO_MALLOC_ALIGNMENT));
+ void *Ptr = SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Malloc,
+ SCUDO_MALLOC_ALIGNMENT);
+ reportAllocation(Ptr, size);
+ return scudo::setErrnoOnNull(Ptr);
}
#if SCUDO_ANDROID
@@ -105,8 +139,10 @@ INTERFACE WEAK void *SCUDO_PREFIX(memalign)(size_t alignment, size_t size) {
scudo::reportAlignmentNotPowerOfTwo(alignment);
}
}
- return SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Memalign,
- alignment);
+ void *Ptr =
+ SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Memalign, alignment);
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK int SCUDO_PREFIX(posix_memalign)(void **memptr, size_t alignment,
@@ -120,6 +156,8 @@ INTERFACE WEAK int SCUDO_PREFIX(posix_memalign)(void **memptr, size_t alignment,
SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Memalign, alignment);
if (UNLIKELY(!Ptr))
return ENOMEM;
+ reportAllocation(Ptr, size);
+
*memptr = Ptr;
return 0;
}
@@ -134,26 +172,57 @@ INTERFACE WEAK void *SCUDO_PREFIX(pvalloc)(size_t size) {
scudo::reportPvallocOverflow(size);
}
// pvalloc(0) should allocate one page.
- return scudo::setErrnoOnNull(
+ void *Ptr =
SCUDO_ALLOCATOR.allocate(size ? scudo::roundUp(size, PageSize) : PageSize,
- scudo::Chunk::Origin::Memalign, PageSize));
+ scudo::Chunk::Origin::Memalign, PageSize);
+ reportAllocation(Ptr, size ? scudo::roundUp(size, PageSize) : PageSize);
+
+ return scudo::setErrnoOnNull(Ptr);
}
INTERFACE WEAK void *SCUDO_PREFIX(realloc)(void *ptr, size_t size) {
- if (!ptr)
- return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate(
- size, scudo::Chunk::Origin::Malloc, SCUDO_MALLOC_ALIGNMENT));
+ if (!ptr) {
+ void *Ptr = SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Malloc,
+ SCUDO_MALLOC_ALIGNMENT);
+ reportAllocation(Ptr, size);
+ return scudo::setErrnoOnNull(Ptr);
+ }
if (size == 0) {
+ reportDeallocation(ptr);
SCUDO_ALLOCATOR.deallocate(ptr, scudo::Chunk::Origin::Malloc);
return nullptr;
}
- return scudo::setErrnoOnNull(
- SCUDO_ALLOCATOR.reallocate(ptr, size, SCUDO_MALLOC_ALIGNMENT));
+
+ // Given that the reporting of deallocation and allocation is not atomic, we
+ // always pretend the old pointer will be released, so that the user never
+ // has to worry about a spurious double use from the hooks' point of view.
+ //
+ // For example, assume `realloc` releases the old pointer and allocates a new
+ // one. Before both operations have been reported, another thread may get the
+ // old pointer back from `malloc`. Without this ordering, the hook side could
+ // misinterpret that as a double use.
+ reportReallocDeallocation(ptr);
+ void *NewPtr = SCUDO_ALLOCATOR.reallocate(ptr, size, SCUDO_MALLOC_ALIGNMENT);
+ if (NewPtr != nullptr) {
+ // Note that even if NewPtr == ptr, the usable size may have changed, so we
+ // still need to report the new size.
+ reportReallocAllocation(/*OldPtr=*/ptr, NewPtr, size);
+ } else {
+ // If `realloc` fails, the old pointer is not released. Report the old
+ // pointer as allocated again.
+ reportReallocAllocation(/*OldPtr=*/ptr, /*NewPtr=*/ptr,
+ SCUDO_ALLOCATOR.getAllocSize(ptr));
+ }
+
+ return scudo::setErrnoOnNull(NewPtr);
}
INTERFACE WEAK void *SCUDO_PREFIX(valloc)(size_t size) {
- return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate(
- size, scudo::Chunk::Origin::Memalign, scudo::getPageSizeCached()));
+ void *Ptr = SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Memalign,
+ scudo::getPageSizeCached());
+ reportAllocation(Ptr, size);
+
+ return scudo::setErrnoOnNull(Ptr);
}
INTERFACE WEAK int SCUDO_PREFIX(malloc_iterate)(
@@ -198,6 +267,7 @@ INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, int value) {
return 1;
} else if (param == M_LOG_STATS) {
SCUDO_ALLOCATOR.printStats();
+ SCUDO_ALLOCATOR.printFragmentationInfo();
return 1;
} else {
scudo::Option option;
@@ -233,8 +303,12 @@ INTERFACE WEAK void *SCUDO_PREFIX(aligned_alloc)(size_t alignment,
}
scudo::reportInvalidAlignedAllocAlignment(alignment, size);
}
- return scudo::setErrnoOnNull(
- SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Malloc, alignment));
+
+ void *Ptr =
+ SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Malloc, alignment);
+ reportAllocation(Ptr, size);
+
+ return scudo::setErrnoOnNull(Ptr);
}
INTERFACE WEAK int SCUDO_PREFIX(malloc_info)(UNUSED int options, FILE *stream) {
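The hook plumbing in wrappers_c.inc is deliberately ordered so a consumer never observes the same pointer live twice: realloc reports the old pointer as deallocated before the outcome is known, then re-reports it as allocated if the call fails or returns the same address. Here is a hedged sketch of a consumer relying on that ordering; the hook names and signatures are the weak symbols declared in scudo/interface.h and driven by this patch, while the live-allocation counter itself is purely illustrative.

#include <atomic>
#include <cstddef>

// Illustrative consumer state; not part of scudo.
static std::atomic<long> LiveAllocs{0};

extern "C" {
// reportAllocation() filters out null results before calling this, so ptr is
// always a real allocation here.
void __scudo_allocate_hook(void *ptr, size_t size) {
  (void)ptr;
  (void)size;
  LiveAllocs.fetch_add(1, std::memory_order_relaxed);
}
// free(nullptr) is reported too, so tolerate a null pointer.
void __scudo_deallocate_hook(void *ptr) {
  if (ptr)
    LiveAllocs.fetch_sub(1, std::memory_order_relaxed);
}
// realloc is always a deallocate/allocate pair, even when the pointer is
// reused or the call fails (the old pointer is then re-reported as
// allocated), so the counter stays balanced with no special casing.
void __scudo_realloc_deallocate_hook(void *old_ptr) {
  if (old_ptr)
    LiveAllocs.fetch_sub(1, std::memory_order_relaxed);
}
void __scudo_realloc_allocate_hook(void *old_ptr, void *new_ptr, size_t size) {
  (void)old_ptr;
  (void)size;
  if (new_ptr)
    LiveAllocs.fetch_add(1, std::memory_order_relaxed);
}
} // extern "C"

Because the interface declares these hooks as weak symbols, providing strong definitions like the above and linking them into a SCUDO_ENABLE_HOOKS build is all the registration needed.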
diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp b/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
index 1b9fe67d920c..21694c3f17fe 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
@@ -12,6 +12,9 @@
#if SCUDO_ANDROID && _BIONIC
#include "allocator_config.h"
+#include "internal_defs.h"
+#include "platform.h"
+#include "scudo/interface.h"
#include "wrappers_c.h"
#include "wrappers_c_checks.h"
@@ -24,7 +27,7 @@
extern "C" void SCUDO_PREFIX(malloc_postinit)();
SCUDO_REQUIRE_CONSTANT_INITIALIZATION
-static scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)>
+static scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)>
SCUDO_ALLOCATOR;
#include "wrappers_c.inc"
@@ -35,15 +38,15 @@ static scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)>
// TODO(kostyak): support both allocators.
INTERFACE void __scudo_print_stats(void) { Allocator.printStats(); }
-INTERFACE void
-__scudo_get_error_info(struct scudo_error_info *error_info,
- uintptr_t fault_addr, const char *stack_depot,
- const char *region_info, const char *ring_buffer,
- const char *memory, const char *memory_tags,
- uintptr_t memory_addr, size_t memory_size) {
+INTERFACE void __scudo_get_error_info(
+ struct scudo_error_info *error_info, uintptr_t fault_addr,
+ const char *stack_depot, size_t stack_depot_size, const char *region_info,
+ const char *ring_buffer, size_t ring_buffer_size, const char *memory,
+ const char *memory_tags, uintptr_t memory_addr, size_t memory_size) {
+ (void)(stack_depot_size);
Allocator.getErrorInfo(error_info, fault_addr, stack_depot, region_info,
- ring_buffer, memory, memory_tags, memory_addr,
- memory_size);
+ ring_buffer, ring_buffer_size, memory, memory_tags,
+ memory_addr, memory_size);
}
INTERFACE const char *__scudo_get_stack_depot_addr() {
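__scudo_get_error_info now takes the serialized stack-depot and ring-buffer sizes so the allocator can bounds-check both regions; the (void)(stack_depot_size) cast shows the depot size is accepted but not yet consumed on this path. Below is a hedged sketch of a caller updated for the new signature; report_crash and all of its parameters are placeholders for data a crash handler would have copied out of the faulting process, and the header is assumed to be on the include path.

#include <scudo/interface.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical wrapper: none of these names exist in scudo itself.
void report_crash(uintptr_t fault_addr, const char *stack_depot,
                  size_t stack_depot_size, const char *region_info,
                  const char *ring_buffer, size_t ring_buffer_size,
                  const char *memory, const char *memory_tags,
                  uintptr_t memory_addr, size_t memory_size) {
  struct scudo_error_info info = {};
  // The two size_t parameters are the additions in this patch: they describe
  // how many bytes of the depot and the ring buffer were actually captured.
  __scudo_get_error_info(&info, fault_addr, stack_depot, stack_depot_size,
                         region_info, ring_buffer, ring_buffer_size, memory,
                         memory_tags, memory_addr, memory_size);
  // info.reports[] now holds any detected use-after-free/overflow candidates.
  (void)info;
}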
diff --git a/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp b/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp
index 374e36d72b3d..098d4f71acc4 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp
+++ b/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp
@@ -12,6 +12,9 @@
#if !SCUDO_ANDROID || !_BIONIC
#include "allocator_config.h"
+#include "internal_defs.h"
+#include "platform.h"
+#include "scudo/interface.h"
#include "wrappers_c.h"
#include <stdint.h>
@@ -21,86 +24,125 @@ struct nothrow_t {};
enum class align_val_t : size_t {};
} // namespace std
+static void reportAllocation(void *ptr, size_t size) {
+ if (SCUDO_ENABLE_HOOKS)
+ if (__scudo_allocate_hook && ptr)
+ __scudo_allocate_hook(ptr, size);
+}
+static void reportDeallocation(void *ptr) {
+ if (SCUDO_ENABLE_HOOKS)
+ if (__scudo_deallocate_hook)
+ __scudo_deallocate_hook(ptr);
+}
+
INTERFACE WEAK void *operator new(size_t size) {
- return Allocator.allocate(size, scudo::Chunk::Origin::New);
+ void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::New);
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK void *operator new[](size_t size) {
- return Allocator.allocate(size, scudo::Chunk::Origin::NewArray);
+ void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::NewArray);
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK void *operator new(size_t size,
std::nothrow_t const &) NOEXCEPT {
- return Allocator.allocate(size, scudo::Chunk::Origin::New);
+ void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::New);
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK void *operator new[](size_t size,
std::nothrow_t const &) NOEXCEPT {
- return Allocator.allocate(size, scudo::Chunk::Origin::NewArray);
+ void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::NewArray);
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK void *operator new(size_t size, std::align_val_t align) {
- return Allocator.allocate(size, scudo::Chunk::Origin::New,
- static_cast<scudo::uptr>(align));
+ void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::New,
+ static_cast<scudo::uptr>(align));
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK void *operator new[](size_t size, std::align_val_t align) {
- return Allocator.allocate(size, scudo::Chunk::Origin::NewArray,
- static_cast<scudo::uptr>(align));
+ void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::NewArray,
+ static_cast<scudo::uptr>(align));
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK void *operator new(size_t size, std::align_val_t align,
std::nothrow_t const &) NOEXCEPT {
- return Allocator.allocate(size, scudo::Chunk::Origin::New,
- static_cast<scudo::uptr>(align));
+ void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::New,
+ static_cast<scudo::uptr>(align));
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK void *operator new[](size_t size, std::align_val_t align,
std::nothrow_t const &) NOEXCEPT {
- return Allocator.allocate(size, scudo::Chunk::Origin::NewArray,
- static_cast<scudo::uptr>(align));
+ void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::NewArray,
+ static_cast<scudo::uptr>(align));
+ reportAllocation(Ptr, size);
+ return Ptr;
}
INTERFACE WEAK void operator delete(void *ptr) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::New);
}
INTERFACE WEAK void operator delete[](void *ptr) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray);
}
INTERFACE WEAK void operator delete(void *ptr,
std::nothrow_t const &) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::New);
}
INTERFACE WEAK void operator delete[](void *ptr,
std::nothrow_t const &) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray);
}
INTERFACE WEAK void operator delete(void *ptr, size_t size) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::New, size);
}
INTERFACE WEAK void operator delete[](void *ptr, size_t size) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, size);
}
INTERFACE WEAK void operator delete(void *ptr,
std::align_val_t align) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::New, 0,
static_cast<scudo::uptr>(align));
}
INTERFACE WEAK void operator delete[](void *ptr,
std::align_val_t align) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, 0,
static_cast<scudo::uptr>(align));
}
INTERFACE WEAK void operator delete(void *ptr, std::align_val_t align,
std::nothrow_t const &) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::New, 0,
static_cast<scudo::uptr>(align));
}
INTERFACE WEAK void operator delete[](void *ptr, std::align_val_t align,
std::nothrow_t const &) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, 0,
static_cast<scudo::uptr>(align));
}
INTERFACE WEAK void operator delete(void *ptr, size_t size,
std::align_val_t align) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::New, size,
static_cast<scudo::uptr>(align));
}
INTERFACE WEAK void operator delete[](void *ptr, size_t size,
std::align_val_t align) NOEXCEPT {
+ reportDeallocation(ptr);
Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, size,
static_cast<scudo::uptr>(align));
}
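With this change every C++ allocation path reports through the hooks: the plain, nothrow, array, sized, and aligned forms of operator new/delete each pair one reportAllocation with one reportDeallocation. The illustrative driver below (standard C++ only, nothing scudo-specific) exercises one overload of each kind; which delete overload the compiler actually selects (sized vs. unsized) is implementation-dependent, as the comments hedge.

#include <new>

struct alignas(64) Wide {
  char payload[64];
};

int main() {
  int *a = new int(1);                // operator new(size_t)
  delete a;                           // typically the sized operator delete

  int *b = new (std::nothrow) int[8]; // nothrow array form
  delete[] b;                         // array delete (possibly sized)

  Wide *w = new Wide;                 // operator new(size_t, align_val_t),
                                      // since alignof(Wide) exceeds the
                                      // default new alignment on most ABIs
  delete w;                           // aligned (and possibly sized) delete
  return 0;
}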