18 files changed, 1731 insertions, 854 deletions
diff --git a/contrib/jemalloc/src/arena.c b/contrib/jemalloc/src/arena.c
index 43733cc15727..99e20fde363e 100644
--- a/contrib/jemalloc/src/arena.c
+++ b/contrib/jemalloc/src/arena.c
@@ -4,18 +4,32 @@
 /******************************************************************************/
 /* Data. */
 
+purge_mode_t	opt_purge = PURGE_DEFAULT;
+const char	*purge_mode_names[] = {
+	"ratio",
+	"decay",
+	"N/A"
+};
 ssize_t		opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT;
 static ssize_t	lg_dirty_mult_default;
+ssize_t		opt_decay_time = DECAY_TIME_DEFAULT;
+static ssize_t	decay_time_default;
+
 arena_bin_info_t	arena_bin_info[NBINS];
 
 size_t		map_bias;
 size_t		map_misc_offset;
 size_t		arena_maxrun; /* Max run size for arenas. */
 size_t		large_maxclass; /* Max large size class. */
-static size_t	small_maxrun; /* Max run size used for small size classes. */
+size_t		run_quantize_max; /* Max run_quantize_*() input. */
+static size_t	small_maxrun; /* Max run size for small size classes. */
 static bool	*small_run_tab; /* Valid small run page multiples. */
+static size_t	*run_quantize_floor_tab; /* run_quantize_floor() memoization. */
+static size_t	*run_quantize_ceil_tab; /* run_quantize_ceil() memoization. */
 unsigned	nlclasses; /* Number of large size classes. */
 unsigned	nhclasses; /* Number of huge size classes. */
+static szind_t	runs_avail_bias; /* Size index for first runs_avail tree. */
+static szind_t	runs_avail_nclasses; /* Number of runs_avail trees. */
 
 /******************************************************************************/
 /*
@@ -23,7 +37,7 @@ unsigned	nhclasses; /* Number of huge size classes. */
  * definition.
  */
 
-static void	arena_purge(arena_t *arena, bool all);
+static void	arena_purge_to_limit(arena_t *arena, size_t ndirty_limit);
 static void	arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty,
     bool cleaned, bool decommitted);
 static void	arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk,
@@ -33,42 +47,12 @@ static void	arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk,
 
 /******************************************************************************/
 
-#define	CHUNK_MAP_KEY		((uintptr_t)0x1U)
-
-JEMALLOC_INLINE_C arena_chunk_map_misc_t *
-arena_miscelm_key_create(size_t size)
-{
-
-	return ((arena_chunk_map_misc_t *)(arena_mapbits_size_encode(size) |
-	    CHUNK_MAP_KEY));
-}
-
-JEMALLOC_INLINE_C bool
-arena_miscelm_is_key(const arena_chunk_map_misc_t *miscelm)
-{
-
-	return (((uintptr_t)miscelm & CHUNK_MAP_KEY) != 0);
-}
-
-#undef CHUNK_MAP_KEY
-
-JEMALLOC_INLINE_C size_t
-arena_miscelm_key_size_get(const arena_chunk_map_misc_t *miscelm)
-{
-
-	assert(arena_miscelm_is_key(miscelm));
-
-	return (arena_mapbits_size_decode((uintptr_t)miscelm));
-}
-
 JEMALLOC_INLINE_C size_t
-arena_miscelm_size_get(arena_chunk_map_misc_t *miscelm)
+arena_miscelm_size_get(const arena_chunk_map_misc_t *miscelm)
 {
 	arena_chunk_t *chunk;
 	size_t pageind, mapbits;
 
-	assert(!arena_miscelm_is_key(miscelm));
-
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm);
 	pageind = arena_miscelm_to_pageind(miscelm);
 	mapbits = arena_mapbits_get(chunk, pageind);
@@ -76,7 +60,8 @@ arena_miscelm_size_get(arena_chunk_map_misc_t *miscelm)
 }
 
 JEMALLOC_INLINE_C int
-arena_run_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b)
+arena_run_addr_comp(const arena_chunk_map_misc_t *a,
+    const arena_chunk_map_misc_t *b)
 {
 	uintptr_t a_miscelm = (uintptr_t)a;
 	uintptr_t b_miscelm = (uintptr_t)b;
@@ -89,10 +74,10 @@ arena_run_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b)
 
 /* Generate red-black tree functions. */
 rb_gen(static UNUSED, arena_run_tree_, arena_run_tree_t, arena_chunk_map_misc_t,
-    rb_link, arena_run_comp)
+    rb_link, arena_run_addr_comp)
 
 static size_t
-run_quantize(size_t size)
+run_quantize_floor_compute(size_t size)
 {
 	size_t qsize;
 
@@ -110,13 +95,13 @@ run_quantize(size_t size)
 	 */
 	qsize = index2size(size2index(size - large_pad + 1) - 1) + large_pad;
 	if (qsize <= SMALL_MAXCLASS + large_pad)
-		return (run_quantize(size - large_pad));
+		return (run_quantize_floor_compute(size - large_pad));
 	assert(qsize <= size);
 	return (qsize);
 }
 
 static size_t
-run_quantize_next(size_t size)
+run_quantize_ceil_compute_hard(size_t size)
 {
 	size_t large_run_size_next;
 
@@ -150,9 +135,9 @@ run_quantize_next(size_t size)
 }
 
 static size_t
-run_quantize_first(size_t size)
+run_quantize_ceil_compute(size_t size)
 {
-	size_t qsize = run_quantize(size);
+	size_t qsize = run_quantize_floor_compute(size);
 
 	if (qsize < size) {
 		/*
@@ -163,65 +148,89 @@ run_quantize_first(size_t size)
 		 * search would potentially find sufficiently aligned available
 		 * memory somewhere lower.
 		 */
-		qsize = run_quantize_next(size);
+		qsize = run_quantize_ceil_compute_hard(qsize);
 	}
 	return (qsize);
 }
 
-JEMALLOC_INLINE_C int
-arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b)
+#ifdef JEMALLOC_JET
+#undef run_quantize_floor
+#define	run_quantize_floor JEMALLOC_N(run_quantize_floor_impl)
+#endif
+static size_t
+run_quantize_floor(size_t size)
 {
-	int ret;
-	uintptr_t a_miscelm = (uintptr_t)a;
-	size_t a_qsize = run_quantize(arena_miscelm_is_key(a) ?
-	    arena_miscelm_key_size_get(a) : arena_miscelm_size_get(a));
-	size_t b_qsize = run_quantize(arena_miscelm_size_get(b));
+	size_t ret;
 
-	/*
-	 * Compare based on quantized size rather than size, in order to sort
-	 * equally useful runs only by address.
-	 */
-	ret = (a_qsize > b_qsize) - (a_qsize < b_qsize);
-	if (ret == 0) {
-		if (!arena_miscelm_is_key(a)) {
-			uintptr_t b_miscelm = (uintptr_t)b;
+	assert(size > 0);
+	assert(size <= run_quantize_max);
+	assert((size & PAGE_MASK) == 0);
 
-			ret = (a_miscelm > b_miscelm) - (a_miscelm < b_miscelm);
-		} else {
-			/*
-			 * Treat keys as if they are lower than anything else.
-			 */
-			ret = -1;
-		}
-	}
+	ret = run_quantize_floor_tab[(size >> LG_PAGE) - 1];
+	assert(ret == run_quantize_floor_compute(size));
+	return (ret);
+}
+#ifdef JEMALLOC_JET
+#undef run_quantize_floor
+#define	run_quantize_floor JEMALLOC_N(run_quantize_floor)
+run_quantize_t *run_quantize_floor = JEMALLOC_N(run_quantize_floor_impl);
+#endif
+
+#ifdef JEMALLOC_JET
+#undef run_quantize_ceil
+#define	run_quantize_ceil JEMALLOC_N(run_quantize_ceil_impl)
+#endif
+static size_t
+run_quantize_ceil(size_t size)
+{
+	size_t ret;
+
+	assert(size > 0);
+	assert(size <= run_quantize_max);
+	assert((size & PAGE_MASK) == 0);
 
+	ret = run_quantize_ceil_tab[(size >> LG_PAGE) - 1];
+	assert(ret == run_quantize_ceil_compute(size));
 	return (ret);
 }
+#ifdef JEMALLOC_JET
+#undef run_quantize_ceil
+#define	run_quantize_ceil JEMALLOC_N(run_quantize_ceil)
+run_quantize_t *run_quantize_ceil = JEMALLOC_N(run_quantize_ceil_impl);
+#endif
 
-/* Generate red-black tree functions. */
-rb_gen(static UNUSED, arena_avail_tree_, arena_avail_tree_t,
-    arena_chunk_map_misc_t, rb_link, arena_avail_comp)
+static arena_run_tree_t *
+arena_runs_avail_get(arena_t *arena, szind_t ind)
+{
+
+	assert(ind >= runs_avail_bias);
+	assert(ind - runs_avail_bias < runs_avail_nclasses);
+
+	return (&arena->runs_avail[ind - runs_avail_bias]);
+}
 
 static void
 arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind,
     size_t npages)
 {
-
+	szind_t ind = size2index(run_quantize_floor(arena_miscelm_size_get(
+	    arena_miscelm_get(chunk, pageind))));
 	assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >>
 	    LG_PAGE));
-	arena_avail_tree_insert(&arena->runs_avail, arena_miscelm_get(chunk,
-	    pageind));
+	arena_run_tree_insert(arena_runs_avail_get(arena, ind),
+	    arena_miscelm_get(chunk, pageind));
 }
 
 static void
 arena_avail_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind,
     size_t npages)
 {
-
+	szind_t ind = size2index(run_quantize_floor(arena_miscelm_size_get(
+	    arena_miscelm_get(chunk, pageind))));
 	assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >>
 	    LG_PAGE));
-	arena_avail_tree_remove(&arena->runs_avail, arena_miscelm_get(chunk,
-	    pageind));
+	arena_run_tree_remove(arena_runs_avail_get(arena, ind),
+	    arena_miscelm_get(chunk, pageind));
 }
 
 static void
@@ -292,14 +301,14 @@ JEMALLOC_INLINE_C void *
 arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info)
 {
 	void *ret;
-	unsigned regind;
+	size_t regind;
 	arena_chunk_map_misc_t *miscelm;
 	void *rpages;
 
 	assert(run->nfree > 0);
 	assert(!bitmap_full(run->bitmap, &bin_info->bitmap_info));
 
-	regind = bitmap_sfu(run->bitmap, &bin_info->bitmap_info);
+	regind = (unsigned)bitmap_sfu(run->bitmap, &bin_info->bitmap_info);
 	miscelm = arena_run_to_miscelm(run);
 	rpages = arena_miscelm_to_rpages(miscelm);
 	ret = (void *)((uintptr_t)rpages + (uintptr_t)bin_info->reg0_offset +
@@ -316,7 +325,7 @@ arena_run_reg_dalloc(arena_run_t *run, void *ptr)
 	size_t mapbits = arena_mapbits_get(chunk, pageind);
 	szind_t binind = arena_ptr_small_binind_get(ptr, mapbits);
 	arena_bin_info_t *bin_info = &arena_bin_info[binind];
-	unsigned regind = arena_run_regind(run, bin_info, ptr);
+	size_t regind = arena_run_regind(run, bin_info, ptr);
 
 	assert(run->nfree < bin_info->nregs);
 	/* Freeing an interior pointer can cause assertion failure. */
@@ -364,16 +373,30 @@ arena_run_page_validate_zeroed(arena_chunk_t *chunk, size_t run_ind)
 }
 
 static void
-arena_cactive_update(arena_t *arena, size_t add_pages, size_t sub_pages)
+arena_nactive_add(arena_t *arena, size_t add_pages)
 {
 
 	if (config_stats) {
-		ssize_t cactive_diff = CHUNK_CEILING((arena->nactive + add_pages
-		    - sub_pages) << LG_PAGE) - CHUNK_CEILING(arena->nactive <<
+		size_t cactive_add = CHUNK_CEILING((arena->nactive +
+		    add_pages) << LG_PAGE) - CHUNK_CEILING(arena->nactive <<
 		    LG_PAGE);
-		if (cactive_diff != 0)
-			stats_cactive_add(cactive_diff);
+		if (cactive_add != 0)
+			stats_cactive_add(cactive_add);
+	}
+	arena->nactive += add_pages;
+}
+
+static void
+arena_nactive_sub(arena_t *arena, size_t sub_pages)
+{
+
+	if (config_stats) {
+		size_t cactive_sub = CHUNK_CEILING(arena->nactive << LG_PAGE) -
+		    CHUNK_CEILING((arena->nactive - sub_pages) << LG_PAGE);
+		if (cactive_sub != 0)
+			stats_cactive_sub(cactive_sub);
 	}
+	arena->nactive -= sub_pages;
 }
 
 static void
@@ -394,8 +417,7 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind,
 	arena_avail_remove(arena, chunk, run_ind, total_pages);
 	if (flag_dirty != 0)
 		arena_run_dirty_remove(arena, chunk, run_ind, total_pages);
-	arena_cactive_update(arena, need_pages, 0);
-	arena->nactive += need_pages;
+	arena_nactive_add(arena, need_pages);
 
 	/* Keep track of trailing unused pages for later use. */
 	if (rem_pages > 0) {
@@ -711,7 +733,6 @@ arena_chunk_alloc(arena_t *arena)
 			return (NULL);
 	}
 
-	/* Insert the run into the runs_avail tree. */
 	arena_avail_insert(arena, chunk, map_bias, chunk_npages-map_bias);
 
 	return (chunk);
@@ -732,10 +753,7 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk)
 	assert(arena_mapbits_decommitted_get(chunk, map_bias) ==
 	    arena_mapbits_decommitted_get(chunk, chunk_npages-1));
 
-	/*
-	 * Remove run from the runs_avail tree, so that the arena does not use
-	 * it.
-	 */
+	/* Remove run from runs_avail, so that the arena does not use it. */
 	arena_avail_remove(arena, chunk, map_bias, chunk_npages-map_bias);
 
 	if (arena->spare != NULL) {
@@ -888,7 +906,7 @@ arena_chunk_alloc_huge_hard(arena_t *arena, chunk_hooks_t *chunk_hooks,
 			arena_huge_malloc_stats_update_undo(arena, usize);
 			arena->stats.mapped -= usize;
 		}
-		arena->nactive -= (usize >> LG_PAGE);
+		arena_nactive_sub(arena, usize >> LG_PAGE);
 		malloc_mutex_unlock(&arena->lock);
 	}
 
@@ -910,7 +928,7 @@ arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment,
 		arena_huge_malloc_stats_update(arena, usize);
 		arena->stats.mapped += usize;
 	}
-	arena->nactive += (usize >> LG_PAGE);
+	arena_nactive_add(arena, usize >> LG_PAGE);
 
 	ret = chunk_alloc_cache(arena, &chunk_hooks, NULL, csize, alignment,
 	    zero, true);
@@ -920,8 +938,6 @@ arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment,
 		    alignment, zero, csize);
 	}
 
-	if (config_stats && ret != NULL)
-		stats_cactive_add(usize);
 	return (ret);
 }
 
@@ -936,9 +952,8 @@ arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize)
 	if (config_stats) {
 		arena_huge_dalloc_stats_update(arena, usize);
 		arena->stats.mapped -= usize;
-		stats_cactive_sub(usize);
 	}
-	arena->nactive -= (usize >> LG_PAGE);
+	arena_nactive_sub(arena, usize >> LG_PAGE);
 
 	chunk_dalloc_cache(arena, &chunk_hooks, chunk, csize, true);
 	malloc_mutex_unlock(&arena->lock);
@@ -955,17 +970,10 @@ arena_chunk_ralloc_huge_similar(arena_t *arena, void *chunk, size_t oldsize,
 	malloc_mutex_lock(&arena->lock);
 	if (config_stats)
 		arena_huge_ralloc_stats_update(arena, oldsize, usize);
-	if (oldsize < usize) {
-		size_t udiff = usize - oldsize;
-		arena->nactive += udiff >> LG_PAGE;
-		if (config_stats)
-			stats_cactive_add(udiff);
-	} else {
-		size_t udiff = oldsize - usize;
-		arena->nactive -= udiff >> LG_PAGE;
-		if (config_stats)
-			stats_cactive_sub(udiff);
-	}
+	if (oldsize < usize)
+		arena_nactive_add(arena, (usize - oldsize) >> LG_PAGE);
+	else
+		arena_nactive_sub(arena, (oldsize - usize) >> LG_PAGE);
 	malloc_mutex_unlock(&arena->lock);
 }
 
@@ -979,12 +987,10 @@ arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize,
 	malloc_mutex_lock(&arena->lock);
 	if (config_stats) {
 		arena_huge_ralloc_stats_update(arena, oldsize, usize);
-		if (cdiff != 0) {
+		if (cdiff != 0)
 			arena->stats.mapped -= cdiff;
-			stats_cactive_sub(udiff);
-		}
 	}
-	arena->nactive -= udiff >> LG_PAGE;
+	arena_nactive_sub(arena, udiff >> LG_PAGE);
 
 	if (cdiff != 0) {
 		chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER;
@@ -1014,7 +1020,7 @@ arena_chunk_ralloc_huge_expand_hard(arena_t *arena, chunk_hooks_t *chunk_hooks,
 			    usize);
 			arena->stats.mapped -= cdiff;
 		}
-		arena->nactive -= (udiff >> LG_PAGE);
+		arena_nactive_sub(arena, udiff >> LG_PAGE);
 		malloc_mutex_unlock(&arena->lock);
 	} else if (chunk_hooks->merge(chunk, CHUNK_CEILING(oldsize), nchunk,
 	    cdiff, true, arena->ind)) {
@@ -1042,7 +1048,7 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize,
 		arena_huge_ralloc_stats_update(arena, oldsize, usize);
 		arena->stats.mapped += cdiff;
 	}
-	arena->nactive += (udiff >> LG_PAGE);
+	arena_nactive_add(arena, udiff >> LG_PAGE);
 
 	err = (chunk_alloc_cache(arena, &arena->chunk_hooks, nchunk, cdiff,
 	    chunksize, zero, true) == NULL);
@@ -1058,26 +1064,28 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize,
 		err = true;
 	}
 
-	if (config_stats && !err)
-		stats_cactive_add(udiff);
 	return (err);
 }
 
 /*
  * Do first-best-fit run selection, i.e. select the lowest run that best fits.
- * Run sizes are quantized, so not all candidate runs are necessarily exactly
- * the same size.
+ * Run sizes are indexed, so not all candidate runs are necessarily exactly the
+ * same size.
  */
 static arena_run_t *
 arena_run_first_best_fit(arena_t *arena, size_t size)
 {
-	size_t search_size = run_quantize_first(size);
-	arena_chunk_map_misc_t *key = arena_miscelm_key_create(search_size);
-	arena_chunk_map_misc_t *miscelm =
-	    arena_avail_tree_nsearch(&arena->runs_avail, key);
-	if (miscelm == NULL)
-		return (NULL);
-	return (&miscelm->run);
+	szind_t ind, i;
+
+	ind = size2index(run_quantize_ceil(size));
+	for (i = ind; i < runs_avail_nclasses + runs_avail_bias; i++) {
+		arena_chunk_map_misc_t *miscelm = arena_run_tree_first(
+		    arena_runs_avail_get(arena, i));
+		if (miscelm != NULL)
+			return (&miscelm->run);
+	}
+
+	return (NULL);
 }
 
 static arena_run_t *
@@ -1204,16 +1212,194 @@ arena_lg_dirty_mult_set(arena_t *arena, ssize_t lg_dirty_mult)
 	return (false);
 }
 
-void
-arena_maybe_purge(arena_t *arena)
+static void
+arena_decay_deadline_init(arena_t *arena)
 {
 
+	assert(opt_purge == purge_mode_decay);
+
+	/*
+	 * Generate a new deadline that is uniformly random within the next
+	 * epoch after the current one.
+	 */
+	nstime_copy(&arena->decay_deadline, &arena->decay_epoch);
+	nstime_add(&arena->decay_deadline, &arena->decay_interval);
+	if (arena->decay_time > 0) {
+		nstime_t jitter;
+
+		nstime_init(&jitter, prng_range(&arena->decay_jitter_state,
+		    nstime_ns(&arena->decay_interval)));
+		nstime_add(&arena->decay_deadline, &jitter);
+	}
+}
+
+static bool
+arena_decay_deadline_reached(const arena_t *arena, const nstime_t *time)
+{
+
+	assert(opt_purge == purge_mode_decay);
+
+	return (nstime_compare(&arena->decay_deadline, time) <= 0);
+}
+
+static size_t
+arena_decay_backlog_npages_limit(const arena_t *arena)
+{
+	static const uint64_t h_steps[] = {
+#define	STEP(step, h, x, y) \
+		h,
+		SMOOTHSTEP
+#undef STEP
+	};
+	uint64_t sum;
+	size_t npages_limit_backlog;
+	unsigned i;
+
+	assert(opt_purge == purge_mode_decay);
+
+	/*
+	 * For each element of decay_backlog, multiply by the corresponding
+	 * fixed-point smoothstep decay factor.  Sum the products, then divide
+	 * to round down to the nearest whole number of pages.
+	 */
+	sum = 0;
+	for (i = 0; i < SMOOTHSTEP_NSTEPS; i++)
+		sum += arena->decay_backlog[i] * h_steps[i];
+	npages_limit_backlog = (sum >> SMOOTHSTEP_BFP);
+
+	return (npages_limit_backlog);
+}
+
+static void
+arena_decay_epoch_advance(arena_t *arena, const nstime_t *time)
+{
+	uint64_t nadvance;
+	nstime_t delta;
+	size_t ndirty_delta;
+
+	assert(opt_purge == purge_mode_decay);
+	assert(arena_decay_deadline_reached(arena, time));
+
+	nstime_copy(&delta, time);
+	nstime_subtract(&delta, &arena->decay_epoch);
+	nadvance = nstime_divide(&delta, &arena->decay_interval);
+	assert(nadvance > 0);
+
+	/* Add nadvance decay intervals to epoch. */
+	nstime_copy(&delta, &arena->decay_interval);
+	nstime_imultiply(&delta, nadvance);
+	nstime_add(&arena->decay_epoch, &delta);
+
+	/* Set a new deadline. */
+	arena_decay_deadline_init(arena);
+
+	/* Update the backlog. */
+	if (nadvance >= SMOOTHSTEP_NSTEPS) {
+		memset(arena->decay_backlog, 0, (SMOOTHSTEP_NSTEPS-1) *
+		    sizeof(size_t));
+	} else {
+		memmove(arena->decay_backlog, &arena->decay_backlog[nadvance],
+		    (SMOOTHSTEP_NSTEPS - nadvance) * sizeof(size_t));
+		if (nadvance > 1) {
+			memset(&arena->decay_backlog[SMOOTHSTEP_NSTEPS -
+			    nadvance], 0, (nadvance-1) * sizeof(size_t));
+		}
+	}
+	ndirty_delta = (arena->ndirty > arena->decay_ndirty) ? arena->ndirty -
+	    arena->decay_ndirty : 0;
+	arena->decay_ndirty = arena->ndirty;
+	arena->decay_backlog[SMOOTHSTEP_NSTEPS-1] = ndirty_delta;
+	arena->decay_backlog_npages_limit =
+	    arena_decay_backlog_npages_limit(arena);
+}
+
+static size_t
+arena_decay_npages_limit(arena_t *arena)
+{
+	size_t npages_limit;
+
+	assert(opt_purge == purge_mode_decay);
+
+	npages_limit = arena->decay_backlog_npages_limit;
+
+	/* Add in any dirty pages created during the current epoch. */
+	if (arena->ndirty > arena->decay_ndirty)
+		npages_limit += arena->ndirty - arena->decay_ndirty;
+
+	return (npages_limit);
+}
+
+static void
+arena_decay_init(arena_t *arena, ssize_t decay_time)
+{
+
+	arena->decay_time = decay_time;
+	if (decay_time > 0) {
+		nstime_init2(&arena->decay_interval, decay_time, 0);
+		nstime_idivide(&arena->decay_interval, SMOOTHSTEP_NSTEPS);
+	}
+
+	nstime_init(&arena->decay_epoch, 0);
+	nstime_update(&arena->decay_epoch);
+	arena->decay_jitter_state = (uint64_t)(uintptr_t)arena;
+	arena_decay_deadline_init(arena);
+	arena->decay_ndirty = arena->ndirty;
+	arena->decay_backlog_npages_limit = 0;
+	memset(arena->decay_backlog, 0, SMOOTHSTEP_NSTEPS * sizeof(size_t));
+}
+
+static bool
+arena_decay_time_valid(ssize_t decay_time)
+{
+
+	return (decay_time >= -1 && decay_time <= NSTIME_SEC_MAX);
+}
+
+ssize_t
+arena_decay_time_get(arena_t *arena)
+{
+	ssize_t decay_time;
+
+	malloc_mutex_lock(&arena->lock);
+	decay_time = arena->decay_time;
+	malloc_mutex_unlock(&arena->lock);
+
+	return (decay_time);
+}
+
+bool
+arena_decay_time_set(arena_t *arena, ssize_t decay_time)
+{
+
+	if (!arena_decay_time_valid(decay_time))
+		return (true);
+
+	malloc_mutex_lock(&arena->lock);
+	/*
+	 * Restart decay backlog from scratch, which may cause many dirty pages
+	 * to be immediately purged.  It would conceptually be possible to map
+	 * the old backlog onto the new backlog, but there is no justification
+	 * for such complexity since decay_time changes are intended to be
+	 * infrequent, either between the {-1, 0, >0} states, or a one-time
+	 * arbitrary change during initial arena configuration.
+	 */
+	arena_decay_init(arena, decay_time);
+	arena_maybe_purge(arena);
+	malloc_mutex_unlock(&arena->lock);
+
+	return (false);
+}
+
+static void
+arena_maybe_purge_ratio(arena_t *arena)
+{
+
+	assert(opt_purge == purge_mode_ratio);
+
 	/* Don't purge if the option is disabled. */
 	if (arena->lg_dirty_mult < 0)
 		return;
-	/* Don't recursively purge. */
-	if (arena->purging)
-		return;
+
 	/*
 	 * Iterate, since preventing recursive purging could otherwise leave too
 	 * many dirty pages.
@@ -1228,8 +1414,57 @@ arena_maybe_purge(arena_t *arena)
 		 */
 		if (arena->ndirty <= threshold)
 			return;
-		arena_purge(arena, false);
+		arena_purge_to_limit(arena, threshold);
+	}
+}
+
+static void
+arena_maybe_purge_decay(arena_t *arena)
+{
+	nstime_t time;
+	size_t ndirty_limit;
+
+	assert(opt_purge == purge_mode_decay);
+
+	/* Purge all or nothing if the option is disabled. */
+	if (arena->decay_time <= 0) {
+		if (arena->decay_time == 0)
+			arena_purge_to_limit(arena, 0);
+		return;
 	}
+
+	nstime_copy(&time, &arena->decay_epoch);
+	if (unlikely(nstime_update(&time))) {
+		/* Time went backwards.  Force an epoch advance. */
+		nstime_copy(&time, &arena->decay_deadline);
+	}
+
+	if (arena_decay_deadline_reached(arena, &time))
+		arena_decay_epoch_advance(arena, &time);
+
+	ndirty_limit = arena_decay_npages_limit(arena);
+
+	/*
+	 * Don't try to purge unless the number of purgeable pages exceeds the
+	 * current limit.
+	 */
+	if (arena->ndirty <= ndirty_limit)
+		return;
+	arena_purge_to_limit(arena, ndirty_limit);
+}
+
+void
+arena_maybe_purge(arena_t *arena)
+{
+
+	/* Don't recursively purge. */
+	if (arena->purging)
+		return;
+
+	if (opt_purge == purge_mode_ratio)
+		arena_maybe_purge_ratio(arena);
+	else
+		arena_maybe_purge_decay(arena);
 }
 
 static size_t
@@ -1267,35 +1502,15 @@ arena_dirty_count(arena_t *arena)
 }
 
 static size_t
-arena_compute_npurge(arena_t *arena, bool all)
-{
-	size_t npurge;
-
-	/*
-	 * Compute the minimum number of pages that this thread should try to
-	 * purge.
-	 */
-	if (!all) {
-		size_t threshold = (arena->nactive >> arena->lg_dirty_mult);
-		threshold = threshold < chunk_npages ? chunk_npages : threshold;
-
-		npurge = arena->ndirty - threshold;
-	} else
-		npurge = arena->ndirty;
-
-	return (npurge);
-}
-
-static size_t
-arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks, bool all,
-    size_t npurge, arena_runs_dirty_link_t *purge_runs_sentinel,
+arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    size_t ndirty_limit, arena_runs_dirty_link_t *purge_runs_sentinel,
     extent_node_t *purge_chunks_sentinel)
 {
 	arena_runs_dirty_link_t *rdelm, *rdelm_next;
 	extent_node_t *chunkselm;
 	size_t nstashed = 0;
 
-	/* Stash at least npurge pages. */
+	/* Stash runs/chunks according to ndirty_limit. */
 	for (rdelm = qr_next(&arena->runs_dirty, rd_link),
 	    chunkselm = qr_next(&arena->chunks_cache, cc_link);
 	    rdelm != &arena->runs_dirty; rdelm = rdelm_next) {
@@ -1307,6 +1522,11 @@ arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks, bool all,
 			bool zero;
 			UNUSED void *chunk;
 
+			npages = extent_node_size_get(chunkselm) >> LG_PAGE;
+			if (opt_purge == purge_mode_decay && arena->ndirty -
+			    (nstashed + npages) < ndirty_limit)
+				break;
+
 			chunkselm_next = qr_next(chunkselm, cc_link);
 			/*
 			 * Allocate.  chunkselm remains valid due to the
@@ -1321,7 +1541,8 @@ arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks, bool all,
 			assert(zero == extent_node_zeroed_get(chunkselm));
 			extent_node_dirty_insert(chunkselm, purge_runs_sentinel,
 			    purge_chunks_sentinel);
-			npages = extent_node_size_get(chunkselm) >> LG_PAGE;
+			assert(npages == (extent_node_size_get(chunkselm) >>
+			    LG_PAGE));
 			chunkselm = chunkselm_next;
 		} else {
 			arena_chunk_t *chunk =
@@ -1334,6 +1555,9 @@ arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks, bool all,
 			    arena_mapbits_unallocated_size_get(chunk, pageind);
 
 			npages = run_size >> LG_PAGE;
+			if (opt_purge == purge_mode_decay && arena->ndirty -
+			    (nstashed + npages) < ndirty_limit)
+				break;
 
 			assert(pageind + npages <= chunk_npages);
 			assert(arena_mapbits_dirty_get(chunk, pageind) ==
@@ -1359,7 +1583,8 @@ arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks, bool all,
 		}
 
 		nstashed += npages;
-		if (!all && nstashed >= npurge)
+		if (opt_purge == purge_mode_ratio && arena->ndirty - nstashed <=
+		    ndirty_limit)
 			break;
 	}
 
@@ -1499,11 +1724,20 @@ arena_unstash_purged(arena_t *arena, chunk_hooks_t *chunk_hooks,
 	}
 }
 
+/*
+ * NB: ndirty_limit is interpreted differently depending on opt_purge:
+ *   - purge_mode_ratio: Purge as few dirty run/chunks as possible to reach the
+ *                       desired state:
+ *                       (arena->ndirty <= ndirty_limit)
+ *   - purge_mode_decay: Purge as many dirty runs/chunks as possible without
+ *                       violating the invariant:
+ *                       (arena->ndirty >= ndirty_limit)
+ */
 static void
-arena_purge(arena_t *arena, bool all)
+arena_purge_to_limit(arena_t *arena, size_t ndirty_limit)
 {
 	chunk_hooks_t chunk_hooks = chunk_hooks_get(arena);
-	size_t npurge, npurgeable, npurged;
+	size_t npurge, npurged;
 	arena_runs_dirty_link_t purge_runs_sentinel;
 	extent_node_t purge_chunks_sentinel;
 
@@ -1517,33 +1751,38 @@ arena_purge(arena_t *arena, bool all)
 		size_t ndirty = arena_dirty_count(arena);
 		assert(ndirty == arena->ndirty);
 	}
-	assert((arena->nactive >> arena->lg_dirty_mult) < arena->ndirty || all);
+	assert(opt_purge != purge_mode_ratio || (arena->nactive >>
+	    arena->lg_dirty_mult) < arena->ndirty || ndirty_limit == 0);
 
-	if (config_stats)
-		arena->stats.npurge++;
-
-	npurge = arena_compute_npurge(arena, all);
 	qr_new(&purge_runs_sentinel, rd_link);
 	extent_node_dirty_linkage_init(&purge_chunks_sentinel);
 
-	npurgeable = arena_stash_dirty(arena, &chunk_hooks, all, npurge,
+	npurge = arena_stash_dirty(arena, &chunk_hooks, ndirty_limit,
 	    &purge_runs_sentinel, &purge_chunks_sentinel);
-	assert(npurgeable >= npurge);
+	if (npurge == 0)
+		goto label_return;
 	npurged = arena_purge_stashed(arena, &chunk_hooks, &purge_runs_sentinel,
 	    &purge_chunks_sentinel);
-	assert(npurged == npurgeable);
+	assert(npurged == npurge);
 	arena_unstash_purged(arena, &chunk_hooks, &purge_runs_sentinel,
 	    &purge_chunks_sentinel);
 
+	if (config_stats)
+		arena->stats.npurge++;
+
+label_return:
 	arena->purging = false;
 }
 
 void
-arena_purge_all(arena_t *arena)
+arena_purge(arena_t *arena, bool all)
 {
 
 	malloc_mutex_lock(&arena->lock);
-	arena_purge(arena, true);
+	if (all)
+		arena_purge_to_limit(arena, 0);
+	else
+		arena_maybe_purge(arena);
 	malloc_mutex_unlock(&arena->lock);
 }
 
@@ -1660,18 +1899,6 @@ arena_run_size_get(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
 	return (size);
 }
 
-static bool
-arena_run_decommit(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run)
-{
-	arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run);
-	size_t run_ind = arena_miscelm_to_pageind(miscelm);
-	size_t offset = run_ind << LG_PAGE;
-	size_t length = arena_run_size_get(arena, chunk, run, run_ind);
-
-	return (arena->chunk_hooks.decommit(chunk, chunksize, offset, length,
-	    arena->ind));
-}
-
 static void
 arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned,
     bool decommitted)
@@ -1687,8 +1914,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned,
 	assert(run_ind < chunk_npages);
 	size = arena_run_size_get(arena, chunk, run, run_ind);
 	run_pages = (size >> LG_PAGE);
-	arena_cactive_update(arena, 0, run_pages);
-	arena->nactive -= run_pages;
+	arena_nactive_sub(arena, run_pages);
 
 	/*
 	 * The run is dirty if the caller claims to have dirtied it, as well as
@@ -1750,15 +1976,6 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned,
 }
 
 static void
-arena_run_dalloc_decommit(arena_t *arena, arena_chunk_t *chunk,
-    arena_run_t *run)
-{
-	bool committed = arena_run_decommit(arena, chunk, run);
-
-	arena_run_dalloc(arena, run, committed, false, !committed);
-}
-
-static void
 arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
     size_t oldsize, size_t newsize)
 {
@@ -1986,8 +2203,8 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin)
 }
 
 void
-arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, szind_t binind,
-    uint64_t prof_accumbytes)
+arena_tcache_fill_small(tsd_t *tsd, arena_t *arena, tcache_bin_t *tbin,
+    szind_t binind, uint64_t prof_accumbytes)
 {
 	unsigned i, nfill;
 	arena_bin_t *bin;
@@ -2010,11 +2227,10 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, szind_t binind,
 			/*
 			 * OOM.  tbin->avail isn't yet filled down to its first
 			 * element, so the successful allocations (if any) must
-			 * be moved to the base of tbin->avail before bailing
-			 * out.
+			 * be moved just before tbin->avail before bailing out.
 			 */
 			if (i > 0) {
-				memmove(tbin->avail, &tbin->avail[nfill - i],
+				memmove(tbin->avail - i, tbin->avail - nfill,
 				    i * sizeof(void *));
 			}
 			break;
@@ -2024,7 +2240,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, szind_t binind,
 			    true);
 		}
 		/* Insert such that low regions get used first. */
-		tbin->avail[nfill - 1 - i] = ptr;
+		*(tbin->avail - nfill + i) = ptr;
 	}
 	if (config_stats) {
 		bin->stats.nmalloc += i;
@@ -2035,6 +2251,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, szind_t binind,
 	}
 	malloc_mutex_unlock(&bin->lock);
 	tbin->ncached = i;
+	arena_decay_tick(tsd, arena);
 }
 
 void
@@ -2144,18 +2361,17 @@ arena_quarantine_junk_small(void *ptr, size_t usize)
 	arena_redzones_validate(ptr, bin_info, true);
 }
 
-void *
-arena_malloc_small(arena_t *arena, size_t size, bool zero)
+static void *
+arena_malloc_small(tsd_t *tsd, arena_t *arena, szind_t binind, bool zero)
 {
 	void *ret;
 	arena_bin_t *bin;
+	size_t usize;
 	arena_run_t *run;
-	szind_t binind;
 
-	binind = size2index(size);
 	assert(binind < NBINS);
 	bin = &arena->bins[binind];
-	size = index2size(binind);
+	usize = index2size(binind);
 
 	malloc_mutex_lock(&bin->lock);
 	if ((run = bin->runcur) != NULL && run->nfree > 0)
@@ -2174,7 +2390,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero)
 		bin->stats.curregs++;
 	}
 	malloc_mutex_unlock(&bin->lock);
-	if (config_prof && !isthreaded && arena_prof_accum(arena, size))
+	if (config_prof && !isthreaded && arena_prof_accum(arena, usize))
 		prof_idump();
 
 	if (!zero) {
@@ -2183,23 +2399,24 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero)
 				arena_alloc_junk_small(ret,
 				    &arena_bin_info[binind], false);
 			} else if (unlikely(opt_zero))
-				memset(ret, 0, size);
+				memset(ret, 0, usize);
 		}
-		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
+		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, usize);
 	} else {
 		if (config_fill && unlikely(opt_junk_alloc)) {
 			arena_alloc_junk_small(ret, &arena_bin_info[binind],
 			    true);
 		}
-		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
-		memset(ret, 0, size);
+		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, usize);
+		memset(ret, 0, usize);
 	}
 
+	arena_decay_tick(tsd, arena);
 	return (ret);
 }
 
 void *
-arena_malloc_large(arena_t *arena, size_t size, bool zero)
+arena_malloc_large(tsd_t *tsd, arena_t *arena, szind_t binind, bool zero)
 {
 	void *ret;
 	size_t usize;
@@ -2209,7 +2426,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 	UNUSED bool idump;
 
 	/* Large allocation. */
-	usize = s2u(size);
+	usize = index2size(binind);
 	malloc_mutex_lock(&arena->lock);
 	if (config_cache_oblivious) {
 		uint64_t r;
@@ -2219,9 +2436,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 		 * that is a multiple of the cacheline size, e.g. [0 .. 63) * 64
 		 * for 4 KiB pages and 64-byte cachelines.
 		 */
-		prng64(r, LG_PAGE - LG_CACHELINE, arena->offset_state,
-		    UINT64_C(6364136223846793009),
-		    UINT64_C(1442695040888963409));
+		r = prng_lg_range(&arena->offset_state, LG_PAGE - LG_CACHELINE);
 		random_offset = ((uintptr_t)r) << LG_CACHELINE;
 	} else
 		random_offset = 0;
@@ -2234,7 +2449,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 	ret = (void *)((uintptr_t)arena_miscelm_to_rpages(miscelm) +
 	    random_offset);
 	if (config_stats) {
-		szind_t index = size2index(usize) - NBINS;
+		szind_t index = binind - NBINS;
 
 		arena->stats.nmalloc_large++;
 		arena->stats.nrequests_large++;
@@ -2258,9 +2473,26 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 		}
 	}
 
+	arena_decay_tick(tsd, arena);
 	return (ret);
 }
 
+void *
+arena_malloc_hard(tsd_t *tsd, arena_t *arena, size_t size, szind_t ind,
+    bool zero, tcache_t *tcache)
+{
+
+	arena = arena_choose(tsd, arena);
+	if (unlikely(arena == NULL))
+		return (NULL);
+
+	if (likely(size <= SMALL_MAXCLASS))
+		return (arena_malloc_small(tsd, arena, ind, zero));
+	if (likely(size <= large_maxclass))
+		return (arena_malloc_large(tsd, arena, ind, zero));
+	return (huge_malloc(tsd, arena, index2size(ind), zero, tcache));
+}
+
 /* Only handles large allocations that require more than page alignment. */
 static void *
 arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
@@ -2344,6 +2576,7 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
 		else if (unlikely(opt_zero))
 			memset(ret, 0, usize);
 	}
+	arena_decay_tick(tsd, arena);
 	return (ret);
 }
 
@@ -2356,7 +2589,8 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
 	if (usize <= SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE
 	    && (usize & PAGE_MASK) == 0))) {
 		/* Small; alignment doesn't require special run placement. */
-		ret = arena_malloc(tsd, arena, usize, zero, tcache);
+		ret = arena_malloc(tsd, arena, usize, size2index(usize), zero,
+		    tcache, true);
 	} else if (usize <= large_maxclass && alignment <= PAGE) {
 		/*
 		 * Large; alignment doesn't require special run placement.
@@ -2364,7 +2598,8 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
 		 * the base of the run, so do some bit manipulation to retrieve
 		 * the base.
 		 */
-		ret = arena_malloc(tsd, arena, usize, zero, tcache);
+		ret = arena_malloc(tsd, arena, usize, size2index(usize), zero,
+		    tcache, true);
 		if (config_cache_oblivious)
 			ret = (void *)((uintptr_t)ret & ~PAGE_MASK);
 	} else {
@@ -2441,7 +2676,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
 	malloc_mutex_unlock(&bin->lock);
 	/******************************/
 	malloc_mutex_lock(&arena->lock);
-	arena_run_dalloc_decommit(arena, chunk, run);
+	arena_run_dalloc(arena, run, true, false, false);
 	malloc_mutex_unlock(&arena->lock);
 	/****************************/
 	malloc_mutex_lock(&bin->lock);
@@ -2528,7 +2763,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
 }
 
 void
-arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
+arena_dalloc_small(tsd_t *tsd, arena_t *arena, arena_chunk_t *chunk, void *ptr,
     size_t pageind)
 {
 	arena_chunk_map_bits_t *bitselm;
@@ -2540,6 +2775,7 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
 	}
 	bitselm = arena_bitselm_get(chunk, pageind);
 	arena_dalloc_bin(arena, chunk, ptr, pageind, bitselm);
+	arena_decay_tick(tsd, arena);
 }
 
 #ifdef JEMALLOC_JET
@@ -2584,7 +2820,7 @@ arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk,
 		}
 	}
 
-	arena_run_dalloc_decommit(arena, chunk, run);
+	arena_run_dalloc(arena, run, true, false, false);
 }
 
 void
@@ -2596,12 +2832,13 @@ arena_dalloc_large_junked_locked(arena_t *arena, arena_chunk_t *chunk,
 }
 
 void
-arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr)
+arena_dalloc_large(tsd_t *tsd, arena_t *arena, arena_chunk_t *chunk, void *ptr)
 {
 
 	malloc_mutex_lock(&arena->lock);
 	arena_dalloc_large_locked_impl(arena, chunk, ptr, false);
 	malloc_mutex_unlock(&arena->lock);
+	arena_decay_tick(tsd, arena);
 }
 
 static void
@@ -2802,14 +3039,22 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t usize_min,
 }
 
 bool
-arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra,
-    bool zero)
+arena_ralloc_no_move(tsd_t *tsd, void *ptr, size_t oldsize, size_t size,
+    size_t extra, bool zero)
 {
 	size_t usize_min, usize_max;
 
+	/* Calls with non-zero extra had to clamp extra. */
+	assert(extra == 0 || size + extra <= HUGE_MAXCLASS);
+
+	if (unlikely(size > HUGE_MAXCLASS))
+		return (true);
+
 	usize_min = s2u(size);
 	usize_max = s2u(size + extra);
 	if (likely(oldsize <= large_maxclass && usize_min <= large_maxclass)) {
+		arena_chunk_t *chunk;
+
 		/*
 		 * Avoid moving the allocation if the size class can be left the
 		 * same.
@@ -2817,23 +3062,24 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra,
 		if (oldsize <= SMALL_MAXCLASS) {
 			assert(arena_bin_info[size2index(oldsize)].reg_size ==
 			    oldsize);
-			if ((usize_max <= SMALL_MAXCLASS &&
-			    size2index(usize_max) == size2index(oldsize)) ||
-			    (size <= oldsize && usize_max >= oldsize))
-				return (false);
+			if ((usize_max > SMALL_MAXCLASS ||
+			    size2index(usize_max) != size2index(oldsize)) &&
+			    (size > oldsize || usize_max < oldsize))
+				return (true);
 		} else {
-			if (usize_max > SMALL_MAXCLASS) {
-				if (!arena_ralloc_large(ptr, oldsize, usize_min,
-				    usize_max, zero))
-					return (false);
-			}
+			if (usize_max <= SMALL_MAXCLASS)
+				return (true);
+			if (arena_ralloc_large(ptr, oldsize, usize_min,
+			    usize_max, zero))
+				return (true);
 		}
 
-		/* Reallocation would require a move. */
-		return (true);
+		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+		arena_decay_tick(tsd, extent_node_arena_get(&chunk->node));
+		return (false);
 	} else {
-		return (huge_ralloc_no_move(ptr, oldsize, usize_min, usize_max,
-		    zero));
+		return (huge_ralloc_no_move(tsd, ptr, oldsize, usize_min,
+		    usize_max, zero));
 	}
 }
 
@@ -2843,9 +3089,10 @@ arena_ralloc_move_helper(tsd_t *tsd, arena_t *arena, size_t usize,
 {
 
 	if (alignment == 0)
-		return (arena_malloc(tsd, arena, usize, zero, tcache));
+		return (arena_malloc(tsd, arena, usize, size2index(usize), zero,
+		    tcache, true));
 	usize = sa2u(usize, alignment);
-	if (usize == 0)
+	if (unlikely(usize == 0 || usize > HUGE_MAXCLASS))
 		return (NULL);
 	return (ipalloct(tsd, usize, alignment, zero, tcache, arena));
 }
@@ -2858,14 +3105,14 @@ arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size,
 	size_t usize;
 
 	usize = s2u(size);
-	if (usize == 0)
+	if (unlikely(usize == 0 || size > HUGE_MAXCLASS))
 		return (NULL);
 
 	if (likely(usize <= large_maxclass)) {
 		size_t copysize;
 
 		/* Try to avoid moving the allocation. */
-		if (!arena_ralloc_no_move(ptr, oldsize, usize, 0, zero))
+		if (!arena_ralloc_no_move(tsd, ptr, oldsize, usize, 0, zero))
 			return (ptr);
 
 		/*
@@ -2928,25 +3175,72 @@ bool
 arena_lg_dirty_mult_default_set(ssize_t lg_dirty_mult)
 {
 
+	if (opt_purge != purge_mode_ratio)
+		return (true);
 	if (!arena_lg_dirty_mult_valid(lg_dirty_mult))
 		return (true);
 	atomic_write_z((size_t *)&lg_dirty_mult_default, (size_t)lg_dirty_mult);
 	return (false);
 }
 
-void
-arena_stats_merge(arena_t *arena, const char **dss, ssize_t *lg_dirty_mult,
-    size_t *nactive, size_t *ndirty, arena_stats_t *astats,
-    malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats,
-    malloc_huge_stats_t *hstats)
+ssize_t
+arena_decay_time_default_get(void)
 {
-	unsigned i;
 
-	malloc_mutex_lock(&arena->lock);
+	return ((ssize_t)atomic_read_z((size_t *)&decay_time_default));
+}
+
+bool
+arena_decay_time_default_set(ssize_t decay_time)
+{
+
+	if (opt_purge != purge_mode_decay)
+		return (true);
+	if (!arena_decay_time_valid(decay_time))
+		return (true);
+	atomic_write_z((size_t *)&decay_time_default, (size_t)decay_time);
+	return (false);
+}
+
+static void
+arena_basic_stats_merge_locked(arena_t *arena, unsigned *nthreads,
+    const char **dss, ssize_t *lg_dirty_mult, ssize_t *decay_time,
+    size_t *nactive, size_t *ndirty)
+{
+
+	*nthreads += arena_nthreads_get(arena);
 	*dss = dss_prec_names[arena->dss_prec];
 	*lg_dirty_mult = arena->lg_dirty_mult;
+	*decay_time = arena->decay_time;
 	*nactive += arena->nactive;
 	*ndirty += arena->ndirty;
+}
+
+void
+arena_basic_stats_merge(arena_t *arena, unsigned *nthreads, const char **dss,
+    ssize_t *lg_dirty_mult, ssize_t *decay_time, size_t *nactive,
+    size_t *ndirty)
+{
+
+	malloc_mutex_lock(&arena->lock);
+	arena_basic_stats_merge_locked(arena, nthreads, dss, lg_dirty_mult,
+	    decay_time, nactive, ndirty);
+	malloc_mutex_unlock(&arena->lock);
+}
+
+void
+arena_stats_merge(arena_t *arena, unsigned *nthreads, const char **dss,
+    ssize_t *lg_dirty_mult, ssize_t *decay_time, size_t *nactive,
+    size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats,
+    malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats)
+{
+	unsigned i;
+
+	cassert(config_stats);
+
+	malloc_mutex_lock(&arena->lock);
+	arena_basic_stats_merge_locked(arena, nthreads, dss, lg_dirty_mult,
+	    decay_time, nactive, ndirty);
 
 	astats->mapped += arena->stats.mapped;
 	astats->npurge += arena->stats.npurge;
@@ -2995,23 +3289,48 @@ arena_stats_merge(arena_t *arena, const char **dss, ssize_t *lg_dirty_mult,
 	}
 }
 
+unsigned
+arena_nthreads_get(arena_t *arena)
+{
+
+	return (atomic_read_u(&arena->nthreads));
+}
+
+void
+arena_nthreads_inc(arena_t *arena)
+{
+
+	atomic_add_u(&arena->nthreads, 1);
+}
+
+void
+arena_nthreads_dec(arena_t *arena)
+{
+
+	atomic_sub_u(&arena->nthreads, 1);
+}
+
 arena_t *
 arena_new(unsigned ind)
 {
 	arena_t *arena;
+	size_t arena_size;
 	unsigned i;
 	arena_bin_t *bin;
 
+	/* Compute arena size to incorporate sufficient runs_avail elements. */
+	arena_size = offsetof(arena_t, runs_avail) + (sizeof(arena_run_tree_t) *
+	    runs_avail_nclasses);
 	/*
 	 * Allocate arena, arena->lstats, and arena->hstats contiguously, mainly
 	 * because there is no way to clean up if base_alloc() OOMs.
 	 */
 	if (config_stats) {
-		arena = (arena_t *)base_alloc(CACHELINE_CEILING(sizeof(arena_t))
-		    + QUANTUM_CEILING(nlclasses * sizeof(malloc_large_stats_t) +
+		arena = (arena_t *)base_alloc(CACHELINE_CEILING(arena_size) +
+		    QUANTUM_CEILING(nlclasses * sizeof(malloc_large_stats_t) +
 		    nhclasses) * sizeof(malloc_huge_stats_t));
 	} else
-		arena = (arena_t *)base_alloc(sizeof(arena_t));
+		arena = (arena_t *)base_alloc(arena_size);
 	if (arena == NULL)
 		return (NULL);
 
@@ -3023,11 +3342,11 @@ arena_new(unsigned ind)
 	if (config_stats) {
 		memset(&arena->stats, 0, sizeof(arena_stats_t));
 		arena->stats.lstats = (malloc_large_stats_t *)((uintptr_t)arena
-		    + CACHELINE_CEILING(sizeof(arena_t)));
+		    + CACHELINE_CEILING(arena_size));
 		memset(arena->stats.lstats, 0, nlclasses *
 		    sizeof(malloc_large_stats_t));
 		arena->stats.hstats = (malloc_huge_stats_t *)((uintptr_t)arena
-		    + CACHELINE_CEILING(sizeof(arena_t)) +
+		    + CACHELINE_CEILING(arena_size) +
 		    QUANTUM_CEILING(nlclasses * sizeof(malloc_large_stats_t)));
 		memset(arena->stats.hstats, 0, nhclasses *
 		    sizeof(malloc_huge_stats_t));
@@ -3059,10 +3378,14 @@ arena_new(unsigned ind)
 	arena->nactive = 0;
 	arena->ndirty = 0;
 
-	arena_avail_tree_new(&arena->runs_avail);
+	for(i = 0; i < runs_avail_nclasses; i++)
+		arena_run_tree_new(&arena->runs_avail[i]);
 	qr_new(&arena->runs_dirty, rd_link);
 	qr_new(&arena->chunks_cache, cc_link);
 
+	if (opt_purge == purge_mode_decay)
+		arena_decay_init(arena, arena_decay_time_default_get());
+
 	ql_new(&arena->huge);
 	if (malloc_mutex_init(&arena->huge_mtx))
 		return (NULL);
@@ -3117,8 +3440,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info)
 	 * be twice as large in order to maintain alignment.
 	 */
 	if (config_fill && unlikely(opt_redzone)) {
-		size_t align_min = ZU(1) << (jemalloc_ffs(bin_info->reg_size) -
-		    1);
+		size_t align_min = ZU(1) << (ffs_zu(bin_info->reg_size) - 1);
 		if (align_min <= REDZONE_MINSIZE) {
 			bin_info->redzone_size = REDZONE_MINSIZE;
 			pad_size = 0;
@@ -3138,18 +3460,19 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info)
 	 * size).
 	 */
 	try_run_size = PAGE;
-	try_nregs = try_run_size / bin_info->reg_size;
+	try_nregs = (uint32_t)(try_run_size / bin_info->reg_size);
 	do {
 		perfect_run_size = try_run_size;
 		perfect_nregs = try_nregs;
 
 		try_run_size += PAGE;
-		try_nregs = try_run_size / bin_info->reg_size;
+		try_nregs = (uint32_t)(try_run_size / bin_info->reg_size);
 	} while (perfect_run_size != perfect_nregs * bin_info->reg_size);
 	assert(perfect_nregs <= RUN_MAXREGS);
 
 	actual_run_size = perfect_run_size;
-	actual_nregs = (actual_run_size - pad_size) / bin_info->reg_interval;
+	actual_nregs = (uint32_t)((actual_run_size - pad_size) /
+	    bin_info->reg_interval);
 
 	/*
 	 * Redzones can require enough padding that not even a single region can
@@ -3161,8 +3484,8 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info)
 		assert(config_fill && unlikely(opt_redzone));
 
 		actual_run_size += PAGE;
-		actual_nregs = (actual_run_size - pad_size) /
-		    bin_info->reg_interval;
+		actual_nregs = (uint32_t)((actual_run_size - pad_size) /
+		    bin_info->reg_interval);
 	}
 
 	/*
@@ -3170,8 +3493,8 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info)
 	 */
 	while (actual_run_size > arena_maxrun) {
 		actual_run_size -= PAGE;
-		actual_nregs = (actual_run_size - pad_size) /
-		    bin_info->reg_interval;
+		actual_nregs = (uint32_t)((actual_run_size - pad_size) /
+		    bin_info->reg_interval);
 	}
 	assert(actual_nregs > 0);
 	assert(actual_run_size == s2u(actual_run_size));
@@ -3179,8 +3502,8 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info)
 	/* Copy final settings. */
 	bin_info->run_size = actual_run_size;
 	bin_info->nregs = actual_nregs;
-	bin_info->reg0_offset = actual_run_size - (actual_nregs *
-	    bin_info->reg_interval) - pad_size + bin_info->redzone_size;
+	bin_info->reg0_offset = (uint32_t)(actual_run_size - (actual_nregs *
+	    bin_info->reg_interval) - pad_size + bin_info->redzone_size);
 
 	if (actual_run_size > small_maxrun)
 		small_maxrun = actual_run_size;
@@ -3234,12 +3557,42 @@ small_run_size_init(void)
 	return (false);
 }
 
+static bool
+run_quantize_init(void)
+{
+	unsigned i;
+
+	run_quantize_max = chunksize + large_pad;
+
+	run_quantize_floor_tab = (size_t *)base_alloc(sizeof(size_t) *
+	    (run_quantize_max >> LG_PAGE));
+	if (run_quantize_floor_tab == NULL)
+		return (true);
+
+	run_quantize_ceil_tab = (size_t *)base_alloc(sizeof(size_t) *
+	    (run_quantize_max >> LG_PAGE));
+	if (run_quantize_ceil_tab == NULL)
+		return (true);
+
+	for (i = 1; i <= run_quantize_max >> LG_PAGE; i++) {
+		size_t run_size = i << LG_PAGE;
+
+		run_quantize_floor_tab[i-1] =
+		    run_quantize_floor_compute(run_size);
+		run_quantize_ceil_tab[i-1] =
+		    run_quantize_ceil_compute(run_size);
+	}
+
+	return (false);
+}
+
 bool
 arena_boot(void)
 {
 	unsigned i;
 
 	arena_lg_dirty_mult_default_set(opt_lg_dirty_mult);
+	arena_decay_time_default_set(opt_decay_time);
 
 	/*
 	 * Compute the header size such that it is large enough to contain the
@@ -3281,7 +3634,15 @@ arena_boot(void)
 	nhclasses = NSIZES - nlclasses - NBINS;
 
 	bin_info_init();
-	return (small_run_size_init());
+	if (small_run_size_init())
+		return (true);
+	if (run_quantize_init())
+		return (true);
+
+	runs_avail_bias = size2index(PAGE);
+	runs_avail_nclasses = size2index(run_quantize_max)+1 - runs_avail_bias;
+
+	return (false);
 }
 
 void
diff --git a/contrib/jemalloc/src/bitmap.c b/contrib/jemalloc/src/bitmap.c
index c733372b4cb2..b1e66271e678 100644
--- a/contrib/jemalloc/src/bitmap.c
+++ b/contrib/jemalloc/src/bitmap.c
@@ -3,6 +3,8 @@
 
 /******************************************************************************/
 
+#ifdef USE_TREE
+
 void
 bitmap_info_init(bitmap_info_t *binfo, size_t nbits)
 {
@@ -32,20 +34,11 @@ bitmap_info_init(bitmap_info_t *binfo, size_t nbits)
 	binfo->nbits = nbits;
 }
 
-size_t
+static size_t
 bitmap_info_ngroups(const bitmap_info_t *binfo)
 {
 
-	return (binfo->levels[binfo->nlevels].group_offset << LG_SIZEOF_BITMAP);
-}
-
-size_t
-bitmap_size(size_t nbits)
-{
-	bitmap_info_t binfo;
-
-	bitmap_info_init(&binfo, nbits);
-	return (bitmap_info_ngroups(&binfo));
+	return (binfo->levels[binfo->nlevels].group_offset);
 }
 
 void
@@ -61,8 +54,7 @@ bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo)
 	 * correspond to the first logical bit in the group, so extra bits
 	 * are the most significant bits of the last group.
 	 */
-	memset(bitmap, 0xffU, binfo->levels[binfo->nlevels].group_offset <<
-	    LG_SIZEOF_BITMAP);
+	memset(bitmap, 0xffU, bitmap_size(binfo));
 	extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK))
 	    & BITMAP_GROUP_NBITS_MASK;
 	if (extra != 0)
@@ -76,3 +68,47 @@ bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo)
 			bitmap[binfo->levels[i+1].group_offset - 1] >>= extra;
 	}
 }
+
+#else /* USE_TREE */
+
+void
+bitmap_info_init(bitmap_info_t *binfo, size_t nbits)
+{
+	size_t i;
+
+	assert(nbits > 0);
+	assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS));
+
+	i = nbits >> LG_BITMAP_GROUP_NBITS;
+	if (nbits % BITMAP_GROUP_NBITS != 0)
+		i++;
+	binfo->ngroups = i;
+	binfo->nbits = nbits;
+}
+
+static size_t
+bitmap_info_ngroups(const bitmap_info_t *binfo)
+{
+
+	return (binfo->ngroups);
+}
+
+void
+bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo)
+{
+	size_t extra;
+
+	memset(bitmap, 0xffU, bitmap_size(binfo));
+	extra = (binfo->nbits % (binfo->ngroups * BITMAP_GROUP_NBITS));
+	if (extra != 0)
+		bitmap[binfo->ngroups - 1] >>= (BITMAP_GROUP_NBITS - extra);
+}
+
+#endif /* USE_TREE */
+
+size_t
+bitmap_size(const bitmap_info_t *binfo)
+{
+
+	return (bitmap_info_ngroups(binfo) << LG_SIZEOF_BITMAP);
+}
diff --git a/contrib/jemalloc/src/chunk.c b/contrib/jemalloc/src/chunk.c
index 6ba1ca7a51ba..b179d2135572 100644
--- a/contrib/jemalloc/src/chunk.c
+++ b/contrib/jemalloc/src/chunk.c
@@ -332,30 +332,20 @@ chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment,
     bool *zero, bool *commit, dss_prec_t dss_prec)
 {
 	void *ret;
-	chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER;
 
 	assert(size != 0);
 	assert((size & chunksize_mask) == 0);
 	assert(alignment != 0);
 	assert((alignment & chunksize_mask) == 0);
 
-	/* Retained. */
-	if ((ret = chunk_recycle(arena, &chunk_hooks,
-	    &arena->chunks_szad_retained, &arena->chunks_ad_retained, false,
-	    new_addr, size, alignment, zero, commit, true)) != NULL)
-		return (ret);
-
 	/* "primary" dss. */
 	if (have_dss && dss_prec == dss_prec_primary && (ret =
 	    chunk_alloc_dss(arena, new_addr, size, alignment, zero, commit)) !=
 	    NULL)
 		return (ret);
-	/*
-	 * mmap.  Requesting an address is not implemented for
-	 * chunk_alloc_mmap(), so only call it if (new_addr == NULL).
-	 */
-	if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment, zero,
-	    commit)) != NULL)
+	/* mmap. */
+	if ((ret = chunk_alloc_mmap(new_addr, size, alignment, zero, commit)) !=
+	    NULL)
 		return (ret);
 	/* "secondary" dss. */
 	if (have_dss && dss_prec == dss_prec_secondary && (ret =
@@ -380,7 +370,7 @@ chunk_alloc_base(size_t size)
 	 */
 	zero = true;
 	commit = true;
-	ret = chunk_alloc_mmap(size, chunksize, &zero, &commit);
+	ret = chunk_alloc_mmap(NULL, size, chunksize, &zero, &commit);
 	if (ret == NULL)
 		return (NULL);
 	if (config_valgrind)
@@ -418,9 +408,7 @@ chunk_arena_get(unsigned arena_ind)
 {
 	arena_t *arena;
 
-	/* Dodge tsd for a0 in order to avoid bootstrapping issues. */
-	arena = (arena_ind == 0) ? a0get() : arena_get(tsd_fetch(), arena_ind,
-	     false, true);
+	arena = arena_get(arena_ind, false);
 	/*
 	 * The arena we're allocating on behalf of must have been initialized
 	 * already.
@@ -447,6 +435,21 @@ chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero,
 	return (ret);
 }
 
+static void *
+chunk_alloc_retained(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr,
+    size_t size, size_t alignment, bool *zero, bool *commit)
+{
+
+	assert(size != 0);
+	assert((size & chunksize_mask) == 0);
+	assert(alignment != 0);
+	assert((alignment & chunksize_mask) == 0);
+
+	return (chunk_recycle(arena, chunk_hooks, &arena->chunks_szad_retained,
+	    &arena->chunks_ad_retained, false, new_addr, size, alignment, zero,
+	    commit, true));
+}
+
 void *
 chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr,
     size_t size, size_t alignment, bool *zero, bool *commit)
@@ -454,10 +457,16 @@ chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr,
 	void *ret;
 
 	chunk_hooks_assure_initialized(arena, chunk_hooks);
-	ret = chunk_hooks->alloc(new_addr, size, alignment, zero, commit,
-	    arena->ind);
-	if (ret == NULL)
-		return (NULL);
+
+	ret = chunk_alloc_retained(arena, chunk_hooks, new_addr, size,
+	    alignment, zero, commit);
+	if (ret == NULL) {
+		ret = chunk_hooks->alloc(new_addr, size, alignment, zero,
+		    commit, arena->ind);
+		if (ret == NULL)
+			return (NULL);
+	}
+
 	if (config_valgrind && chunk_hooks->alloc != chunk_alloc_default)
 		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, chunksize);
 	return (ret);
@@ -716,7 +725,7 @@ chunk_boot(void)
 	 * so pages_map will always take fast path.
 	 */
 	if (!opt_lg_chunk) {
-		opt_lg_chunk = jemalloc_ffs((int)info.dwAllocationGranularity)
+		opt_lg_chunk = ffs_u((unsigned)info.dwAllocationGranularity)
 		    - 1;
 	}
 #else
@@ -732,8 +741,8 @@ chunk_boot(void)
 
 	if (have_dss && chunk_dss_boot())
 		return (true);
-	if (rtree_new(&chunks_rtree, (ZU(1) << (LG_SIZEOF_PTR+3)) -
-	    opt_lg_chunk, chunks_rtree_node_alloc, NULL))
+	if (rtree_new(&chunks_rtree, (unsigned)((ZU(1) << (LG_SIZEOF_PTR+3)) -
+	    opt_lg_chunk), chunks_rtree_node_alloc, NULL))
 		return (true);
 
 	return (false);
diff --git a/contrib/jemalloc/src/chunk_mmap.c b/contrib/jemalloc/src/chunk_mmap.c
index b9ba74191a41..56b2ee422ab8 100644
--- a/contrib/jemalloc/src/chunk_mmap.c
+++ b/contrib/jemalloc/src/chunk_mmap.c
@@ -32,7 +32,8 @@ chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero, bool *commit)
 }
 
 void *
-chunk_alloc_mmap(size_t size, size_t alignment, bool *zero, bool *commit)
+chunk_alloc_mmap(void *new_addr, size_t size, size_t alignment, bool *zero,
+    bool *commit)
 {
 	void *ret;
 	size_t offset;
@@ -53,9 +54,10 @@ chunk_alloc_mmap(size_t size, size_t alignment, bool *zero, bool *commit)
 	assert(alignment != 0);
 	assert((alignment & chunksize_mask) == 0);
 
-	ret = pages_map(NULL, size);
-	if (ret == NULL)
-		return (NULL);
+	ret = pages_map(new_addr, size);
+	if (ret == NULL || ret == new_addr)
+		return (ret);
+	assert(new_addr == NULL);
 	offset = ALIGNMENT_ADDR2OFFSET(ret, alignment);
 	if (offset != 0) {
 		pages_unmap(ret, size);
diff --git a/contrib/jemalloc/src/ckh.c b/contrib/jemalloc/src/ckh.c
index 53a1c1ef11d2..3b423aa22a3e 100644
--- a/contrib/jemalloc/src/ckh.c
+++ b/contrib/jemalloc/src/ckh.c
@@ -99,7 +99,7 @@ ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key,
 	 * Cycle through the cells in the bucket, starting at a random position.
 	 * The randomness avoids worst-case search overhead as buckets fill up.
 	 */
-	prng32(offset, LG_CKH_BUCKET_CELLS, ckh->prng_state, CKH_A, CKH_C);
+	offset = (unsigned)prng_lg_range(&ckh->prng_state, LG_CKH_BUCKET_CELLS);
 	for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) {
 		cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) +
 		    ((i + offset) & ((ZU(1) << LG_CKH_BUCKET_CELLS) - 1))];
@@ -141,7 +141,8 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey,
 		 * were an item for which both hashes indicated the same
 		 * bucket.
 		 */
-		prng32(i, LG_CKH_BUCKET_CELLS, ckh->prng_state, CKH_A, CKH_C);
+		i = (unsigned)prng_lg_range(&ckh->prng_state,
+		    LG_CKH_BUCKET_CELLS);
 		cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i];
 		assert(cell->key != NULL);
 
@@ -247,8 +248,7 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh)
 {
 	bool ret;
 	ckhc_t *tab, *ttab;
-	size_t lg_curcells;
-	unsigned lg_prevbuckets;
+	unsigned lg_prevbuckets, lg_curcells;
 
 #ifdef CKH_COUNT
 	ckh->ngrows++;
@@ -266,7 +266,7 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh)
 
 		lg_curcells++;
 		usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
-		if (usize == 0) {
+		if (unlikely(usize == 0 || usize > HUGE_MAXCLASS)) {
 			ret = true;
 			goto label_return;
 		}
@@ -283,12 +283,12 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh)
 		ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS;
 
 		if (!ckh_rebuild(ckh, tab)) {
-			idalloctm(tsd, tab, tcache_get(tsd, false), true);
+			idalloctm(tsd, tab, tcache_get(tsd, false), true, true);
 			break;
 		}
 
 		/* Rebuilding failed, so back out partially rebuilt table. */
-		idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true);
+		idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true);
 		ckh->tab = tab;
 		ckh->lg_curbuckets = lg_prevbuckets;
 	}
@@ -302,8 +302,8 @@ static void
 ckh_shrink(tsd_t *tsd, ckh_t *ckh)
 {
 	ckhc_t *tab, *ttab;
-	size_t lg_curcells, usize;
-	unsigned lg_prevbuckets;
+	size_t usize;
+	unsigned lg_prevbuckets, lg_curcells;
 
 	/*
 	 * It is possible (though unlikely, given well behaved hashes) that the
@@ -312,7 +312,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh)
 	lg_prevbuckets = ckh->lg_curbuckets;
 	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1;
 	usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
-	if (usize == 0)
+	if (unlikely(usize == 0 || usize > HUGE_MAXCLASS))
 		return;
 	tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, true,
 	    NULL);
@@ -330,7 +330,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh)
 	ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS;
 
 	if (!ckh_rebuild(ckh, tab)) {
-		idalloctm(tsd, tab, tcache_get(tsd, false), true);
+		idalloctm(tsd, tab, tcache_get(tsd, false), true, true);
 #ifdef CKH_COUNT
 		ckh->nshrinks++;
 #endif
@@ -338,7 +338,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh)
 	}
 
 	/* Rebuilding failed, so back out partially rebuilt table. */
-	idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true);
+	idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true);
 	ckh->tab = tab;
 	ckh->lg_curbuckets = lg_prevbuckets;
 #ifdef CKH_COUNT
@@ -387,7 +387,7 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
 	ckh->keycomp = keycomp;
 
 	usize = sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE);
-	if (usize == 0) {
+	if (unlikely(usize == 0 || usize > HUGE_MAXCLASS)) {
 		ret = true;
 		goto label_return;
 	}
@@ -421,7 +421,7 @@ ckh_delete(tsd_t *tsd, ckh_t *ckh)
 	    (unsigned long long)ckh->nrelocs);
 #endif
 
-	idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true);
+	idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true);
 	if (config_debug)
 		memset(ckh, 0x5a, sizeof(ckh_t));
 }
diff --git a/contrib/jemalloc/src/ctl.c b/contrib/jemalloc/src/ctl.c
index 3de8e602d11f..17bd0719735f 100644
--- a/contrib/jemalloc/src/ctl.c
+++ b/contrib/jemalloc/src/ctl.c
@@ -24,7 +24,7 @@ ctl_named_node(const ctl_node_t *node)
 }
 
 JEMALLOC_INLINE_C const ctl_named_node_t *
-ctl_named_children(const ctl_named_node_t *node, int index)
+ctl_named_children(const ctl_named_node_t *node, size_t index)
 {
 	const ctl_named_node_t *children = ctl_named_node(node->children);
 
@@ -77,6 +77,7 @@ CTL_PROTO(config_cache_oblivious)
 CTL_PROTO(config_debug)
 CTL_PROTO(config_fill)
 CTL_PROTO(config_lazy_lock)
+CTL_PROTO(config_malloc_conf)
 CTL_PROTO(config_munmap)
 CTL_PROTO(config_prof)
 CTL_PROTO(config_prof_libgcc)
@@ -91,7 +92,9 @@ CTL_PROTO(opt_abort)
 CTL_PROTO(opt_dss)
 CTL_PROTO(opt_lg_chunk)
 CTL_PROTO(opt_narenas)
+CTL_PROTO(opt_purge)
 CTL_PROTO(opt_lg_dirty_mult)
+CTL_PROTO(opt_decay_time)
 CTL_PROTO(opt_stats_print)
 CTL_PROTO(opt_junk)
 CTL_PROTO(opt_zero)
@@ -114,10 +117,12 @@ CTL_PROTO(opt_prof_accum)
 CTL_PROTO(tcache_create)
 CTL_PROTO(tcache_flush)
 CTL_PROTO(tcache_destroy)
+static void	arena_i_purge(unsigned arena_ind, bool all);
 CTL_PROTO(arena_i_purge)
-static void	arena_purge(unsigned arena_ind);
+CTL_PROTO(arena_i_decay)
 CTL_PROTO(arena_i_dss)
 CTL_PROTO(arena_i_lg_dirty_mult)
+CTL_PROTO(arena_i_decay_time)
 CTL_PROTO(arena_i_chunk_hooks)
 INDEX_PROTO(arena_i)
 CTL_PROTO(arenas_bin_i_size)
@@ -131,6 +136,7 @@ INDEX_PROTO(arenas_hchunk_i)
 CTL_PROTO(arenas_narenas)
 CTL_PROTO(arenas_initialized)
 CTL_PROTO(arenas_lg_dirty_mult)
+CTL_PROTO(arenas_decay_time)
 CTL_PROTO(arenas_quantum)
 CTL_PROTO(arenas_page)
 CTL_PROTO(arenas_tcache_max)
@@ -181,6 +187,7 @@ INDEX_PROTO(stats_arenas_i_hchunks_j)
 CTL_PROTO(stats_arenas_i_nthreads)
 CTL_PROTO(stats_arenas_i_dss)
 CTL_PROTO(stats_arenas_i_lg_dirty_mult)
+CTL_PROTO(stats_arenas_i_decay_time)
 CTL_PROTO(stats_arenas_i_pactive)
 CTL_PROTO(stats_arenas_i_pdirty)
 CTL_PROTO(stats_arenas_i_mapped)
@@ -241,6 +248,7 @@ static const ctl_named_node_t	config_node[] = {
 	{NAME("debug"),		CTL(config_debug)},
 	{NAME("fill"),		CTL(config_fill)},
 	{NAME("lazy_lock"),	CTL(config_lazy_lock)},
+	{NAME("malloc_conf"),	CTL(config_malloc_conf)},
 	{NAME("munmap"),	CTL(config_munmap)},
 	{NAME("prof"),		CTL(config_prof)},
 	{NAME("prof_libgcc"),	CTL(config_prof_libgcc)},
@@ -258,7 +266,9 @@ static const ctl_named_node_t opt_node[] = {
 	{NAME("dss"),		CTL(opt_dss)},
 	{NAME("lg_chunk"),	CTL(opt_lg_chunk)},
 	{NAME("narenas"),	CTL(opt_narenas)},
+	{NAME("purge"),		CTL(opt_purge)},
 	{NAME("lg_dirty_mult"),	CTL(opt_lg_dirty_mult)},
+	{NAME("decay_time"),	CTL(opt_decay_time)},
 	{NAME("stats_print"),	CTL(opt_stats_print)},
 	{NAME("junk"),		CTL(opt_junk)},
 	{NAME("zero"),		CTL(opt_zero)},
@@ -288,8 +298,10 @@ static const ctl_named_node_t	tcache_node[] = {
 
 static const ctl_named_node_t arena_i_node[] = {
 	{NAME("purge"),		CTL(arena_i_purge)},
+	{NAME("decay"),		CTL(arena_i_decay)},
 	{NAME("dss"),		CTL(arena_i_dss)},
 	{NAME("lg_dirty_mult"),	CTL(arena_i_lg_dirty_mult)},
+	{NAME("decay_time"),	CTL(arena_i_decay_time)},
 	{NAME("chunk_hooks"),	CTL(arena_i_chunk_hooks)}
 };
 static const ctl_named_node_t super_arena_i_node[] = {
@@ -339,6 +351,7 @@ static const ctl_named_node_t arenas_node[] = {
 	{NAME("narenas"),	CTL(arenas_narenas)},
 	{NAME("initialized"),	CTL(arenas_initialized)},
 	{NAME("lg_dirty_mult"),	CTL(arenas_lg_dirty_mult)},
+	{NAME("decay_time"),	CTL(arenas_decay_time)},
 	{NAME("quantum"),	CTL(arenas_quantum)},
 	{NAME("page"),		CTL(arenas_page)},
 	{NAME("tcache_max"),	CTL(arenas_tcache_max)},
@@ -439,6 +452,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
 	{NAME("nthreads"),	CTL(stats_arenas_i_nthreads)},
 	{NAME("dss"),		CTL(stats_arenas_i_dss)},
 	{NAME("lg_dirty_mult"),	CTL(stats_arenas_i_lg_dirty_mult)},
+	{NAME("decay_time"),	CTL(stats_arenas_i_decay_time)},
 	{NAME("pactive"),	CTL(stats_arenas_i_pactive)},
 	{NAME("pdirty"),	CTL(stats_arenas_i_pdirty)},
 	{NAME("mapped"),	CTL(stats_arenas_i_mapped)},
@@ -519,8 +533,10 @@ static void
 ctl_arena_clear(ctl_arena_stats_t *astats)
 {
 
+	astats->nthreads = 0;
 	astats->dss = dss_prec_names[dss_prec_limit];
 	astats->lg_dirty_mult = -1;
+	astats->decay_time = -1;
 	astats->pactive = 0;
 	astats->pdirty = 0;
 	if (config_stats) {
@@ -542,16 +558,23 @@ ctl_arena_stats_amerge(ctl_arena_stats_t *cstats, arena_t *arena)
 {
 	unsigned i;
 
-	arena_stats_merge(arena, &cstats->dss, &cstats->lg_dirty_mult,
-	    &cstats->pactive, &cstats->pdirty, &cstats->astats, cstats->bstats,
-	    cstats->lstats, cstats->hstats);
-
-	for (i = 0; i < NBINS; i++) {
-		cstats->allocated_small += cstats->bstats[i].curregs *
-		    index2size(i);
-		cstats->nmalloc_small += cstats->bstats[i].nmalloc;
-		cstats->ndalloc_small += cstats->bstats[i].ndalloc;
-		cstats->nrequests_small += cstats->bstats[i].nrequests;
+	if (config_stats) {
+		arena_stats_merge(arena, &cstats->nthreads, &cstats->dss,
+		    &cstats->lg_dirty_mult, &cstats->decay_time,
+		    &cstats->pactive, &cstats->pdirty, &cstats->astats,
+		    cstats->bstats, cstats->lstats, cstats->hstats);
+
+		for (i = 0; i < NBINS; i++) {
+			cstats->allocated_small += cstats->bstats[i].curregs *
+			    index2size(i);
+			cstats->nmalloc_small += cstats->bstats[i].nmalloc;
+			cstats->ndalloc_small += cstats->bstats[i].ndalloc;
+			cstats->nrequests_small += cstats->bstats[i].nrequests;
+		}
+	} else {
+		arena_basic_stats_merge(arena, &cstats->nthreads, &cstats->dss,
+		    &cstats->lg_dirty_mult, &cstats->decay_time,
+		    &cstats->pactive, &cstats->pdirty);
 	}
 }
 
@@ -560,57 +583,68 @@ ctl_arena_stats_smerge(ctl_arena_stats_t *sstats, ctl_arena_stats_t *astats)
 {
 	unsigned i;
 
+	sstats->nthreads += astats->nthreads;
 	sstats->pactive += astats->pactive;
 	sstats->pdirty += astats->pdirty;
 
-	sstats->astats.mapped += astats->astats.mapped;
-	sstats->astats.npurge += astats->astats.npurge;
-	sstats->astats.nmadvise += astats->astats.nmadvise;
-	sstats->astats.purged += astats->astats.purged;
-
-	sstats->astats.metadata_mapped += astats->astats.metadata_mapped;
-	sstats->astats.metadata_allocated += astats->astats.metadata_allocated;
-
-	sstats->allocated_small += astats->allocated_small;
-	sstats->nmalloc_small += astats->nmalloc_small;
-	sstats->ndalloc_small += astats->ndalloc_small;
-	sstats->nrequests_small += astats->nrequests_small;
-
-	sstats->astats.allocated_large += astats->astats.allocated_large;
-	sstats->astats.nmalloc_large += astats->astats.nmalloc_large;
-	sstats->astats.ndalloc_large += astats->astats.ndalloc_large;
-	sstats->astats.nrequests_large += astats->astats.nrequests_large;
-
-	sstats->astats.allocated_huge += astats->astats.allocated_huge;
-	sstats->astats.nmalloc_huge += astats->astats.nmalloc_huge;
-	sstats->astats.ndalloc_huge += astats->astats.ndalloc_huge;
-
-	for (i = 0; i < NBINS; i++) {
-		sstats->bstats[i].nmalloc += astats->bstats[i].nmalloc;
-		sstats->bstats[i].ndalloc += astats->bstats[i].ndalloc;
-		sstats->bstats[i].nrequests += astats->bstats[i].nrequests;
-		sstats->bstats[i].curregs += astats->bstats[i].curregs;
-		if (config_tcache) {
-			sstats->bstats[i].nfills += astats->bstats[i].nfills;
-			sstats->bstats[i].nflushes +=
-			    astats->bstats[i].nflushes;
+	if (config_stats) {
+		sstats->astats.mapped += astats->astats.mapped;
+		sstats->astats.npurge += astats->astats.npurge;
+		sstats->astats.nmadvise += astats->astats.nmadvise;
+		sstats->astats.purged += astats->astats.purged;
+
+		sstats->astats.metadata_mapped +=
+		    astats->astats.metadata_mapped;
+		sstats->astats.metadata_allocated +=
+		    astats->astats.metadata_allocated;
+
+		sstats->allocated_small += astats->allocated_small;
+		sstats->nmalloc_small += astats->nmalloc_small;
+		sstats->ndalloc_small += astats->ndalloc_small;
+		sstats->nrequests_small += astats->nrequests_small;
+
+		sstats->astats.allocated_large +=
+		    astats->astats.allocated_large;
+		sstats->astats.nmalloc_large += astats->astats.nmalloc_large;
+		sstats->astats.ndalloc_large += astats->astats.ndalloc_large;
+		sstats->astats.nrequests_large +=
+		    astats->astats.nrequests_large;
+
+		sstats->astats.allocated_huge += astats->astats.allocated_huge;
+		sstats->astats.nmalloc_huge += astats->astats.nmalloc_huge;
+		sstats->astats.ndalloc_huge += astats->astats.ndalloc_huge;
+
+		for (i = 0; i < NBINS; i++) {
+			sstats->bstats[i].nmalloc += astats->bstats[i].nmalloc;
+			sstats->bstats[i].ndalloc += astats->bstats[i].ndalloc;
+			sstats->bstats[i].nrequests +=
+			    astats->bstats[i].nrequests;
+			sstats->bstats[i].curregs += astats->bstats[i].curregs;
+			if (config_tcache) {
+				sstats->bstats[i].nfills +=
+				    astats->bstats[i].nfills;
+				sstats->bstats[i].nflushes +=
+				    astats->bstats[i].nflushes;
+			}
+			sstats->bstats[i].nruns += astats->bstats[i].nruns;
+			sstats->bstats[i].reruns += astats->bstats[i].reruns;
+			sstats->bstats[i].curruns += astats->bstats[i].curruns;
 		}
-		sstats->bstats[i].nruns += astats->bstats[i].nruns;
-		sstats->bstats[i].reruns += astats->bstats[i].reruns;
-		sstats->bstats[i].curruns += astats->bstats[i].curruns;
-	}
 
-	for (i = 0; i < nlclasses; i++) {
-		sstats->lstats[i].nmalloc += astats->lstats[i].nmalloc;
-		sstats->lstats[i].ndalloc += astats->lstats[i].ndalloc;
-		sstats->lstats[i].nrequests += astats->lstats[i].nrequests;
-		sstats->lstats[i].curruns += astats->lstats[i].curruns;
-	}
+		for (i = 0; i < nlclasses; i++) {
+			sstats->lstats[i].nmalloc += astats->lstats[i].nmalloc;
+			sstats->lstats[i].ndalloc += astats->lstats[i].ndalloc;
+			sstats->lstats[i].nrequests +=
+			    astats->lstats[i].nrequests;
+			sstats->lstats[i].curruns += astats->lstats[i].curruns;
+		}
 
-	for (i = 0; i < nhclasses; i++) {
-		sstats->hstats[i].nmalloc += astats->hstats[i].nmalloc;
-		sstats->hstats[i].ndalloc += astats->hstats[i].ndalloc;
-		sstats->hstats[i].curhchunks += astats->hstats[i].curhchunks;
+		for (i = 0; i < nhclasses; i++) {
+			sstats->hstats[i].nmalloc += astats->hstats[i].nmalloc;
+			sstats->hstats[i].ndalloc += astats->hstats[i].ndalloc;
+			sstats->hstats[i].curhchunks +=
+			    astats->hstats[i].curhchunks;
+		}
 	}
 }
 
@@ -621,19 +655,9 @@ ctl_arena_refresh(arena_t *arena, unsigned i)
 	ctl_arena_stats_t *sstats = &ctl_stats.arenas[ctl_stats.narenas];
 
 	ctl_arena_clear(astats);
-
-	sstats->nthreads += astats->nthreads;
-	if (config_stats) {
-		ctl_arena_stats_amerge(astats, arena);
-		/* Merge into sum stats as well. */
-		ctl_arena_stats_smerge(sstats, astats);
-	} else {
-		astats->pactive += arena->nactive;
-		astats->pdirty += arena->ndirty;
-		/* Merge into sum stats as well. */
-		sstats->pactive += arena->nactive;
-		sstats->pdirty += arena->ndirty;
-	}
+	ctl_arena_stats_amerge(astats, arena);
+	/* Merge into sum stats as well. */
+	ctl_arena_stats_smerge(sstats, astats);
 }
 
 static bool
@@ -679,33 +703,17 @@ ctl_grow(void)
 static void
 ctl_refresh(void)
 {
-	tsd_t *tsd;
 	unsigned i;
-	bool refreshed;
 	VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas);
 
 	/*
 	 * Clear sum stats, since they will be merged into by
 	 * ctl_arena_refresh().
 	 */
-	ctl_stats.arenas[ctl_stats.narenas].nthreads = 0;
 	ctl_arena_clear(&ctl_stats.arenas[ctl_stats.narenas]);
 
-	tsd = tsd_fetch();
-	for (i = 0, refreshed = false; i < ctl_stats.narenas; i++) {
-		tarenas[i] = arena_get(tsd, i, false, false);
-		if (tarenas[i] == NULL && !refreshed) {
-			tarenas[i] = arena_get(tsd, i, false, true);
-			refreshed = true;
-		}
-	}
-
-	for (i = 0; i < ctl_stats.narenas; i++) {
-		if (tarenas[i] != NULL)
-			ctl_stats.arenas[i].nthreads = arena_nbound(i);
-		else
-			ctl_stats.arenas[i].nthreads = 0;
-	}
+	for (i = 0; i < ctl_stats.narenas; i++)
+		tarenas[i] = arena_get(i, false);
 
 	for (i = 0; i < ctl_stats.narenas; i++) {
 		bool initialized = (tarenas[i] != NULL);
@@ -960,7 +968,7 @@ ctl_bymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 		assert(node->nchildren > 0);
 		if (ctl_named_node(node->children) != NULL) {
 			/* Children are named. */
-			if (node->nchildren <= mib[i]) {
+			if (node->nchildren <= (unsigned)mib[i]) {
 				ret = ENOENT;
 				goto label_return;
 			}
@@ -1199,17 +1207,17 @@ label_return:								\
 	return (ret);							\
 }
 
-#define	CTL_RO_BOOL_CONFIG_GEN(n)					\
+#define	CTL_RO_CONFIG_GEN(n, t)						\
 static int								\
 n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,	\
     void *newp, size_t newlen)						\
 {									\
 	int ret;							\
-	bool oldval;							\
+	t oldval;							\
 									\
 	READONLY();							\
 	oldval = n;							\
-	READ(oldval, bool);						\
+	READ(oldval, t);						\
 									\
 	ret = 0;							\
 label_return:								\
@@ -1241,28 +1249,31 @@ label_return:
 
 /******************************************************************************/
 
-CTL_RO_BOOL_CONFIG_GEN(config_cache_oblivious)
-CTL_RO_BOOL_CONFIG_GEN(config_debug)
-CTL_RO_BOOL_CONFIG_GEN(config_fill)
-CTL_RO_BOOL_CONFIG_GEN(config_lazy_lock)
-CTL_RO_BOOL_CONFIG_GEN(config_munmap)
-CTL_RO_BOOL_CONFIG_GEN(config_prof)
-CTL_RO_BOOL_CONFIG_GEN(config_prof_libgcc)
-CTL_RO_BOOL_CONFIG_GEN(config_prof_libunwind)
-CTL_RO_BOOL_CONFIG_GEN(config_stats)
-CTL_RO_BOOL_CONFIG_GEN(config_tcache)
-CTL_RO_BOOL_CONFIG_GEN(config_tls)
-CTL_RO_BOOL_CONFIG_GEN(config_utrace)
-CTL_RO_BOOL_CONFIG_GEN(config_valgrind)
-CTL_RO_BOOL_CONFIG_GEN(config_xmalloc)
+CTL_RO_CONFIG_GEN(config_cache_oblivious, bool)
+CTL_RO_CONFIG_GEN(config_debug, bool)
+CTL_RO_CONFIG_GEN(config_fill, bool)
+CTL_RO_CONFIG_GEN(config_lazy_lock, bool)
+CTL_RO_CONFIG_GEN(config_malloc_conf, const char *)
+CTL_RO_CONFIG_GEN(config_munmap, bool)
+CTL_RO_CONFIG_GEN(config_prof, bool)
+CTL_RO_CONFIG_GEN(config_prof_libgcc, bool)
+CTL_RO_CONFIG_GEN(config_prof_libunwind, bool)
+CTL_RO_CONFIG_GEN(config_stats, bool)
+CTL_RO_CONFIG_GEN(config_tcache, bool)
+CTL_RO_CONFIG_GEN(config_tls, bool)
+CTL_RO_CONFIG_GEN(config_utrace, bool)
+CTL_RO_CONFIG_GEN(config_valgrind, bool)
+CTL_RO_CONFIG_GEN(config_xmalloc, bool)
 
 /******************************************************************************/
 
 CTL_RO_NL_GEN(opt_abort, opt_abort, bool)
 CTL_RO_NL_GEN(opt_dss, opt_dss, const char *)
 CTL_RO_NL_GEN(opt_lg_chunk, opt_lg_chunk, size_t)
-CTL_RO_NL_GEN(opt_narenas, opt_narenas, size_t)
+CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned)
+CTL_RO_NL_GEN(opt_purge, purge_mode_names[opt_purge], const char *)
 CTL_RO_NL_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t)
+CTL_RO_NL_GEN(opt_decay_time, opt_decay_time, ssize_t)
 CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool)
 CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, const char *)
 CTL_RO_NL_CGEN(config_fill, opt_quarantine, opt_quarantine, size_t)
@@ -1314,7 +1325,7 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 		}
 
 		/* Initialize arena if necessary. */
-		newarena = arena_get(tsd, newind, true, true);
+		newarena = arena_get(newind, true);
 		if (newarena == NULL) {
 			ret = EAGAIN;
 			goto label_return;
@@ -1536,34 +1547,44 @@ label_return:
 
 /******************************************************************************/
 
-/* ctl_mutex must be held during execution of this function. */
 static void
-arena_purge(unsigned arena_ind)
+arena_i_purge(unsigned arena_ind, bool all)
 {
-	tsd_t *tsd;
-	unsigned i;
-	bool refreshed;
-	VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas);
 
-	tsd = tsd_fetch();
-	for (i = 0, refreshed = false; i < ctl_stats.narenas; i++) {
-		tarenas[i] = arena_get(tsd, i, false, false);
-		if (tarenas[i] == NULL && !refreshed) {
-			tarenas[i] = arena_get(tsd, i, false, true);
-			refreshed = true;
-		}
-	}
+	malloc_mutex_lock(&ctl_mtx);
+	{
+		unsigned narenas = ctl_stats.narenas;
+
+		if (arena_ind == narenas) {
+			unsigned i;
+			VARIABLE_ARRAY(arena_t *, tarenas, narenas);
+
+			for (i = 0; i < narenas; i++)
+				tarenas[i] = arena_get(i, false);
+
+			/*
+			 * No further need to hold ctl_mtx, since narenas and
+			 * tarenas contain everything needed below.
+			 */
+			malloc_mutex_unlock(&ctl_mtx);
+
+			for (i = 0; i < narenas; i++) {
+				if (tarenas[i] != NULL)
+					arena_purge(tarenas[i], all);
+			}
+		} else {
+			arena_t *tarena;
+
+			assert(arena_ind < narenas);
+
+			tarena = arena_get(arena_ind, false);
 
-	if (arena_ind == ctl_stats.narenas) {
-		unsigned i;
-		for (i = 0; i < ctl_stats.narenas; i++) {
-			if (tarenas[i] != NULL)
-				arena_purge_all(tarenas[i]);
+			/* No further need to hold ctl_mtx. */
+			malloc_mutex_unlock(&ctl_mtx);
+
+			if (tarena != NULL)
+				arena_purge(tarena, all);
 		}
-	} else {
-		assert(arena_ind < ctl_stats.narenas);
-		if (tarenas[arena_ind] != NULL)
-			arena_purge_all(tarenas[arena_ind]);
 	}
 }
 
@@ -1575,9 +1596,22 @@ arena_i_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 
 	READONLY();
 	WRITEONLY();
-	malloc_mutex_lock(&ctl_mtx);
-	arena_purge(mib[1]);
-	malloc_mutex_unlock(&ctl_mtx);
+	arena_i_purge((unsigned)mib[1], true);
+
+	ret = 0;
+label_return:
+	return (ret);
+}
+
+static int
+arena_i_decay_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen)
+{
+	int ret;
+
+	READONLY();
+	WRITEONLY();
+	arena_i_purge((unsigned)mib[1], false);
 
 	ret = 0;
 label_return:
@@ -1590,7 +1624,7 @@ arena_i_dss_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 {
 	int ret;
 	const char *dss = NULL;
-	unsigned arena_ind = mib[1];
+	unsigned arena_ind = (unsigned)mib[1];
 	dss_prec_t dss_prec_old = dss_prec_limit;
 	dss_prec_t dss_prec = dss_prec_limit;
 
@@ -1615,7 +1649,7 @@ arena_i_dss_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 	}
 
 	if (arena_ind < ctl_stats.narenas) {
-		arena_t *arena = arena_get(tsd_fetch(), arena_ind, false, true);
+		arena_t *arena = arena_get(arena_ind, false);
 		if (arena == NULL || (dss_prec != dss_prec_limit &&
 		    arena_dss_prec_set(arena, dss_prec))) {
 			ret = EFAULT;
@@ -1645,10 +1679,10 @@ arena_i_lg_dirty_mult_ctl(const size_t *mib, size_t miblen, void *oldp,
     size_t *oldlenp, void *newp, size_t newlen)
 {
 	int ret;
-	unsigned arena_ind = mib[1];
+	unsigned arena_ind = (unsigned)mib[1];
 	arena_t *arena;
 
-	arena = arena_get(tsd_fetch(), arena_ind, false, true);
+	arena = arena_get(arena_ind, false);
 	if (arena == NULL) {
 		ret = EFAULT;
 		goto label_return;
@@ -1675,16 +1709,50 @@ label_return:
 }
 
 static int
+arena_i_decay_time_ctl(const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen)
+{
+	int ret;
+	unsigned arena_ind = (unsigned)mib[1];
+	arena_t *arena;
+
+	arena = arena_get(arena_ind, false);
+	if (arena == NULL) {
+		ret = EFAULT;
+		goto label_return;
+	}
+
+	if (oldp != NULL && oldlenp != NULL) {
+		size_t oldval = arena_decay_time_get(arena);
+		READ(oldval, ssize_t);
+	}
+	if (newp != NULL) {
+		if (newlen != sizeof(ssize_t)) {
+			ret = EINVAL;
+			goto label_return;
+		}
+		if (arena_decay_time_set(arena, *(ssize_t *)newp)) {
+			ret = EFAULT;
+			goto label_return;
+		}
+	}
+
+	ret = 0;
+label_return:
+	return (ret);
+}
+
+static int
 arena_i_chunk_hooks_ctl(const size_t *mib, size_t miblen, void *oldp,
     size_t *oldlenp, void *newp, size_t newlen)
 {
 	int ret;
-	unsigned arena_ind = mib[1];
+	unsigned arena_ind = (unsigned)mib[1];
 	arena_t *arena;
 
 	malloc_mutex_lock(&ctl_mtx);
 	if (arena_ind < narenas_total_get() && (arena =
-	    arena_get(tsd_fetch(), arena_ind, false, true)) != NULL) {
+	    arena_get(arena_ind, false)) != NULL) {
 		if (newp != NULL) {
 			chunk_hooks_t old_chunk_hooks, new_chunk_hooks;
 			WRITE(new_chunk_hooks, chunk_hooks_t);
@@ -1758,7 +1826,7 @@ arenas_initialized_ctl(const size_t *mib, size_t miblen, void *oldp,
 	if (*oldlenp != ctl_stats.narenas * sizeof(bool)) {
 		ret = EINVAL;
 		nread = (*oldlenp < ctl_stats.narenas * sizeof(bool))
-		    ? (*oldlenp / sizeof(bool)) : ctl_stats.narenas;
+		    ? (unsigned)(*oldlenp / sizeof(bool)) : ctl_stats.narenas;
 	} else {
 		ret = 0;
 		nread = ctl_stats.narenas;
@@ -1798,6 +1866,32 @@ label_return:
 	return (ret);
 }
 
+static int
+arenas_decay_time_ctl(const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen)
+{
+	int ret;
+
+	if (oldp != NULL && oldlenp != NULL) {
+		size_t oldval = arena_decay_time_default_get();
+		READ(oldval, ssize_t);
+	}
+	if (newp != NULL) {
+		if (newlen != sizeof(ssize_t)) {
+			ret = EINVAL;
+			goto label_return;
+		}
+		if (arena_decay_time_default_set(*(ssize_t *)newp)) {
+			ret = EFAULT;
+			goto label_return;
+		}
+	}
+
+	ret = 0;
+label_return:
+	return (ret);
+}
+
 CTL_RO_NL_GEN(arenas_quantum, QUANTUM, size_t)
 CTL_RO_NL_GEN(arenas_page, PAGE, size_t)
 CTL_RO_NL_CGEN(config_tcache, arenas_tcache_max, tcache_maxclass, size_t)
@@ -1816,7 +1910,7 @@ arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i)
 }
 
 CTL_RO_NL_GEN(arenas_nlruns, nlclasses, unsigned)
-CTL_RO_NL_GEN(arenas_lrun_i_size, index2size(NBINS+mib[2]), size_t)
+CTL_RO_NL_GEN(arenas_lrun_i_size, index2size(NBINS+(szind_t)mib[2]), size_t)
 static const ctl_named_node_t *
 arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i)
 {
@@ -1827,7 +1921,8 @@ arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i)
 }
 
 CTL_RO_NL_GEN(arenas_nhchunks, nhclasses, unsigned)
-CTL_RO_NL_GEN(arenas_hchunk_i_size, index2size(NBINS+nlclasses+mib[2]), size_t)
+CTL_RO_NL_GEN(arenas_hchunk_i_size, index2size(NBINS+nlclasses+(szind_t)mib[2]),
+    size_t)
 static const ctl_named_node_t *
 arenas_hchunk_i_index(const size_t *mib, size_t miblen, size_t i)
 {
@@ -1999,6 +2094,8 @@ CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t)
 CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *)
 CTL_RO_GEN(stats_arenas_i_lg_dirty_mult, ctl_stats.arenas[mib[2]].lg_dirty_mult,
     ssize_t)
+CTL_RO_GEN(stats_arenas_i_decay_time, ctl_stats.arenas[mib[2]].decay_time,
+    ssize_t)
 CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned)
 CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t)
 CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t)
diff --git a/contrib/jemalloc/src/extent.c b/contrib/jemalloc/src/extent.c
index 13f94411c15a..9f5146e5ff5c 100644
--- a/contrib/jemalloc/src/extent.c
+++ b/contrib/jemalloc/src/extent.c
@@ -15,7 +15,7 @@ extent_quantize(size_t size)
 }
 
 JEMALLOC_INLINE_C int
-extent_szad_comp(extent_node_t *a, extent_node_t *b)
+extent_szad_comp(const extent_node_t *a, const extent_node_t *b)
 {
 	int ret;
 	size_t a_qsize = extent_quantize(extent_node_size_get(a));
@@ -41,7 +41,7 @@ rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, szad_link,
     extent_szad_comp)
 
 JEMALLOC_INLINE_C int
-extent_ad_comp(extent_node_t *a, extent_node_t *b)
+extent_ad_comp(const extent_node_t *a, const extent_node_t *b)
 {
 	uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a);
 	uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b);
diff --git a/contrib/jemalloc/src/huge.c b/contrib/jemalloc/src/huge.c
index 1e9a66512f12..5f7ceaf191c1 100644
--- a/contrib/jemalloc/src/huge.c
+++ b/contrib/jemalloc/src/huge.c
@@ -31,35 +31,30 @@ huge_node_unset(const void *ptr, const extent_node_t *node)
 }
 
 void *
-huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero,
+huge_malloc(tsd_t *tsd, arena_t *arena, size_t usize, bool zero,
     tcache_t *tcache)
 {
-	size_t usize;
 
-	usize = s2u(size);
-	if (usize == 0) {
-		/* size_t overflow. */
-		return (NULL);
-	}
+	assert(usize == s2u(usize));
 
 	return (huge_palloc(tsd, arena, usize, chunksize, zero, tcache));
 }
 
 void *
-huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
+huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
     bool zero, tcache_t *tcache)
 {
 	void *ret;
-	size_t usize;
+	size_t ausize;
 	extent_node_t *node;
 	bool is_zeroed;
 
 	/* Allocate one or more contiguous chunks for this request. */
 
-	usize = sa2u(size, alignment);
-	if (unlikely(usize == 0))
+	ausize = sa2u(usize, alignment);
+	if (unlikely(ausize == 0 || ausize > HUGE_MAXCLASS))
 		return (NULL);
-	assert(usize >= chunksize);
+	assert(ausize >= chunksize);
 
 	/* Allocate an extent node with which to track the chunk. */
 	node = ipallocztm(tsd, CACHELINE_CEILING(sizeof(extent_node_t)),
@@ -74,16 +69,16 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
 	is_zeroed = zero;
 	arena = arena_choose(tsd, arena);
 	if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(arena,
-	    size, alignment, &is_zeroed)) == NULL) {
-		idalloctm(tsd, node, tcache, true);
+	    usize, alignment, &is_zeroed)) == NULL) {
+		idalloctm(tsd, node, tcache, true, true);
 		return (NULL);
 	}
 
-	extent_node_init(node, arena, ret, size, is_zeroed, true);
+	extent_node_init(node, arena, ret, usize, is_zeroed, true);
 
 	if (huge_node_set(ret, node)) {
-		arena_chunk_dalloc_huge(arena, ret, size);
-		idalloctm(tsd, node, tcache, true);
+		arena_chunk_dalloc_huge(arena, ret, usize);
+		idalloctm(tsd, node, tcache, true, true);
 		return (NULL);
 	}
 
@@ -95,10 +90,11 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
 
 	if (zero || (config_fill && unlikely(opt_zero))) {
 		if (!is_zeroed)
-			memset(ret, 0, size);
+			memset(ret, 0, usize);
 	} else if (config_fill && unlikely(opt_junk_alloc))
-		memset(ret, 0xa5, size);
+		memset(ret, 0xa5, usize);
 
+	arena_decay_tick(tsd, arena);
 	return (ret);
 }
 
@@ -280,11 +276,13 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t usize, bool zero) {
 }
 
 bool
-huge_ralloc_no_move(void *ptr, size_t oldsize, size_t usize_min,
+huge_ralloc_no_move(tsd_t *tsd, void *ptr, size_t oldsize, size_t usize_min,
     size_t usize_max, bool zero)
 {
 
 	assert(s2u(oldsize) == oldsize);
+	/* The following should have been caught by callers. */
+	assert(usize_min > 0 && usize_max <= HUGE_MAXCLASS);
 
 	/* Both allocations must be huge to avoid a move. */
 	if (oldsize < chunksize || usize_max < chunksize)
@@ -292,13 +290,18 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t usize_min,
 
 	if (CHUNK_CEILING(usize_max) > CHUNK_CEILING(oldsize)) {
 		/* Attempt to expand the allocation in-place. */
-		if (!huge_ralloc_no_move_expand(ptr, oldsize, usize_max, zero))
+		if (!huge_ralloc_no_move_expand(ptr, oldsize, usize_max,
+		    zero)) {
+			arena_decay_tick(tsd, huge_aalloc(ptr));
 			return (false);
+		}
 		/* Try again, this time with usize_min. */
 		if (usize_min < usize_max && CHUNK_CEILING(usize_min) >
 		    CHUNK_CEILING(oldsize) && huge_ralloc_no_move_expand(ptr,
-		    oldsize, usize_min, zero))
+		    oldsize, usize_min, zero)) {
+			arena_decay_tick(tsd, huge_aalloc(ptr));
 			return (false);
+		}
 	}
 
 	/*
@@ -309,12 +312,17 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t usize_min,
 	    && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(usize_max)) {
 		huge_ralloc_no_move_similar(ptr, oldsize, usize_min, usize_max,
 		    zero);
+		arena_decay_tick(tsd, huge_aalloc(ptr));
 		return (false);
 	}
 
 	/* Attempt to shrink the allocation in-place. */
-	if (CHUNK_CEILING(oldsize) > CHUNK_CEILING(usize_max))
-		return (huge_ralloc_no_move_shrink(ptr, oldsize, usize_max));
+	if (CHUNK_CEILING(oldsize) > CHUNK_CEILING(usize_max)) {
+		if (!huge_ralloc_no_move_shrink(ptr, oldsize, usize_max)) {
+			arena_decay_tick(tsd, huge_aalloc(ptr));
+			return (false);
+		}
+	}
 	return (true);
 }
 
@@ -335,8 +343,11 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t usize,
 	void *ret;
 	size_t copysize;
 
+	/* The following should have been caught by callers. */
+	assert(usize > 0 && usize <= HUGE_MAXCLASS);
+
 	/* Try to avoid moving the allocation. */
-	if (!huge_ralloc_no_move(ptr, oldsize, usize, usize, zero))
+	if (!huge_ralloc_no_move(tsd, ptr, oldsize, usize, usize, zero))
 		return (ptr);
 
 	/*
@@ -372,7 +383,9 @@ huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache)
 	    extent_node_size_get(node));
 	arena_chunk_dalloc_huge(extent_node_arena_get(node),
 	    extent_node_addr_get(node), extent_node_size_get(node));
-	idalloctm(tsd, node, tcache, true);
+	idalloctm(tsd, node, tcache, true, true);
+
+	arena_decay_tick(tsd, arena);
 }
 
 arena_t *
diff --git a/contrib/jemalloc/src/jemalloc.c b/contrib/jemalloc/src/jemalloc.c
index b6cbb79d6369..a34b85c9b5a2 100644
--- a/contrib/jemalloc/src/jemalloc.c
+++ b/contrib/jemalloc/src/jemalloc.c
@@ -44,14 +44,14 @@ bool	opt_redzone = false;
 bool	opt_utrace = false;
 bool	opt_xmalloc = false;
 bool	opt_zero = false;
-size_t	opt_narenas = 0;
+unsigned	opt_narenas = 0;
 
 /* Initialized to true if the process is running inside Valgrind. */
 bool	in_valgrind;
 
 unsigned	ncpus;
 
-/* Protects arenas initialization (arenas, narenas_total). */
+/* Protects arenas initialization. */
 static malloc_mutex_t	arenas_lock;
 /*
  * Arenas that are used to service external requests.  Not all elements of the
@@ -61,8 +61,8 @@ static malloc_mutex_t	arenas_lock;
  * arenas.  arenas[narenas_auto..narenas_total) are only used if the application
  * takes some action to create them and allocate from them.
  */
-static arena_t		**arenas;
-static unsigned		narenas_total;
+arena_t			**arenas;
+static unsigned		narenas_total; /* Use narenas_total_*(). */
 static arena_t		*a0; /* arenas[0]; read-only after initialization. */
 static unsigned		narenas_auto; /* Read-only after initialization. */
 
@@ -74,12 +74,29 @@ typedef enum {
 } malloc_init_t;
 static malloc_init_t	malloc_init_state = malloc_init_uninitialized;
 
+/* 0 should be the common case.  Set to true to trigger initialization. */
+static bool	malloc_slow = true;
+
+/* When malloc_slow != 0, set the corresponding bits for sanity check. */
+enum {
+	flag_opt_junk_alloc	= (1U),
+	flag_opt_junk_free	= (1U << 1),
+	flag_opt_quarantine	= (1U << 2),
+	flag_opt_zero		= (1U << 3),
+	flag_opt_utrace		= (1U << 4),
+	flag_in_valgrind	= (1U << 5),
+	flag_opt_xmalloc	= (1U << 6)
+};
+static uint8_t	malloc_slow_flags;
+
+/* Last entry for overflow detection only.  */
 JEMALLOC_ALIGNED(CACHELINE)
-const size_t	index2size_tab[NSIZES] = {
+const size_t	index2size_tab[NSIZES+1] = {
 #define	SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \
 	((ZU(1)<<lg_grp) + (ZU(ndelta)<<lg_delta)),
 	SIZE_CLASSES
 #undef SC
+	ZU(0)
 };
 
 JEMALLOC_ALIGNED(CACHELINE)
@@ -298,14 +315,6 @@ malloc_init(void)
  * cannot tolerate TLS variable access.
  */
 
-arena_t *
-a0get(void)
-{
-
-	assert(a0 != NULL);
-	return (a0);
-}
-
 static void *
 a0ialloc(size_t size, bool zero, bool is_metadata)
 {
@@ -313,14 +322,15 @@ a0ialloc(size_t size, bool zero, bool is_metadata)
 	if (unlikely(malloc_init_a0()))
 		return (NULL);
 
-	return (iallocztm(NULL, size, zero, false, is_metadata, a0get()));
+	return (iallocztm(NULL, size, size2index(size), zero, false,
+	    is_metadata, arena_get(0, false), true));
 }
 
 static void
 a0idalloc(void *ptr, bool is_metadata)
 {
 
-	idalloctm(NULL, ptr, false, is_metadata);
+	idalloctm(NULL, ptr, false, is_metadata, true);
 }
 
 void *
@@ -377,47 +387,59 @@ bootstrap_free(void *ptr)
 	a0idalloc(ptr, false);
 }
 
+static void
+arena_set(unsigned ind, arena_t *arena)
+{
+
+	atomic_write_p((void **)&arenas[ind], arena);
+}
+
+static void
+narenas_total_set(unsigned narenas)
+{
+
+	atomic_write_u(&narenas_total, narenas);
+}
+
+static void
+narenas_total_inc(void)
+{
+
+	atomic_add_u(&narenas_total, 1);
+}
+
+unsigned
+narenas_total_get(void)
+{
+
+	return (atomic_read_u(&narenas_total));
+}
+
 /* Create a new arena and insert it into the arenas array at index ind. */
 static arena_t *
 arena_init_locked(unsigned ind)
 {
 	arena_t *arena;
 
-	/* Expand arenas if necessary. */
-	assert(ind <= narenas_total);
+	assert(ind <= narenas_total_get());
 	if (ind > MALLOCX_ARENA_MAX)
 		return (NULL);
-	if (ind == narenas_total) {
-		unsigned narenas_new = narenas_total + 1;
-		arena_t **arenas_new =
-		    (arena_t **)a0malloc(CACHELINE_CEILING(narenas_new *
-		    sizeof(arena_t *)));
-		if (arenas_new == NULL)
-			return (NULL);
-		memcpy(arenas_new, arenas, narenas_total * sizeof(arena_t *));
-		arenas_new[ind] = NULL;
-		/*
-		 * Deallocate only if arenas came from a0malloc() (not
-		 * base_alloc()).
-		 */
-		if (narenas_total != narenas_auto)
-			a0dalloc(arenas);
-		arenas = arenas_new;
-		narenas_total = narenas_new;
-	}
+	if (ind == narenas_total_get())
+		narenas_total_inc();
 
 	/*
 	 * Another thread may have already initialized arenas[ind] if it's an
 	 * auto arena.
 	 */
-	arena = arenas[ind];
+	arena = arena_get(ind, false);
 	if (arena != NULL) {
 		assert(ind < narenas_auto);
 		return (arena);
 	}
 
 	/* Actually initialize the arena. */
-	arena = arenas[ind] = arena_new(ind);
+	arena = arena_new(ind);
+	arena_set(ind, arena);
 	return (arena);
 }
 
@@ -432,145 +454,114 @@ arena_init(unsigned ind)
 	return (arena);
 }
 
-unsigned
-narenas_total_get(void)
-{
-	unsigned narenas;
-
-	malloc_mutex_lock(&arenas_lock);
-	narenas = narenas_total;
-	malloc_mutex_unlock(&arenas_lock);
-
-	return (narenas);
-}
-
 static void
-arena_bind_locked(tsd_t *tsd, unsigned ind)
+arena_bind(tsd_t *tsd, unsigned ind)
 {
 	arena_t *arena;
 
-	arena = arenas[ind];
-	arena->nthreads++;
+	arena = arena_get(ind, false);
+	arena_nthreads_inc(arena);
 
 	if (tsd_nominal(tsd))
 		tsd_arena_set(tsd, arena);
 }
 
-static void
-arena_bind(tsd_t *tsd, unsigned ind)
-{
-
-	malloc_mutex_lock(&arenas_lock);
-	arena_bind_locked(tsd, ind);
-	malloc_mutex_unlock(&arenas_lock);
-}
-
 void
 arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind)
 {
 	arena_t *oldarena, *newarena;
 
-	malloc_mutex_lock(&arenas_lock);
-	oldarena = arenas[oldind];
-	newarena = arenas[newind];
-	oldarena->nthreads--;
-	newarena->nthreads++;
-	malloc_mutex_unlock(&arenas_lock);
+	oldarena = arena_get(oldind, false);
+	newarena = arena_get(newind, false);
+	arena_nthreads_dec(oldarena);
+	arena_nthreads_inc(newarena);
 	tsd_arena_set(tsd, newarena);
 }
 
-unsigned
-arena_nbound(unsigned ind)
-{
-	unsigned nthreads;
-
-	malloc_mutex_lock(&arenas_lock);
-	nthreads = arenas[ind]->nthreads;
-	malloc_mutex_unlock(&arenas_lock);
-	return (nthreads);
-}
-
 static void
 arena_unbind(tsd_t *tsd, unsigned ind)
 {
 	arena_t *arena;
 
-	malloc_mutex_lock(&arenas_lock);
-	arena = arenas[ind];
-	arena->nthreads--;
-	malloc_mutex_unlock(&arenas_lock);
+	arena = arena_get(ind, false);
+	arena_nthreads_dec(arena);
 	tsd_arena_set(tsd, NULL);
 }
 
-arena_t *
-arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing)
+arena_tdata_t *
+arena_tdata_get_hard(tsd_t *tsd, unsigned ind)
 {
-	arena_t *arena;
-	arena_t **arenas_cache = tsd_arenas_cache_get(tsd);
-	unsigned narenas_cache = tsd_narenas_cache_get(tsd);
+	arena_tdata_t *tdata, *arenas_tdata_old;
+	arena_tdata_t *arenas_tdata = tsd_arenas_tdata_get(tsd);
+	unsigned narenas_tdata_old, i;
+	unsigned narenas_tdata = tsd_narenas_tdata_get(tsd);
 	unsigned narenas_actual = narenas_total_get();
 
-	/* Deallocate old cache if it's too small. */
-	if (arenas_cache != NULL && narenas_cache < narenas_actual) {
-		a0dalloc(arenas_cache);
-		arenas_cache = NULL;
-		narenas_cache = 0;
-		tsd_arenas_cache_set(tsd, arenas_cache);
-		tsd_narenas_cache_set(tsd, narenas_cache);
-	}
-
-	/* Allocate cache if it's missing. */
-	if (arenas_cache == NULL) {
-		bool *arenas_cache_bypassp = tsd_arenas_cache_bypassp_get(tsd);
-		assert(ind < narenas_actual || !init_if_missing);
-		narenas_cache = (ind < narenas_actual) ? narenas_actual : ind+1;
-
-		if (tsd_nominal(tsd) && !*arenas_cache_bypassp) {
-			*arenas_cache_bypassp = true;
-			arenas_cache = (arena_t **)a0malloc(sizeof(arena_t *) *
-			    narenas_cache);
-			*arenas_cache_bypassp = false;
+	/*
+	 * Dissociate old tdata array (and set up for deallocation upon return)
+	 * if it's too small.
+	 */
+	if (arenas_tdata != NULL && narenas_tdata < narenas_actual) {
+		arenas_tdata_old = arenas_tdata;
+		narenas_tdata_old = narenas_tdata;
+		arenas_tdata = NULL;
+		narenas_tdata = 0;
+		tsd_arenas_tdata_set(tsd, arenas_tdata);
+		tsd_narenas_tdata_set(tsd, narenas_tdata);
+	} else {
+		arenas_tdata_old = NULL;
+		narenas_tdata_old = 0;
+	}
+
+	/* Allocate tdata array if it's missing. */
+	if (arenas_tdata == NULL) {
+		bool *arenas_tdata_bypassp = tsd_arenas_tdata_bypassp_get(tsd);
+		narenas_tdata = (ind < narenas_actual) ? narenas_actual : ind+1;
+
+		if (tsd_nominal(tsd) && !*arenas_tdata_bypassp) {
+			*arenas_tdata_bypassp = true;
+			arenas_tdata = (arena_tdata_t *)a0malloc(
+			    sizeof(arena_tdata_t) * narenas_tdata);
+			*arenas_tdata_bypassp = false;
 		}
-		if (arenas_cache == NULL) {
-			/*
-			 * This function must always tell the truth, even if
-			 * it's slow, so don't let OOM, thread cleanup (note
-			 * tsd_nominal check), nor recursive allocation
-			 * avoidance (note arenas_cache_bypass check) get in the
-			 * way.
-			 */
-			if (ind >= narenas_actual)
-				return (NULL);
-			malloc_mutex_lock(&arenas_lock);
-			arena = arenas[ind];
-			malloc_mutex_unlock(&arenas_lock);
-			return (arena);
+		if (arenas_tdata == NULL) {
+			tdata = NULL;
+			goto label_return;
 		}
-		assert(tsd_nominal(tsd) && !*arenas_cache_bypassp);
-		tsd_arenas_cache_set(tsd, arenas_cache);
-		tsd_narenas_cache_set(tsd, narenas_cache);
+		assert(tsd_nominal(tsd) && !*arenas_tdata_bypassp);
+		tsd_arenas_tdata_set(tsd, arenas_tdata);
+		tsd_narenas_tdata_set(tsd, narenas_tdata);
 	}
 
 	/*
-	 * Copy to cache.  It's possible that the actual number of arenas has
-	 * increased since narenas_total_get() was called above, but that causes
-	 * no correctness issues unless two threads concurrently execute the
-	 * arenas.extend mallctl, which we trust mallctl synchronization to
+	 * Copy to tdata array.  It's possible that the actual number of arenas
+	 * has increased since narenas_total_get() was called above, but that
+	 * causes no correctness issues unless two threads concurrently execute
+	 * the arenas.extend mallctl, which we trust mallctl synchronization to
 	 * prevent.
 	 */
-	malloc_mutex_lock(&arenas_lock);
-	memcpy(arenas_cache, arenas, sizeof(arena_t *) * narenas_actual);
-	malloc_mutex_unlock(&arenas_lock);
-	if (narenas_cache > narenas_actual) {
-		memset(&arenas_cache[narenas_actual], 0, sizeof(arena_t *) *
-		    (narenas_cache - narenas_actual));
+
+	/* Copy/initialize tickers. */
+	for (i = 0; i < narenas_actual; i++) {
+		if (i < narenas_tdata_old) {
+			ticker_copy(&arenas_tdata[i].decay_ticker,
+			    &arenas_tdata_old[i].decay_ticker);
+		} else {
+			ticker_init(&arenas_tdata[i].decay_ticker,
+			    DECAY_NTICKS_PER_UPDATE);
+		}
+	}
+	if (narenas_tdata > narenas_actual) {
+		memset(&arenas_tdata[narenas_actual], 0, sizeof(arena_tdata_t)
+		    * (narenas_tdata - narenas_actual));
 	}
 
-	/* Read the refreshed cache, and init the arena if necessary. */
-	arena = arenas_cache[ind];
-	if (init_if_missing && arena == NULL)
-		arena = arenas_cache[ind] = arena_init(ind);
-	return (arena);
+	/* Read the refreshed tdata array. */
+	tdata = &arenas_tdata[ind];
+label_return:
+	if (arenas_tdata_old != NULL)
+		a0dalloc(arenas_tdata_old);
+	return (tdata);
 }
 
 /* Slow path, called only by arena_choose(). */
@@ -585,15 +576,16 @@ arena_choose_hard(tsd_t *tsd)
 		choose = 0;
 		first_null = narenas_auto;
 		malloc_mutex_lock(&arenas_lock);
-		assert(a0get() != NULL);
+		assert(arena_get(0, false) != NULL);
 		for (i = 1; i < narenas_auto; i++) {
-			if (arenas[i] != NULL) {
+			if (arena_get(i, false) != NULL) {
 				/*
 				 * Choose the first arena that has the lowest
 				 * number of threads assigned to it.
 				 */
-				if (arenas[i]->nthreads <
-				    arenas[choose]->nthreads)
+				if (arena_nthreads_get(arena_get(i, false)) <
+				    arena_nthreads_get(arena_get(choose,
+				    false)))
 					choose = i;
 			} else if (first_null == narenas_auto) {
 				/*
@@ -609,13 +601,13 @@ arena_choose_hard(tsd_t *tsd)
 			}
 		}
 
-		if (arenas[choose]->nthreads == 0
+		if (arena_nthreads_get(arena_get(choose, false)) == 0
 		    || first_null == narenas_auto) {
 			/*
 			 * Use an unloaded arena, or the least loaded arena if
 			 * all arenas are already initialized.
 			 */
-			ret = arenas[choose];
+			ret = arena_get(choose, false);
 		} else {
 			/* Initialize a new arena. */
 			choose = first_null;
@@ -625,10 +617,10 @@ arena_choose_hard(tsd_t *tsd)
 				return (NULL);
 			}
 		}
-		arena_bind_locked(tsd, choose);
+		arena_bind(tsd, choose);
 		malloc_mutex_unlock(&arenas_lock);
 	} else {
-		ret = a0get();
+		ret = arena_get(0, false);
 		arena_bind(tsd, 0);
 	}
 
@@ -660,26 +652,29 @@ arena_cleanup(tsd_t *tsd)
 }
 
 void
-arenas_cache_cleanup(tsd_t *tsd)
+arenas_tdata_cleanup(tsd_t *tsd)
 {
-	arena_t **arenas_cache;
+	arena_tdata_t *arenas_tdata;
 
-	arenas_cache = tsd_arenas_cache_get(tsd);
-	if (arenas_cache != NULL) {
-		tsd_arenas_cache_set(tsd, NULL);
-		a0dalloc(arenas_cache);
+	/* Prevent tsd->arenas_tdata from being (re)created. */
+	*tsd_arenas_tdata_bypassp_get(tsd) = true;
+
+	arenas_tdata = tsd_arenas_tdata_get(tsd);
+	if (arenas_tdata != NULL) {
+		tsd_arenas_tdata_set(tsd, NULL);
+		a0dalloc(arenas_tdata);
 	}
 }
 
 void
-narenas_cache_cleanup(tsd_t *tsd)
+narenas_tdata_cleanup(tsd_t *tsd)
 {
 
 	/* Do nothing. */
 }
 
 void
-arenas_cache_bypass_cleanup(tsd_t *tsd)
+arenas_tdata_bypass_cleanup(tsd_t *tsd)
 {
 
 	/* Do nothing. */
@@ -700,7 +695,7 @@ stats_print_atexit(void)
 		 * continue to allocate.
 		 */
 		for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
-			arena_t *arena = arenas[i];
+			arena_t *arena = arena_get(i, false);
 			if (arena != NULL) {
 				tcache_t *tcache;
 
@@ -843,6 +838,26 @@ malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v,
 }
 
 static void
+malloc_slow_flag_init(void)
+{
+	/*
+	 * Combine the runtime options into malloc_slow for fast path.  Called
+	 * after processing all the options.
+	 */
+	malloc_slow_flags |= (opt_junk_alloc ? flag_opt_junk_alloc : 0)
+	    | (opt_junk_free ? flag_opt_junk_free : 0)
+	    | (opt_quarantine ? flag_opt_quarantine : 0)
+	    | (opt_zero ? flag_opt_zero : 0)
+	    | (opt_utrace ? flag_opt_utrace : 0)
+	    | (opt_xmalloc ? flag_opt_xmalloc : 0);
+
+	if (config_valgrind)
+		malloc_slow_flags |= (in_valgrind ? flag_in_valgrind : 0);
+
+	malloc_slow = (malloc_slow_flags != 0);
+}
+
+static void
 malloc_conf_init(void)
 {
 	unsigned i;
@@ -868,10 +883,13 @@ malloc_conf_init(void)
 			opt_tcache = false;
 	}
 
-	for (i = 0; i < 3; i++) {
+	for (i = 0; i < 4; i++) {
 		/* Get runtime configuration. */
 		switch (i) {
 		case 0:
+			opts = config_malloc_conf;
+			break;
+		case 1:
 			if (je_malloc_conf != NULL) {
 				/*
 				 * Use options that were compiled into the
@@ -884,8 +902,8 @@ malloc_conf_init(void)
 				opts = buf;
 			}
 			break;
-		case 1: {
-			int linklen = 0;
+		case 2: {
+			ssize_t linklen = 0;
 #ifndef _WIN32
 			int saved_errno = errno;
 			const char *linkname =
@@ -911,7 +929,7 @@ malloc_conf_init(void)
 			buf[linklen] = '\0';
 			opts = buf;
 			break;
-		} case 2: {
+		} case 3: {
 			const char *envname =
 #ifdef JEMALLOC_PREFIX
 			    JEMALLOC_CPREFIX"MALLOC_CONF"
@@ -958,7 +976,7 @@ malloc_conf_init(void)
 				if (cont)				\
 					continue;			\
 			}
-#define	CONF_HANDLE_SIZE_T(o, n, min, max, clip)			\
+#define	CONF_HANDLE_T_U(t, o, n, min, max, clip)			\
 			if (CONF_MATCH(n)) {				\
 				uintmax_t um;				\
 				char *end;				\
@@ -972,11 +990,11 @@ malloc_conf_init(void)
 					    k, klen, v, vlen);		\
 				} else if (clip) {			\
 					if ((min) != 0 && um < (min))	\
-						o = (min);		\
+						o = (t)(min);		\
 					else if (um > (max))		\
-						o = (max);		\
+						o = (t)(max);		\
 					else				\
-						o = um;			\
+						o = (t)um;		\
 				} else {				\
 					if (((min) != 0 && um < (min))	\
 					    || um > (max)) {		\
@@ -985,10 +1003,14 @@ malloc_conf_init(void)
 						    "conf value",	\
 						    k, klen, v, vlen);	\
 					} else				\
-						o = um;			\
+						o = (t)um;		\
 				}					\
 				continue;				\
 			}
+#define	CONF_HANDLE_UNSIGNED(o, n, min, max, clip)			\
+			CONF_HANDLE_T_U(unsigned, o, n, min, max, clip)
+#define	CONF_HANDLE_SIZE_T(o, n, min, max, clip)			\
+			CONF_HANDLE_T_U(size_t, o, n, min, max, clip)
 #define	CONF_HANDLE_SSIZE_T(o, n, min, max)				\
 			if (CONF_MATCH(n)) {				\
 				long l;					\
@@ -1056,10 +1078,29 @@ malloc_conf_init(void)
 				}
 				continue;
 			}
-			CONF_HANDLE_SIZE_T(opt_narenas, "narenas", 1,
-			    SIZE_T_MAX, false)
+			CONF_HANDLE_UNSIGNED(opt_narenas, "narenas", 1,
+			    UINT_MAX, false)
+			if (strncmp("purge", k, klen) == 0) {
+				int i;
+				bool match = false;
+				for (i = 0; i < purge_mode_limit; i++) {
+					if (strncmp(purge_mode_names[i], v,
+					    vlen) == 0) {
+						opt_purge = (purge_mode_t)i;
+						match = true;
+						break;
+					}
+				}
+				if (!match) {
+					malloc_conf_error("Invalid conf value",
+					    k, klen, v, vlen);
+				}
+				continue;
+			}
 			CONF_HANDLE_SSIZE_T(opt_lg_dirty_mult, "lg_dirty_mult",
 			    -1, (sizeof(size_t) << 3) - 1)
+			CONF_HANDLE_SSIZE_T(opt_decay_time, "decay_time", -1,
+			    NSTIME_SEC_MAX);
 			CONF_HANDLE_BOOL(opt_stats_print, "stats_print", true)
 			if (config_fill) {
 				if (CONF_MATCH("junk")) {
@@ -1213,7 +1254,8 @@ malloc_init_hard_a0_locked(void)
 	 * Create enough scaffolding to allow recursive allocation in
 	 * malloc_ncpus().
 	 */
-	narenas_total = narenas_auto = 1;
+	narenas_auto = 1;
+	narenas_total_set(narenas_auto);
 	arenas = &a0;
 	memset(arenas, 0, sizeof(arena_t *) * narenas_auto);
 	/*
@@ -1242,26 +1284,37 @@ malloc_init_hard_a0(void)
  *
  * init_lock must be held.
  */
-static void
+static bool
 malloc_init_hard_recursible(void)
 {
+	bool ret = false;
 
 	malloc_init_state = malloc_init_recursible;
 	malloc_mutex_unlock(&init_lock);
 
+	/* LinuxThreads' pthread_setspecific() allocates. */
+	if (malloc_tsd_boot0()) {
+		ret = true;
+		goto label_return;
+	}
+
 	ncpus = malloc_ncpus();
 
 #if (!defined(JEMALLOC_MUTEX_INIT_CB) && !defined(JEMALLOC_ZONE) \
     && !defined(_WIN32) && !defined(__native_client__))
-	/* LinuxThreads's pthread_atfork() allocates. */
+	/* LinuxThreads' pthread_atfork() allocates. */
 	if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent,
 	    jemalloc_postfork_child) != 0) {
+		ret = true;
 		malloc_write("<jemalloc>: Error in pthread_atfork()\n");
 		if (opt_abort)
 			abort();
 	}
 #endif
+
+label_return:
 	malloc_mutex_lock(&init_lock);
+	return (ret);
 }
 
 /* init_lock must be held. */
@@ -1284,30 +1337,26 @@ malloc_init_hard_finish(void)
 	}
 	narenas_auto = opt_narenas;
 	/*
-	 * Make sure that the arenas array can be allocated.  In practice, this
-	 * limit is enough to allow the allocator to function, but the ctl
-	 * machinery will fail to allocate memory at far lower limits.
+	 * Limit the number of arenas to the indexing range of MALLOCX_ARENA().
 	 */
-	if (narenas_auto > chunksize / sizeof(arena_t *)) {
-		narenas_auto = chunksize / sizeof(arena_t *);
+	if (narenas_auto > MALLOCX_ARENA_MAX) {
+		narenas_auto = MALLOCX_ARENA_MAX;
 		malloc_printf("<jemalloc>: Reducing narenas to limit (%d)\n",
 		    narenas_auto);
 	}
-	narenas_total = narenas_auto;
+	narenas_total_set(narenas_auto);
 
 	/* Allocate and initialize arenas. */
-	arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas_total);
+	arenas = (arena_t **)base_alloc(sizeof(arena_t *) *
+	    (MALLOCX_ARENA_MAX+1));
 	if (arenas == NULL)
 		return (true);
-	/*
-	 * Zero the array.  In practice, this should always be pre-zeroed,
-	 * since it was just mmap()ed, but let's be sure.
-	 */
-	memset(arenas, 0, sizeof(arena_t *) * narenas_total);
 	/* Copy the pointer to the one arena that was already initialized. */
-	arenas[0] = a0;
+	arena_set(0, a0);
 
 	malloc_init_state = malloc_init_initialized;
+	malloc_slow_flag_init();
+
 	return (false);
 }
 
@@ -1329,17 +1378,17 @@ malloc_init_hard(void)
 		malloc_mutex_unlock(&init_lock);
 		return (true);
 	}
-	if (malloc_tsd_boot0()) {
+
+	if (malloc_init_hard_recursible()) {
 		malloc_mutex_unlock(&init_lock);
 		return (true);
 	}
+
 	if (config_prof && prof_boot2()) {
 		malloc_mutex_unlock(&init_lock);
 		return (true);
 	}
 
-	malloc_init_hard_recursible();
-
 	if (malloc_init_hard_finish()) {
 		malloc_mutex_unlock(&init_lock);
 		return (true);
@@ -1359,34 +1408,36 @@ malloc_init_hard(void)
  */
 
 static void *
-imalloc_prof_sample(tsd_t *tsd, size_t usize, prof_tctx_t *tctx)
+imalloc_prof_sample(tsd_t *tsd, size_t usize, szind_t ind,
+    prof_tctx_t *tctx, bool slow_path)
 {
 	void *p;
 
 	if (tctx == NULL)
 		return (NULL);
 	if (usize <= SMALL_MAXCLASS) {
-		p = imalloc(tsd, LARGE_MINCLASS);
+		szind_t ind_large = size2index(LARGE_MINCLASS);
+		p = imalloc(tsd, LARGE_MINCLASS, ind_large, slow_path);
 		if (p == NULL)
 			return (NULL);
 		arena_prof_promoted(p, usize);
 	} else
-		p = imalloc(tsd, usize);
+		p = imalloc(tsd, usize, ind, slow_path);
 
 	return (p);
 }
 
 JEMALLOC_ALWAYS_INLINE_C void *
-imalloc_prof(tsd_t *tsd, size_t usize)
+imalloc_prof(tsd_t *tsd, size_t usize, szind_t ind, bool slow_path)
 {
 	void *p;
 	prof_tctx_t *tctx;
 
 	tctx = prof_alloc_prep(tsd, usize, prof_active_get_unlocked(), true);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U))
-		p = imalloc_prof_sample(tsd, usize, tctx);
+		p = imalloc_prof_sample(tsd, usize, ind, tctx, slow_path);
 	else
-		p = imalloc(tsd, usize);
+		p = imalloc(tsd, usize, ind, slow_path);
 	if (unlikely(p == NULL)) {
 		prof_alloc_rollback(tsd, tctx, true);
 		return (NULL);
@@ -1397,23 +1448,44 @@ imalloc_prof(tsd_t *tsd, size_t usize)
 }
 
 JEMALLOC_ALWAYS_INLINE_C void *
-imalloc_body(size_t size, tsd_t **tsd, size_t *usize)
+imalloc_body(size_t size, tsd_t **tsd, size_t *usize, bool slow_path)
 {
+	szind_t ind;
 
-	if (unlikely(malloc_init()))
+	if (slow_path && unlikely(malloc_init()))
 		return (NULL);
 	*tsd = tsd_fetch();
+	ind = size2index(size);
+	if (unlikely(ind >= NSIZES))
+		return (NULL);
 
-	if (config_prof && opt_prof) {
-		*usize = s2u(size);
-		if (unlikely(*usize == 0))
-			return (NULL);
-		return (imalloc_prof(*tsd, *usize));
+	if (config_stats || (config_prof && opt_prof) || (slow_path &&
+	    config_valgrind && unlikely(in_valgrind))) {
+		*usize = index2size(ind);
+		assert(*usize > 0 && *usize <= HUGE_MAXCLASS);
 	}
 
-	if (config_stats || (config_valgrind && unlikely(in_valgrind)))
-		*usize = s2u(size);
-	return (imalloc(*tsd, size));
+	if (config_prof && opt_prof)
+		return (imalloc_prof(*tsd, *usize, ind, slow_path));
+
+	return (imalloc(*tsd, size, ind, slow_path));
+}
+
+JEMALLOC_ALWAYS_INLINE_C void
+imalloc_post_check(void *ret, tsd_t *tsd, size_t usize, bool slow_path)
+{
+	if (unlikely(ret == NULL)) {
+		if (slow_path && config_xmalloc && unlikely(opt_xmalloc)) {
+			malloc_write("<jemalloc>: Error in malloc(): "
+			    "out of memory\n");
+			abort();
+		}
+		set_errno(ENOMEM);
+	}
+	if (config_stats && likely(ret != NULL)) {
+		assert(usize == isalloc(ret, config_prof));
+		*tsd_thread_allocatedp_get(tsd) += usize;
+	}
 }
 
 JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
@@ -1428,21 +1500,20 @@ je_malloc(size_t size)
 	if (size == 0)
 		size = 1;
 
-	ret = imalloc_body(size, &tsd, &usize);
-	if (unlikely(ret == NULL)) {
-		if (config_xmalloc && unlikely(opt_xmalloc)) {
-			malloc_write("<jemalloc>: Error in malloc(): "
-			    "out of memory\n");
-			abort();
-		}
-		set_errno(ENOMEM);
-	}
-	if (config_stats && likely(ret != NULL)) {
-		assert(usize == isalloc(ret, config_prof));
-		*tsd_thread_allocatedp_get(tsd) += usize;
+	if (likely(!malloc_slow)) {
+		/*
+		 * imalloc_body() is inlined so that fast and slow paths are
+		 * generated separately with statically known slow_path.
+		 */
+		ret = imalloc_body(size, &tsd, &usize, false);
+		imalloc_post_check(ret, tsd, usize, false);
+	} else {
+		ret = imalloc_body(size, &tsd, &usize, true);
+		imalloc_post_check(ret, tsd, usize, true);
+		UTRACE(0, size, ret);
+		JEMALLOC_VALGRIND_MALLOC(ret != NULL, ret, usize, false);
 	}
-	UTRACE(0, size, ret);
-	JEMALLOC_VALGRIND_MALLOC(ret != NULL, ret, usize, false);
+
 	return (ret);
 }
 
@@ -1519,7 +1590,7 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment)
 	}
 
 	usize = sa2u(size, alignment);
-	if (unlikely(usize == 0)) {
+	if (unlikely(usize == 0 || usize > HUGE_MAXCLASS)) {
 		result = NULL;
 		goto label_oom;
 	}
@@ -1580,34 +1651,35 @@ je_aligned_alloc(size_t alignment, size_t size)
 }
 
 static void *
-icalloc_prof_sample(tsd_t *tsd, size_t usize, prof_tctx_t *tctx)
+icalloc_prof_sample(tsd_t *tsd, size_t usize, szind_t ind, prof_tctx_t *tctx)
 {
 	void *p;
 
 	if (tctx == NULL)
 		return (NULL);
 	if (usize <= SMALL_MAXCLASS) {
-		p = icalloc(tsd, LARGE_MINCLASS);
+		szind_t ind_large = size2index(LARGE_MINCLASS);
+		p = icalloc(tsd, LARGE_MINCLASS, ind_large);
 		if (p == NULL)
 			return (NULL);
 		arena_prof_promoted(p, usize);
 	} else
-		p = icalloc(tsd, usize);
+		p = icalloc(tsd, usize, ind);
 
 	return (p);
 }
 
 JEMALLOC_ALWAYS_INLINE_C void *
-icalloc_prof(tsd_t *tsd, size_t usize)
+icalloc_prof(tsd_t *tsd, size_t usize, szind_t ind)
 {
 	void *p;
 	prof_tctx_t *tctx;
 
 	tctx = prof_alloc_prep(tsd, usize, prof_active_get_unlocked(), true);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U))
-		p = icalloc_prof_sample(tsd, usize, tctx);
+		p = icalloc_prof_sample(tsd, usize, ind, tctx);
 	else
-		p = icalloc(tsd, usize);
+		p = icalloc(tsd, usize, ind);
 	if (unlikely(p == NULL)) {
 		prof_alloc_rollback(tsd, tctx, true);
 		return (NULL);
@@ -1625,6 +1697,7 @@ je_calloc(size_t num, size_t size)
 	void *ret;
 	tsd_t *tsd;
 	size_t num_size;
+	szind_t ind;
 	size_t usize JEMALLOC_CC_SILENCE_INIT(0);
 
 	if (unlikely(malloc_init())) {
@@ -1654,17 +1727,18 @@ je_calloc(size_t num, size_t size)
 		goto label_return;
 	}
 
+	ind = size2index(num_size);
+	if (unlikely(ind >= NSIZES)) {
+		ret = NULL;
+		goto label_return;
+	}
 	if (config_prof && opt_prof) {
-		usize = s2u(num_size);
-		if (unlikely(usize == 0)) {
-			ret = NULL;
-			goto label_return;
-		}
-		ret = icalloc_prof(tsd, usize);
+		usize = index2size(ind);
+		ret = icalloc_prof(tsd, usize, ind);
 	} else {
 		if (config_stats || (config_valgrind && unlikely(in_valgrind)))
-			usize = s2u(num_size);
-		ret = icalloc(tsd, num_size);
+			usize = index2size(ind);
+		ret = icalloc(tsd, num_size, ind);
 	}
 
 label_return:
@@ -1729,7 +1803,7 @@ irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize)
 }
 
 JEMALLOC_INLINE_C void
-ifree(tsd_t *tsd, void *ptr, tcache_t *tcache)
+ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path)
 {
 	size_t usize;
 	UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0);
@@ -1744,10 +1818,15 @@ ifree(tsd_t *tsd, void *ptr, tcache_t *tcache)
 		usize = isalloc(ptr, config_prof);
 	if (config_stats)
 		*tsd_thread_deallocatedp_get(tsd) += usize;
-	if (config_valgrind && unlikely(in_valgrind))
-		rzsize = p2rz(ptr);
-	iqalloc(tsd, ptr, tcache);
-	JEMALLOC_VALGRIND_FREE(ptr, rzsize);
+
+	if (likely(!slow_path))
+		iqalloc(tsd, ptr, tcache, false);
+	else {
+		if (config_valgrind && unlikely(in_valgrind))
+			rzsize = p2rz(ptr);
+		iqalloc(tsd, ptr, tcache, true);
+		JEMALLOC_VALGRIND_FREE(ptr, rzsize);
+	}
 }
 
 JEMALLOC_INLINE_C void
@@ -1784,7 +1863,7 @@ je_realloc(void *ptr, size_t size)
 			/* realloc(ptr, 0) is equivalent to free(ptr). */
 			UTRACE(ptr, 0, 0);
 			tsd = tsd_fetch();
-			ifree(tsd, ptr, tcache_get(tsd, false));
+			ifree(tsd, ptr, tcache_get(tsd, false), true);
 			return (NULL);
 		}
 		size = 1;
@@ -1801,8 +1880,8 @@ je_realloc(void *ptr, size_t size)
 
 		if (config_prof && opt_prof) {
 			usize = s2u(size);
-			ret = unlikely(usize == 0) ? NULL : irealloc_prof(tsd,
-			    ptr, old_usize, usize);
+			ret = unlikely(usize == 0 || usize > HUGE_MAXCLASS) ?
+			    NULL : irealloc_prof(tsd, ptr, old_usize, usize);
 		} else {
 			if (config_stats || (config_valgrind &&
 			    unlikely(in_valgrind)))
@@ -1811,7 +1890,10 @@ je_realloc(void *ptr, size_t size)
 		}
 	} else {
 		/* realloc(NULL, size) is equivalent to malloc(size). */
-		ret = imalloc_body(size, &tsd, &usize);
+		if (likely(!malloc_slow))
+			ret = imalloc_body(size, &tsd, &usize, false);
+		else
+			ret = imalloc_body(size, &tsd, &usize, true);
 	}
 
 	if (unlikely(ret == NULL)) {
@@ -1840,7 +1922,10 @@ je_free(void *ptr)
 	UTRACE(ptr, 0, 0);
 	if (likely(ptr != NULL)) {
 		tsd_t *tsd = tsd_fetch();
-		ifree(tsd, ptr, tcache_get(tsd, false));
+		if (likely(!malloc_slow))
+			ifree(tsd, ptr, tcache_get(tsd, false), false);
+		else
+			ifree(tsd, ptr, tcache_get(tsd, false), true);
 	}
 }
 
@@ -1927,7 +2012,8 @@ imallocx_flags_decode_hard(tsd_t *tsd, size_t size, int flags, size_t *usize,
 		*alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags);
 		*usize = sa2u(size, *alignment);
 	}
-	assert(*usize != 0);
+	if (unlikely(*usize == 0 || *usize > HUGE_MAXCLASS))
+		return (true);
 	*zero = MALLOCX_ZERO_GET(flags);
 	if ((flags & MALLOCX_TCACHE_MASK) != 0) {
 		if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE)
@@ -1938,7 +2024,7 @@ imallocx_flags_decode_hard(tsd_t *tsd, size_t size, int flags, size_t *usize,
 		*tcache = tcache_get(tsd, true);
 	if ((flags & MALLOCX_ARENA_MASK) != 0) {
 		unsigned arena_ind = MALLOCX_ARENA_GET(flags);
-		*arena = arena_get(tsd, arena_ind, true, true);
+		*arena = arena_get(arena_ind, true);
 		if (unlikely(*arena == NULL))
 			return (true);
 	} else
@@ -1953,7 +2039,8 @@ imallocx_flags_decode(tsd_t *tsd, size_t size, int flags, size_t *usize,
 
 	if (likely(flags == 0)) {
 		*usize = s2u(size);
-		assert(*usize != 0);
+		if (unlikely(*usize == 0 || *usize > HUGE_MAXCLASS))
+			return (true);
 		*alignment = 0;
 		*zero = false;
 		*tcache = tcache_get(tsd, true);
@@ -1969,12 +2056,15 @@ JEMALLOC_ALWAYS_INLINE_C void *
 imallocx_flags(tsd_t *tsd, size_t usize, size_t alignment, bool zero,
     tcache_t *tcache, arena_t *arena)
 {
+	szind_t ind;
 
 	if (unlikely(alignment != 0))
 		return (ipalloct(tsd, usize, alignment, zero, tcache, arena));
+	ind = size2index(usize);
+	assert(ind < NSIZES);
 	if (unlikely(zero))
-		return (icalloct(tsd, usize, tcache, arena));
-	return (imalloct(tsd, usize, tcache, arena));
+		return (icalloct(tsd, usize, ind, tcache, arena));
+	return (imalloct(tsd, usize, ind, tcache, arena));
 }
 
 static void *
@@ -2038,9 +2128,15 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize)
 	arena_t *arena;
 
 	if (likely(flags == 0)) {
-		if (config_stats || (config_valgrind && unlikely(in_valgrind)))
-			*usize = s2u(size);
-		return (imalloc(tsd, size));
+		szind_t ind = size2index(size);
+		if (unlikely(ind >= NSIZES))
+			return (NULL);
+		if (config_stats || (config_valgrind &&
+		    unlikely(in_valgrind))) {
+			*usize = index2size(ind);
+			assert(*usize > 0 && *usize <= HUGE_MAXCLASS);
+		}
+		return (imalloc(tsd, size, ind, true));
 	}
 
 	if (unlikely(imallocx_flags_decode_hard(tsd, size, flags, usize,
@@ -2176,7 +2272,7 @@ je_rallocx(void *ptr, size_t size, int flags)
 
 	if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) {
 		unsigned arena_ind = MALLOCX_ARENA_GET(flags);
-		arena = arena_get(tsd, arena_ind, true, true);
+		arena = arena_get(arena_ind, true);
 		if (unlikely(arena == NULL))
 			goto label_oom;
 	} else
@@ -2196,7 +2292,8 @@ je_rallocx(void *ptr, size_t size, int flags)
 
 	if (config_prof && opt_prof) {
 		usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment);
-		assert(usize != 0);
+		if (unlikely(usize == 0 || usize > HUGE_MAXCLASS))
+			goto label_oom;
 		p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize,
 		    zero, tcache, arena);
 		if (unlikely(p == NULL))
@@ -2229,12 +2326,12 @@ label_oom:
 }
 
 JEMALLOC_ALWAYS_INLINE_C size_t
-ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra,
-    size_t alignment, bool zero)
+ixallocx_helper(tsd_t *tsd, void *ptr, size_t old_usize, size_t size,
+    size_t extra, size_t alignment, bool zero)
 {
 	size_t usize;
 
-	if (ixalloc(ptr, old_usize, size, extra, alignment, zero))
+	if (ixalloc(tsd, ptr, old_usize, size, extra, alignment, zero))
 		return (old_usize);
 	usize = isalloc(ptr, config_prof);
 
@@ -2242,14 +2339,15 @@ ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra,
 }
 
 static size_t
-ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra,
-    size_t alignment, bool zero, prof_tctx_t *tctx)
+ixallocx_prof_sample(tsd_t *tsd, void *ptr, size_t old_usize, size_t size,
+    size_t extra, size_t alignment, bool zero, prof_tctx_t *tctx)
 {
 	size_t usize;
 
 	if (tctx == NULL)
 		return (old_usize);
-	usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero);
+	usize = ixallocx_helper(tsd, ptr, old_usize, size, extra, alignment,
+	    zero);
 
 	return (usize);
 }
@@ -2270,16 +2368,29 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size,
 	 * prof_alloc_prep() to decide whether to capture a backtrace.
 	 * prof_realloc() will use the actual usize to decide whether to sample.
 	 */
-	usize_max = (alignment == 0) ? s2u(size+extra) : sa2u(size+extra,
-	    alignment);
-	assert(usize_max != 0);
+	if (alignment == 0) {
+		usize_max = s2u(size+extra);
+		assert(usize_max > 0 && usize_max <= HUGE_MAXCLASS);
+	} else {
+		usize_max = sa2u(size+extra, alignment);
+		if (unlikely(usize_max == 0 || usize_max > HUGE_MAXCLASS)) {
+			/*
+			 * usize_max is out of range, and chances are that
+			 * allocation will fail, but use the maximum possible
+			 * value and carry on with prof_alloc_prep(), just in
+			 * case allocation succeeds.
+			 */
+			usize_max = HUGE_MAXCLASS;
+		}
+	}
 	tctx = prof_alloc_prep(tsd, usize_max, prof_active, false);
+
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) {
-		usize = ixallocx_prof_sample(ptr, old_usize, size, extra,
+		usize = ixallocx_prof_sample(tsd, ptr, old_usize, size, extra,
 		    alignment, zero, tctx);
 	} else {
-		usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
-		    zero);
+		usize = ixallocx_helper(tsd, ptr, old_usize, size, extra,
+		    alignment, zero);
 	}
 	if (usize == old_usize) {
 		prof_alloc_rollback(tsd, tctx, false);
@@ -2309,15 +2420,21 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags)
 
 	old_usize = isalloc(ptr, config_prof);
 
-	/* Clamp extra if necessary to avoid (size + extra) overflow. */
-	if (unlikely(size + extra > HUGE_MAXCLASS)) {
-		/* Check for size overflow. */
-		if (unlikely(size > HUGE_MAXCLASS)) {
-			usize = old_usize;
-			goto label_not_resized;
-		}
-		extra = HUGE_MAXCLASS - size;
+	/*
+	 * The API explicitly absolves itself of protecting against (size +
+	 * extra) numerical overflow, but we may need to clamp extra to avoid
+	 * exceeding HUGE_MAXCLASS.
+	 *
+	 * Ordinarily, size limit checking is handled deeper down, but here we
+	 * have to check as part of (size + extra) clamping, since we need the
+	 * clamped value in the above helper functions.
+	 */
+	if (unlikely(size > HUGE_MAXCLASS)) {
+		usize = old_usize;
+		goto label_not_resized;
 	}
+	if (unlikely(HUGE_MAXCLASS - size < extra))
+		extra = HUGE_MAXCLASS - size;
 
 	if (config_valgrind && unlikely(in_valgrind))
 		old_rzsize = u2rz(old_usize);
@@ -2326,8 +2443,8 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags)
 		usize = ixallocx_prof(tsd, ptr, old_usize, size, extra,
 		    alignment, zero);
 	} else {
-		usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
-		    zero);
+		usize = ixallocx_helper(tsd, ptr, old_usize, size, extra,
+		    alignment, zero);
 	}
 	if (unlikely(usize == old_usize))
 		goto label_not_resized;
@@ -2379,7 +2496,7 @@ je_dallocx(void *ptr, int flags)
 		tcache = tcache_get(tsd, false);
 
 	UTRACE(ptr, 0, 0);
-	ifree(tsd_fetch(), ptr, tcache);
+	ifree(tsd_fetch(), ptr, tcache, true);
 }
 
 JEMALLOC_ALWAYS_INLINE_C size_t
@@ -2391,7 +2508,6 @@ inallocx(size_t size, int flags)
 		usize = s2u(size);
 	else
 		usize = sa2u(size, MALLOCX_ALIGN_GET_SPECIFIED(flags));
-	assert(usize != 0);
 	return (usize);
 }
 
@@ -2424,13 +2540,18 @@ JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW
 JEMALLOC_ATTR(pure)
 je_nallocx(size_t size, int flags)
 {
+	size_t usize;
 
 	assert(size != 0);
 
 	if (unlikely(malloc_init()))
 		return (0);
 
-	return (inallocx(size, flags));
+	usize = inallocx(size, flags);
+	if (unlikely(usize > HUGE_MAXCLASS))
+		return (0);
+
+	return (usize);
 }
 
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
@@ -2628,7 +2749,7 @@ JEMALLOC_EXPORT void
 _malloc_prefork(void)
 #endif
 {
-	unsigned i;
+	unsigned i, narenas;
 
 #ifdef JEMALLOC_MUTEX_INIT_CB
 	if (!malloc_initialized())
@@ -2640,9 +2761,11 @@ _malloc_prefork(void)
 	ctl_prefork();
 	prof_prefork();
 	malloc_mutex_prefork(&arenas_lock);
-	for (i = 0; i < narenas_total; i++) {
-		if (arenas[i] != NULL)
-			arena_prefork(arenas[i]);
+	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
+		arena_t *arena;
+
+		if ((arena = arena_get(i, false)) != NULL)
+			arena_prefork(arena);
 	}
 	chunk_prefork();
 	base_prefork();
@@ -2656,7 +2779,7 @@ JEMALLOC_EXPORT void
 _malloc_postfork(void)
 #endif
 {
-	unsigned i;
+	unsigned i, narenas;
 
 #ifdef JEMALLOC_MUTEX_INIT_CB
 	if (!malloc_initialized())
@@ -2667,9 +2790,11 @@ _malloc_postfork(void)
 	/* Release all mutexes, now that fork() has completed. */
 	base_postfork_parent();
 	chunk_postfork_parent();
-	for (i = 0; i < narenas_total; i++) {
-		if (arenas[i] != NULL)
-			arena_postfork_parent(arenas[i]);
+	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
+		arena_t *arena;
+
+		if ((arena = arena_get(i, false)) != NULL)
+			arena_postfork_parent(arena);
 	}
 	malloc_mutex_postfork_parent(&arenas_lock);
 	prof_postfork_parent();
@@ -2679,16 +2804,18 @@ _malloc_postfork(void)
 void
 jemalloc_postfork_child(void)
 {
-	unsigned i;
+	unsigned i, narenas;
 
 	assert(malloc_initialized());
 
 	/* Release all mutexes, now that fork() has completed. */
 	base_postfork_child();
 	chunk_postfork_child();
-	for (i = 0; i < narenas_total; i++) {
-		if (arenas[i] != NULL)
-			arena_postfork_child(arenas[i]);
+	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
+		arena_t *arena;
+
+		if ((arena = arena_get(i, false)) != NULL)
+			arena_postfork_child(arena);
 	}
 	malloc_mutex_postfork_child(&arenas_lock);
 	prof_postfork_child();
diff --git a/contrib/jemalloc/src/nstime.c b/contrib/jemalloc/src/nstime.c
new file mode 100644
index 000000000000..4cf90b5840fb
--- /dev/null
+++ b/contrib/jemalloc/src/nstime.c
@@ -0,0 +1,148 @@
+#include "jemalloc/internal/jemalloc_internal.h"
+
+#define	BILLION	UINT64_C(1000000000)
+
+void
+nstime_init(nstime_t *time, uint64_t ns)
+{
+
+	time->ns = ns;
+}
+
+void
+nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec)
+{
+
+	time->ns = sec * BILLION + nsec;
+}
+
+uint64_t
+nstime_ns(const nstime_t *time)
+{
+
+	return (time->ns);
+}
+
+uint64_t
+nstime_sec(const nstime_t *time)
+{
+
+	return (time->ns / BILLION);
+}
+
+uint64_t
+nstime_nsec(const nstime_t *time)
+{
+
+	return (time->ns % BILLION);
+}
+
+void
+nstime_copy(nstime_t *time, const nstime_t *source)
+{
+
+	*time = *source;
+}
+
+int
+nstime_compare(const nstime_t *a, const nstime_t *b)
+{
+
+	return ((a->ns > b->ns) - (a->ns < b->ns));
+}
+
+void
+nstime_add(nstime_t *time, const nstime_t *addend)
+{
+
+	assert(UINT64_MAX - time->ns >= addend->ns);
+
+	time->ns += addend->ns;
+}
+
+void
+nstime_subtract(nstime_t *time, const nstime_t *subtrahend)
+{
+
+	assert(nstime_compare(time, subtrahend) >= 0);
+
+	time->ns -= subtrahend->ns;
+}
+
+void
+nstime_imultiply(nstime_t *time, uint64_t multiplier)
+{
+
+	assert((((time->ns | multiplier) & (UINT64_MAX << (sizeof(uint64_t) <<
+	    2))) == 0) || ((time->ns * multiplier) / multiplier == time->ns));
+
+	time->ns *= multiplier;
+}
+
+void
+nstime_idivide(nstime_t *time, uint64_t divisor)
+{
+
+	assert(divisor != 0);
+
+	time->ns /= divisor;
+}
+
+uint64_t
+nstime_divide(const nstime_t *time, const nstime_t *divisor)
+{
+
+	assert(divisor->ns != 0);
+
+	return (time->ns / divisor->ns);
+}
+
+#ifdef JEMALLOC_JET
+#undef nstime_update
+#define	nstime_update JEMALLOC_N(nstime_update_impl)
+#endif
+bool
+nstime_update(nstime_t *time)
+{
+	nstime_t old_time;
+
+	nstime_copy(&old_time, time);
+
+#ifdef _WIN32
+	{
+		FILETIME ft;
+		uint64_t ticks;
+		GetSystemTimeAsFileTime(&ft);
+		ticks = (((uint64_t)ft.dwHighDateTime) << 32) |
+		    ft.dwLowDateTime;
+		time->ns = ticks * 100;
+	}
+#elif JEMALLOC_CLOCK_GETTIME
+	{
+		struct timespec ts;
+
+		if (sysconf(_SC_MONOTONIC_CLOCK) > 0)
+			clock_gettime(CLOCK_MONOTONIC, &ts);
+		else
+			clock_gettime(CLOCK_REALTIME, &ts);
+		time->ns = ts.tv_sec * BILLION + ts.tv_nsec;
+	}
+#else
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	time->ns = tv.tv_sec * BILLION + tv.tv_usec * 1000;
+#endif
+
+	/* Handle non-monotonic clocks. */
+	if (unlikely(nstime_compare(&old_time, time) > 0)) {
+		nstime_copy(time, &old_time);
+		return (true);
+	}
+
+	return (false);
+}
+#ifdef JEMALLOC_JET
+#undef nstime_update
+#define	nstime_update JEMALLOC_N(nstime_update)
+nstime_update_t *nstime_update = JEMALLOC_N(nstime_update_impl);
+#endif
diff --git a/contrib/jemalloc/src/prng.c b/contrib/jemalloc/src/prng.c
new file mode 100644
index 000000000000..76646a2a4c34
--- /dev/null
+++ b/contrib/jemalloc/src/prng.c
@@ -0,0 +1,2 @@
+#define	JEMALLOC_PRNG_C_
+#include "jemalloc/internal/jemalloc_internal.h"
diff --git a/contrib/jemalloc/src/prof.c b/contrib/jemalloc/src/prof.c
index 5d2b9598fdb4..b38722770d9a 100644
--- a/contrib/jemalloc/src/prof.c
+++ b/contrib/jemalloc/src/prof.c
@@ -109,7 +109,7 @@ static char		prof_dump_buf[
     1
 #endif
 ];
-static unsigned		prof_dump_buf_end;
+static size_t		prof_dump_buf_end;
 static int		prof_dump_fd;
 
 /* Do not dump any profiles until bootstrapping is complete. */
@@ -551,9 +551,9 @@ prof_gctx_create(tsd_t *tsd, prof_bt_t *bt)
 	/*
 	 * Create a single allocation that has space for vec of length bt->len.
 	 */
-	prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsd, offsetof(prof_gctx_t,
-	    vec) + (bt->len * sizeof(void *)), false, tcache_get(tsd, true),
-	    true, NULL);
+	size_t size = offsetof(prof_gctx_t, vec) + (bt->len * sizeof(void *));
+	prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsd, size,
+	    size2index(size), false, tcache_get(tsd, true), true, NULL, true);
 	if (gctx == NULL)
 		return (NULL);
 	gctx->lock = prof_gctx_mutex_choose();
@@ -594,7 +594,7 @@ prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, prof_gctx_t *gctx,
 		prof_leave(tsd, tdata_self);
 		/* Destroy gctx. */
 		malloc_mutex_unlock(gctx->lock);
-		idalloctm(tsd, gctx, tcache_get(tsd, false), true);
+		idalloctm(tsd, gctx, tcache_get(tsd, false), true, true);
 	} else {
 		/*
 		 * Compensate for increment in prof_tctx_destroy() or
@@ -701,7 +701,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx)
 		prof_tdata_destroy(tsd, tdata, false);
 
 	if (destroy_tctx)
-		idalloctm(tsd, tctx, tcache_get(tsd, false), true);
+		idalloctm(tsd, tctx, tcache_get(tsd, false), true, true);
 }
 
 static bool
@@ -730,7 +730,8 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata,
 		if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) {
 			/* OOM. */
 			prof_leave(tsd, tdata);
-			idalloctm(tsd, gctx.v, tcache_get(tsd, false), true);
+			idalloctm(tsd, gctx.v, tcache_get(tsd, false), true,
+			    true);
 			return (true);
 		}
 		new_gctx = true;
@@ -789,8 +790,9 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt)
 
 		/* Link a prof_tctx_t into gctx for this thread. */
 		tcache = tcache_get(tsd, true);
-		ret.v = iallocztm(tsd, sizeof(prof_tctx_t), false, tcache, true,
-		    NULL);
+		ret.v = iallocztm(tsd, sizeof(prof_tctx_t),
+		    size2index(sizeof(prof_tctx_t)), false, tcache, true, NULL,
+		    true);
 		if (ret.p == NULL) {
 			if (new_gctx)
 				prof_gctx_try_destroy(tsd, tdata, gctx, tdata);
@@ -810,7 +812,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt)
 		if (error) {
 			if (new_gctx)
 				prof_gctx_try_destroy(tsd, tdata, gctx, tdata);
-			idalloctm(tsd, ret.v, tcache, true);
+			idalloctm(tsd, ret.v, tcache, true, true);
 			return (NULL);
 		}
 		malloc_mutex_lock(gctx->lock);
@@ -869,8 +871,7 @@ prof_sample_threshold_update(prof_tdata_t *tdata)
 	 *   pp 500
 	 *   (http://luc.devroye.org/rnbookindex.html)
 	 */
-	prng64(r, 53, tdata->prng_state, UINT64_C(6364136223846793005),
-	    UINT64_C(1442695040888963407));
+	r = prng_lg_range(&tdata->prng_state, 53);
 	u = (double)r * (1.0/9007199254740992.0L);
 	tdata->bytes_until_sample = (uint64_t)(log(u) /
 	    log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample))))
@@ -988,7 +989,7 @@ prof_dump_close(bool propagate_err)
 static bool
 prof_dump_write(bool propagate_err, const char *s)
 {
-	unsigned i, slen, n;
+	size_t i, slen, n;
 
 	cassert(config_prof);
 
@@ -1211,7 +1212,7 @@ prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs)
 					tctx_tree_remove(&gctx->tctxs,
 					    to_destroy);
 					idalloctm(tsd, to_destroy,
-					    tcache_get(tsd, false), true);
+					    tcache_get(tsd, false), true, true);
 				} else
 					next = NULL;
 			} while (next != NULL);
@@ -1358,6 +1359,7 @@ label_return:
 	return (ret);
 }
 
+#ifndef _WIN32
 JEMALLOC_FORMAT_PRINTF(1, 2)
 static int
 prof_open_maps(const char *format, ...)
@@ -1373,6 +1375,18 @@ prof_open_maps(const char *format, ...)
 
 	return (mfd);
 }
+#endif
+
+static int
+prof_getpid(void)
+{
+
+#ifdef _WIN32
+	return (GetCurrentProcessId());
+#else
+	return (getpid());
+#endif
+}
 
 static bool
 prof_dump_maps(bool propagate_err)
@@ -1383,9 +1397,11 @@ prof_dump_maps(bool propagate_err)
 	cassert(config_prof);
 #ifdef __FreeBSD__
 	mfd = prof_open_maps("/proc/curproc/map");
+#elif defined(_WIN32)
+	mfd = -1; // Not implemented
 #else
 	{
-		int pid = getpid();
+		int pid = prof_getpid();
 
 		mfd = prof_open_maps("/proc/%d/task/%d/maps", pid, pid);
 		if (mfd == -1)
@@ -1554,12 +1570,12 @@ prof_dump_filename(char *filename, char v, uint64_t vseq)
 	        /* "<prefix>.<pid>.<seq>.v<vseq>.heap" */
 		malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE,
 		    "%s.%d.%"FMTu64".%c%"FMTu64".heap",
-		    opt_prof_prefix, (int)getpid(), prof_dump_seq, v, vseq);
+		    opt_prof_prefix, prof_getpid(), prof_dump_seq, v, vseq);
 	} else {
 	        /* "<prefix>.<pid>.<seq>.<v>.heap" */
 		malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE,
 		    "%s.%d.%"FMTu64".%c.heap",
-		    opt_prof_prefix, (int)getpid(), prof_dump_seq, v);
+		    opt_prof_prefix, prof_getpid(), prof_dump_seq, v);
 	}
 	prof_dump_seq++;
 }
@@ -1714,8 +1730,8 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim,
 
 	/* Initialize an empty cache for this thread. */
 	tcache = tcache_get(tsd, true);
-	tdata = (prof_tdata_t *)iallocztm(tsd, sizeof(prof_tdata_t), false,
-	    tcache, true, NULL);
+	tdata = (prof_tdata_t *)iallocztm(tsd, sizeof(prof_tdata_t),
+	    size2index(sizeof(prof_tdata_t)), false, tcache, true, NULL, true);
 	if (tdata == NULL)
 		return (NULL);
 
@@ -1729,7 +1745,7 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim,
 
 	if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS,
 	    prof_bt_hash, prof_bt_keycomp)) {
-		idalloctm(tsd, tdata, tcache, true);
+		idalloctm(tsd, tdata, tcache, true, true);
 		return (NULL);
 	}
 
@@ -1784,9 +1800,9 @@ prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata,
 
 	tcache = tcache_get(tsd, false);
 	if (tdata->thread_name != NULL)
-		idalloctm(tsd, tdata->thread_name, tcache, true);
+		idalloctm(tsd, tdata->thread_name, tcache, true, true);
 	ckh_delete(tsd, &tdata->bt2tctx);
-	idalloctm(tsd, tdata, tcache, true);
+	idalloctm(tsd, tdata, tcache, true, true);
 }
 
 static void
@@ -1947,7 +1963,8 @@ prof_thread_name_alloc(tsd_t *tsd, const char *thread_name)
 	if (size == 1)
 		return ("");
 
-	ret = iallocztm(tsd, size, false, tcache_get(tsd, true), true, NULL);
+	ret = iallocztm(tsd, size, size2index(size), false, tcache_get(tsd,
+	    true), true, NULL, true);
 	if (ret == NULL)
 		return (NULL);
 	memcpy(ret, thread_name, size);
@@ -1980,7 +1997,7 @@ prof_thread_name_set(tsd_t *tsd, const char *thread_name)
 
 	if (tdata->thread_name != NULL) {
 		idalloctm(tsd, tdata->thread_name, tcache_get(tsd, false),
-		    true);
+		    true, true);
 		tdata->thread_name = NULL;
 	}
 	if (strlen(s) > 0)
diff --git a/contrib/jemalloc/src/quarantine.c b/contrib/jemalloc/src/quarantine.c
index 6c43dfcaa3af..ff8801cb7c7c 100644
--- a/contrib/jemalloc/src/quarantine.c
+++ b/contrib/jemalloc/src/quarantine.c
@@ -23,12 +23,14 @@ static quarantine_t *
 quarantine_init(tsd_t *tsd, size_t lg_maxobjs)
 {
 	quarantine_t *quarantine;
+	size_t size;
 
 	assert(tsd_nominal(tsd));
 
-	quarantine = (quarantine_t *)iallocztm(tsd, offsetof(quarantine_t, objs)
-	    + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)), false,
-	    tcache_get(tsd, true), true, NULL);
+	size = offsetof(quarantine_t, objs) + ((ZU(1) << lg_maxobjs) *
+	    sizeof(quarantine_obj_t));
+	quarantine = (quarantine_t *)iallocztm(tsd, size, size2index(size),
+	    false, tcache_get(tsd, true), true, NULL, true);
 	if (quarantine == NULL)
 		return (NULL);
 	quarantine->curbytes = 0;
@@ -55,7 +57,7 @@ quarantine_alloc_hook_work(tsd_t *tsd)
 	if (tsd_quarantine_get(tsd) == NULL)
 		tsd_quarantine_set(tsd, quarantine);
 	else
-		idalloctm(tsd, quarantine, tcache_get(tsd, false), true);
+		idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true);
 }
 
 static quarantine_t *
@@ -87,7 +89,7 @@ quarantine_grow(tsd_t *tsd, quarantine_t *quarantine)
 		memcpy(&ret->objs[ncopy_a], quarantine->objs, ncopy_b *
 		    sizeof(quarantine_obj_t));
 	}
-	idalloctm(tsd, quarantine, tcache_get(tsd, false), true);
+	idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true);
 
 	tsd_quarantine_set(tsd, ret);
 	return (ret);
@@ -98,7 +100,7 @@ quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine)
 {
 	quarantine_obj_t *obj = &quarantine->objs[quarantine->first];
 	assert(obj->usize == isalloc(obj->ptr, config_prof));
-	idalloctm(tsd, obj->ptr, NULL, false);
+	idalloctm(tsd, obj->ptr, NULL, false, true);
 	quarantine->curbytes -= obj->usize;
 	quarantine->curobjs--;
 	quarantine->first = (quarantine->first + 1) & ((ZU(1) <<
@@ -123,7 +125,7 @@ quarantine(tsd_t *tsd, void *ptr)
 	assert(opt_quarantine);
 
 	if ((quarantine = tsd_quarantine_get(tsd)) == NULL) {
-		idalloctm(tsd, ptr, NULL, false);
+		idalloctm(tsd, ptr, NULL, false, true);
 		return;
 	}
 	/*
@@ -162,7 +164,7 @@ quarantine(tsd_t *tsd, void *ptr)
 		}
 	} else {
 		assert(quarantine->curbytes == 0);
-		idalloctm(tsd, ptr, NULL, false);
+		idalloctm(tsd, ptr, NULL, false, true);
 	}
 }
 
@@ -177,7 +179,7 @@ quarantine_cleanup(tsd_t *tsd)
 	quarantine = tsd_quarantine_get(tsd);
 	if (quarantine != NULL) {
 		quarantine_drain(tsd, quarantine, 0);
-		idalloctm(tsd, quarantine, tcache_get(tsd, false), true);
+		idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true);
 		tsd_quarantine_set(tsd, NULL);
 	}
 }
diff --git a/contrib/jemalloc/src/stats.c b/contrib/jemalloc/src/stats.c
index 154c3e74cd36..a724947938c0 100644
--- a/contrib/jemalloc/src/stats.c
+++ b/contrib/jemalloc/src/stats.c
@@ -258,7 +258,7 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
 {
 	unsigned nthreads;
 	const char *dss;
-	ssize_t lg_dirty_mult;
+	ssize_t lg_dirty_mult, decay_time;
 	size_t page, pactive, pdirty, mapped;
 	size_t metadata_mapped, metadata_allocated;
 	uint64_t npurge, nmadvise, purged;
@@ -278,13 +278,23 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	malloc_cprintf(write_cb, cbopaque, "dss allocation precedence: %s\n",
 	    dss);
 	CTL_M2_GET("stats.arenas.0.lg_dirty_mult", i, &lg_dirty_mult, ssize_t);
-	if (lg_dirty_mult >= 0) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "min active:dirty page ratio: %u:1\n",
-		    (1U << lg_dirty_mult));
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "min active:dirty page ratio: N/A\n");
+	if (opt_purge == purge_mode_ratio) {
+		if (lg_dirty_mult >= 0) {
+			malloc_cprintf(write_cb, cbopaque,
+			    "min active:dirty page ratio: %u:1\n",
+			    (1U << lg_dirty_mult));
+		} else {
+			malloc_cprintf(write_cb, cbopaque,
+			    "min active:dirty page ratio: N/A\n");
+		}
+	}
+	CTL_M2_GET("stats.arenas.0.decay_time", i, &decay_time, ssize_t);
+	if (opt_purge == purge_mode_decay) {
+		if (decay_time >= 0) {
+			malloc_cprintf(write_cb, cbopaque, "decay time: %zd\n",
+			    decay_time);
+		} else
+			malloc_cprintf(write_cb, cbopaque, "decay time: N/A\n");
 	}
 	CTL_M2_GET("stats.arenas.0.pactive", i, &pactive, size_t);
 	CTL_M2_GET("stats.arenas.0.pdirty", i, &pdirty, size_t);
@@ -292,9 +302,8 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	CTL_M2_GET("stats.arenas.0.nmadvise", i, &nmadvise, uint64_t);
 	CTL_M2_GET("stats.arenas.0.purged", i, &purged, uint64_t);
 	malloc_cprintf(write_cb, cbopaque,
-	    "dirty pages: %zu:%zu active:dirty, %"FMTu64" sweep%s, %"FMTu64
-	    " madvise%s, %"FMTu64" purged\n", pactive, pdirty, npurge, npurge ==
-	    1 ? "" : "s", nmadvise, nmadvise == 1 ? "" : "s", purged);
+	    "purging: dirty: %zu, sweeps: %"FMTu64", madvises: %"FMTu64", "
+	    "purged: %"FMTu64"\n", pdirty, npurge, nmadvise, purged);
 
 	malloc_cprintf(write_cb, cbopaque,
 	    "                            allocated      nmalloc      ndalloc"
@@ -426,9 +435,10 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		bool bv;
 		unsigned uv;
 		ssize_t ssv;
-		size_t sv, bsz, ssz, sssz, cpsz;
+		size_t sv, bsz, usz, ssz, sssz, cpsz;
 
 		bsz = sizeof(bool);
+		usz = sizeof(unsigned);
 		ssz = sizeof(size_t);
 		sssz = sizeof(ssize_t);
 		cpsz = sizeof(const char *);
@@ -438,6 +448,8 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		CTL_GET("config.debug", &bv, bool);
 		malloc_cprintf(write_cb, cbopaque, "Assertions %s\n",
 		    bv ? "enabled" : "disabled");
+		malloc_cprintf(write_cb, cbopaque,
+		    "config.malloc_conf: \"%s\"\n", config_malloc_conf);
 
 #define	OPT_WRITE_BOOL(n)						\
 		if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0) {	\
@@ -453,6 +465,11 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 			    : "false", bv2 ? "true" : "false");		\
 		}							\
 }
+#define	OPT_WRITE_UNSIGNED(n)						\
+		if (je_mallctl("opt."#n, &uv, &usz, NULL, 0) == 0) {	\
+			malloc_cprintf(write_cb, cbopaque,		\
+			"  opt."#n": %zu\n", sv);			\
+		}
 #define	OPT_WRITE_SIZE_T(n)						\
 		if (je_mallctl("opt."#n, &sv, &ssz, NULL, 0) == 0) {	\
 			malloc_cprintf(write_cb, cbopaque,		\
@@ -483,8 +500,14 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		OPT_WRITE_BOOL(abort)
 		OPT_WRITE_SIZE_T(lg_chunk)
 		OPT_WRITE_CHAR_P(dss)
-		OPT_WRITE_SIZE_T(narenas)
-		OPT_WRITE_SSIZE_T_MUTABLE(lg_dirty_mult, arenas.lg_dirty_mult)
+		OPT_WRITE_UNSIGNED(narenas)
+		OPT_WRITE_CHAR_P(purge)
+		if (opt_purge == purge_mode_ratio) {
+			OPT_WRITE_SSIZE_T_MUTABLE(lg_dirty_mult,
+			    arenas.lg_dirty_mult)
+		}
+		if (opt_purge == purge_mode_decay)
+			OPT_WRITE_SSIZE_T_MUTABLE(decay_time, arenas.decay_time)
 		OPT_WRITE_BOOL(stats_print)
 		OPT_WRITE_CHAR_P(junk)
 		OPT_WRITE_SIZE_T(quarantine)
@@ -529,13 +552,22 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		malloc_cprintf(write_cb, cbopaque, "Page size: %zu\n", sv);
 
 		CTL_GET("arenas.lg_dirty_mult", &ssv, ssize_t);
-		if (ssv >= 0) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "Min active:dirty page ratio per arena: %u:1\n",
-			    (1U << ssv));
-		} else {
+		if (opt_purge == purge_mode_ratio) {
+			if (ssv >= 0) {
+				malloc_cprintf(write_cb, cbopaque,
+				    "Min active:dirty page ratio per arena: "
+				    "%u:1\n", (1U << ssv));
+			} else {
+				malloc_cprintf(write_cb, cbopaque,
+				    "Min active:dirty page ratio per arena: "
+				    "N/A\n");
+			}
+		}
+		CTL_GET("arenas.decay_time", &ssv, ssize_t);
+		if (opt_purge == purge_mode_decay) {
 			malloc_cprintf(write_cb, cbopaque,
-			    "Min active:dirty page ratio per arena: N/A\n");
+			    "Unused dirty page decay time: %zd%s\n",
+			    ssv, (ssv < 0) ? " (no decay)" : "");
 		}
 		if (je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0) == 0) {
 			malloc_cprintf(write_cb, cbopaque,
diff --git a/contrib/jemalloc/src/tcache.c b/contrib/jemalloc/src/tcache.c
index fdafd0c620a6..6e32f40471af 100644
--- a/contrib/jemalloc/src/tcache.c
+++ b/contrib/jemalloc/src/tcache.c
@@ -10,7 +10,7 @@ ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
 tcache_bin_info_t	*tcache_bin_info;
 static unsigned		stack_nelms; /* Total stack elms per tcache. */
 
-size_t			nhbins;
+unsigned		nhbins;
 size_t			tcache_maxclass;
 
 tcaches_t		*tcaches;
@@ -67,20 +67,19 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache)
 	tcache->next_gc_bin++;
 	if (tcache->next_gc_bin == nhbins)
 		tcache->next_gc_bin = 0;
-	tcache->ev_cnt = 0;
 }
 
 void *
 tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
-    tcache_bin_t *tbin, szind_t binind)
+    tcache_bin_t *tbin, szind_t binind, bool *tcache_success)
 {
 	void *ret;
 
-	arena_tcache_fill_small(arena, tbin, binind, config_prof ?
+	arena_tcache_fill_small(tsd, arena, tbin, binind, config_prof ?
 	    tcache->prof_accumbytes : 0);
 	if (config_prof)
 		tcache->prof_accumbytes = 0;
-	ret = tcache_alloc_easy(tbin);
+	ret = tcache_alloc_easy(tbin, tcache_success);
 
 	return (ret);
 }
@@ -102,7 +101,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena bin associated with the first object. */
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
-		    tbin->avail[0]);
+		    *(tbin->avail - 1));
 		arena_t *bin_arena = extent_node_arena_get(&chunk->node);
 		arena_bin_t *bin = &bin_arena->bins[binind];
 
@@ -122,7 +121,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 		}
 		ndeferred = 0;
 		for (i = 0; i < nflush; i++) {
-			ptr = tbin->avail[i];
+			ptr = *(tbin->avail - 1 - i);
 			assert(ptr != NULL);
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 			if (extent_node_arena_get(&chunk->node) == bin_arena) {
@@ -139,11 +138,12 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 				 * locked.  Stash the object, so that it can be
 				 * handled in a future pass.
 				 */
-				tbin->avail[ndeferred] = ptr;
+				*(tbin->avail - 1 - ndeferred) = ptr;
 				ndeferred++;
 			}
 		}
 		malloc_mutex_unlock(&bin->lock);
+		arena_decay_ticks(tsd, bin_arena, nflush - ndeferred);
 	}
 	if (config_stats && !merged_stats) {
 		/*
@@ -158,8 +158,8 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 		malloc_mutex_unlock(&bin->lock);
 	}
 
-	memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
-	    rem * sizeof(void *));
+	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+	    sizeof(void *));
 	tbin->ncached = rem;
 	if ((int)tbin->ncached < tbin->low_water)
 		tbin->low_water = tbin->ncached;
@@ -182,7 +182,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena associated with the first object. */
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
-		    tbin->avail[0]);
+		    *(tbin->avail - 1));
 		arena_t *locked_arena = extent_node_arena_get(&chunk->node);
 		UNUSED bool idump;
 
@@ -206,7 +206,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 		}
 		ndeferred = 0;
 		for (i = 0; i < nflush; i++) {
-			ptr = tbin->avail[i];
+			ptr = *(tbin->avail - 1 - i);
 			assert(ptr != NULL);
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 			if (extent_node_arena_get(&chunk->node) ==
@@ -220,13 +220,14 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 				 * Stash the object, so that it can be handled
 				 * in a future pass.
 				 */
-				tbin->avail[ndeferred] = ptr;
+				*(tbin->avail - 1 - ndeferred) = ptr;
 				ndeferred++;
 			}
 		}
 		malloc_mutex_unlock(&locked_arena->lock);
 		if (config_prof && idump)
 			prof_idump();
+		arena_decay_ticks(tsd, locked_arena, nflush - ndeferred);
 	}
 	if (config_stats && !merged_stats) {
 		/*
@@ -241,8 +242,8 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 		malloc_mutex_unlock(&arena->lock);
 	}
 
-	memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
-	    rem * sizeof(void *));
+	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+	    sizeof(void *));
 	tbin->ncached = rem;
 	if ((int)tbin->ncached < tbin->low_water)
 		tbin->low_water = tbin->ncached;
@@ -324,18 +325,26 @@ tcache_create(tsd_t *tsd, arena_t *arena)
 	/* Avoid false cacheline sharing. */
 	size = sa2u(size, CACHELINE);
 
-	tcache = ipallocztm(tsd, size, CACHELINE, true, false, true, a0get());
+	tcache = ipallocztm(tsd, size, CACHELINE, true, false, true,
+	    arena_get(0, false));
 	if (tcache == NULL)
 		return (NULL);
 
 	tcache_arena_associate(tcache, arena);
 
+	ticker_init(&tcache->gc_ticker, TCACHE_GC_INCR);
+
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
 	for (i = 0; i < nhbins; i++) {
 		tcache->tbins[i].lg_fill_div = 1;
+		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
+		/*
+		 * avail points past the available space.  Allocations will
+		 * access the slots toward higher addresses (for the benefit of
+		 * prefetch).
+		 */
 		tcache->tbins[i].avail = (void **)((uintptr_t)tcache +
 		    (uintptr_t)stack_offset);
-		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
 	}
 
 	return (tcache);
@@ -379,7 +388,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache)
 	    arena_prof_accum(arena, tcache->prof_accumbytes))
 		prof_idump();
 
-	idalloctm(tsd, tcache, false, true);
+	idalloctm(tsd, tcache, false, true, true);
 }
 
 void
@@ -445,7 +454,7 @@ tcaches_create(tsd_t *tsd, unsigned *r_ind)
 
 	if (tcaches_avail == NULL && tcaches_past > MALLOCX_TCACHE_MAX)
 		return (true);
-	tcache = tcache_create(tsd, a0get());
+	tcache = tcache_create(tsd, arena_get(0, false));
 	if (tcache == NULL)
 		return (true);
 
@@ -453,7 +462,7 @@ tcaches_create(tsd_t *tsd, unsigned *r_ind)
 		elm = tcaches_avail;
 		tcaches_avail = tcaches_avail->next;
 		elm->tcache = tcache;
-		*r_ind = elm - tcaches;
+		*r_ind = (unsigned)(elm - tcaches);
 	} else {
 		elm = &tcaches[tcaches_past];
 		elm->tcache = tcache;
diff --git a/contrib/jemalloc/src/ticker.c b/contrib/jemalloc/src/ticker.c
new file mode 100644
index 000000000000..db0902404ef2
--- /dev/null
+++ b/contrib/jemalloc/src/ticker.c
@@ -0,0 +1,2 @@
+#define	JEMALLOC_TICKER_C_
+#include "jemalloc/internal/jemalloc_internal.h"
diff --git a/contrib/jemalloc/src/tsd.c b/contrib/jemalloc/src/tsd.c
index 9ffe9afef7a0..34c1573cdd5a 100644
--- a/contrib/jemalloc/src/tsd.c
+++ b/contrib/jemalloc/src/tsd.c
@@ -113,7 +113,7 @@ malloc_tsd_boot0(void)
 	ncleanups = 0;
 	if (tsd_boot0())
 		return (true);
-	*tsd_arenas_cache_bypassp_get(tsd_fetch()) = true;
+	*tsd_arenas_tdata_bypassp_get(tsd_fetch()) = true;
 	return (false);
 }
 
@@ -122,7 +122,7 @@ malloc_tsd_boot1(void)
 {
 
 	tsd_boot1();
-	*tsd_arenas_cache_bypassp_get(tsd_fetch()) = false;
+	*tsd_arenas_tdata_bypassp_get(tsd_fetch()) = false;
 }
 
 #ifdef _WIN32
@@ -148,13 +148,15 @@ _tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved)
 #ifdef _MSC_VER
 #  ifdef _M_IX86
 #    pragma comment(linker, "/INCLUDE:__tls_used")
+#    pragma comment(linker, "/INCLUDE:_tls_callback")
 #  else
 #    pragma comment(linker, "/INCLUDE:_tls_used")
+#    pragma comment(linker, "/INCLUDE:tls_callback")
 #  endif
 #  pragma section(".CRT$XLY",long,read)
 #endif
 JEMALLOC_SECTION(".CRT$XLY") JEMALLOC_ATTR(used)
-static BOOL	(WINAPI *const tls_callback)(HINSTANCE hinstDLL,
+BOOL	(WINAPI *const tls_callback)(HINSTANCE hinstDLL,
     DWORD fdwReason, LPVOID lpvReserved) = _tls_callback;
 #endif
 
diff --git a/contrib/jemalloc/src/util.c b/contrib/jemalloc/src/util.c
index 25b61c207e68..116e98195f36 100644
--- a/contrib/jemalloc/src/util.c
+++ b/contrib/jemalloc/src/util.c
@@ -1,3 +1,7 @@
+/*
+ * Define simple versions of assertion macros that won't recurse in case
+ * of assertion failures in malloc_*printf().
+ */
 #define	assert(e) do {							\
 	if (config_debug && !(e)) {					\
 		malloc_write("<jemalloc>: Failed assertion\n");		\
@@ -49,10 +53,14 @@ wrtmessage(void *cbopaque, const char *s)
 	 * Use syscall(2) rather than write(2) when possible in order to avoid
 	 * the possibility of memory allocation within libc.  This is necessary
 	 * on FreeBSD; most operating systems do not have this problem though.
+	 *
+	 * syscall() returns long or int, depending on platform, so capture the
+	 * unused result in the widest plausible type to avoid compiler
+	 * warnings.
 	 */
-	UNUSED int result = syscall(SYS_write, STDERR_FILENO, s, strlen(s));
+	UNUSED long result = syscall(SYS_write, STDERR_FILENO, s, strlen(s));
 #else
-	UNUSED int result = write(STDERR_FILENO, s, strlen(s));
+	UNUSED ssize_t result = write(STDERR_FILENO, s, strlen(s));
 #endif
 }
 
@@ -98,7 +106,7 @@ buferror(int err, char *buf, size_t buflen)
 
 #ifdef _WIN32
 	FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0,
-	    (LPSTR)buf, buflen, NULL);
+	    (LPSTR)buf, (DWORD)buflen, NULL);
 	return (0);
 #elif defined(__GLIBC__) && defined(_GNU_SOURCE)
 	char *b = strerror_r(err, buf, buflen);
@@ -593,7 +601,8 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
 		str[i] = '\0';
 	else
 		str[size - 1] = '\0';
-	ret = i;
+	assert(i < INT_MAX);
+	ret = (int)i;
 
 #undef APPEND_C
 #undef APPEND_S
@@ -664,3 +673,12 @@ malloc_printf(const char *format, ...)
 	malloc_vcprintf(NULL, NULL, format, ap);
 	va_end(ap);
 }
+
+/*
+ * Restore normal assertion macros, in order to make it possible to compile all
+ * C files as a single concatenation.
+ */
+#undef assert
+#undef not_reached
+#undef not_implemented
+#include "jemalloc/internal/assert.h"