summaryrefslogtreecommitdiff
path: root/sys/vm/swap_pager.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/vm/swap_pager.c')
-rw-r--r--sys/vm/swap_pager.c2553
1 files changed, 1307 insertions, 1246 deletions
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 16911684c998..b06352014b32 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 1998 Matthew Dillon,
* Copyright (c) 1994 John S. Dyson
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991, 1993
@@ -36,17 +37,34 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
+ * New Swap System
+ * Matthew Dillon
+ *
+ * Radix Bitmap 'blists'.
+ *
+ * - The new swapper uses the new radix bitmap code. This should scale
+ * to arbitrarily small or arbitrarily large swap spaces and an almost
+ * arbitrary degree of fragmentation.
+ *
+ * Features:
+ *
+ * - on the fly reallocation of swap during putpages. The new system
+ * does not try to keep previously allocated swap blocks for dirty
+ * pages.
+ *
+ * - on the fly deallocation of swap
+ *
+ * - No more garbage collection required. Unnecessarily allocated swap
+ * blocks only exist for dirty vm_page_t's now and these are already
+ * cycled (in a high-load system) by the pager. We also do on-the-fly
+ * removal of invalidated swap blocks when a page is destroyed
+ * or renamed.
+ *
* from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
*
* @(#)swap_pager.c 8.9 (Berkeley) 3/21/94
- * $Id: swap_pager.c,v 1.106 1999/01/08 17:31:23 eivind Exp $
- */
-
-/*
- * Quick hack to page to dedicated partition(s).
- * TODO:
- * Add multiprocessor locks
- * Deal with async writes in a better fashion
+ *
+ * $Id: swap_pager.c,v 1.107 1999/01/10 01:58:28 eivind Exp $
*/
#include <sys/param.h>
@@ -57,18 +75,16 @@
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
-#include <sys/rlist.h>
+#include <sys/blist.h>
+#include <sys/lock.h>
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
#endif
-#ifndef NPENDINGIO
-#define NPENDINGIO 16
-#endif
-
-#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
+#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
+#include "opt_swap.h"
#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
@@ -77,848 +93,651 @@
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
+#include <vm/vm_zone.h>
-static int nswiodone;
-int swap_pager_full;
-extern int vm_swap_size;
-static int no_swap_space = 1;
-static int max_pageout_cluster;
-struct rlisthdr swaplist;
-
-TAILQ_HEAD(swpclean, swpagerclean);
-
-typedef struct swpagerclean *swp_clean_t;
+#define SWM_FREE 0x02 /* free, period */
+#define SWM_POP 0x04 /* pop out */
-static struct swpagerclean {
- TAILQ_ENTRY(swpagerclean) spc_list;
- int spc_flags;
- struct buf *spc_bp;
- vm_object_t spc_object;
- vm_offset_t spc_kva;
- int spc_first;
- int spc_count;
- vm_page_t spc_m[MAX_PAGEOUT_CLUSTER];
-} swcleanlist[NPENDINGIO];
-
-
-/* spc_flags values */
-#define SPC_ERROR 0x01
+/*
+ * vm_swap_size is in page-sized chunks now. It was DEV_BSIZE'd chunks
+ * in the old system.
+ */
-#define SWB_EMPTY (-1)
+extern int vm_swap_size; /* number of free swap blocks, in pages */
-/* list of completed page cleans */
-static struct swpclean swap_pager_done;
+int swap_pager_full; /* swap space exhaustion (w/ hysteresis)*/
+static int nsw_rcount; /* free read buffers */
+static int nsw_wcount; /* free write buffers */
+static int nsw_hysteresis; /* hysteresis */
+static int max_pageout_cluster; /* maximum VOP I/O allowed */
+static int sw_alloc_interlock; /* swap pager allocation interlock */
-/* list of pending page cleans */
-static struct swpclean swap_pager_inuse;
+struct blist *swapblist;
+static struct swblock **swhash;
+static int swhash_mask;
-/* list of free pager clean structs */
-static struct swpclean swap_pager_free;
-static int swap_pager_free_count;
-static int swap_pager_free_pending;
-/* list of "named" anon region objects */
-static struct pagerlst swap_pager_object_list;
+/*
+ * "named" and "unnamed" anon region objects. Try to reduce the overhead
+ * of searching a named list by hashing it just a little.
+ */
-/* list of "unnamed" anon region objects */
-struct pagerlst swap_pager_un_object_list;
+#define NOBJLISTS 8
-#define SWAP_FREE_NEEDED 0x1 /* need a swap block */
-#define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2
-static int swap_pager_needflags;
+#define NOBJLIST(handle) \
+ (&swap_pager_object_list[((int)(long)handle >> 4) & (NOBJLISTS-1)])
-static struct pagerlst *swp_qs[] = {
- &swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0
-};
+static struct pagerlst swap_pager_object_list[NOBJLISTS];
+struct pagerlst swap_pager_un_object_list;
+vm_zone_t swap_zone;
/*
- * pagerops for OBJT_SWAP - "swap pager".
+ * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
+ * calls hooked from other parts of the VM system and do not appear here.
+ * (see vm/swap_pager.h).
*/
+
static vm_object_t
swap_pager_alloc __P((void *handle, vm_ooffset_t size,
vm_prot_t prot, vm_ooffset_t offset));
static void swap_pager_dealloc __P((vm_object_t object));
-static boolean_t
- swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex,
- int *before, int *after));
static int swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static void swap_pager_init __P((void));
-static void spc_free __P((swp_clean_t));
+static void swap_pager_unswapped __P((vm_page_t));
struct pagerops swappagerops = {
- swap_pager_init,
- swap_pager_alloc,
- swap_pager_dealloc,
- swap_pager_getpages,
- swap_pager_putpages,
- swap_pager_haspage,
- swap_pager_sync
+ swap_pager_init, /* early system initialization of pager */
+ swap_pager_alloc, /* allocate an OBJT_SWAP object */
+ swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
+ swap_pager_getpages, /* pagein */
+ swap_pager_putpages, /* pageout */
+ swap_pager_haspage, /* get backing store status for page */
+ swap_pager_unswapped /* remove swap related to page */
};
-static int npendingio;
-static int dmmin;
+/*
+ * dmmax is in page-sized chunks with the new swap system. It was
+ * dev-bsized chunks in the old.
+ *
+ * swap_*() routines are externally accessible. swp_*() routines are
+ * internal.
+ */
+
int dmmax;
+static int dmmax_mask;
+int nswap_lowat = 128; /* in pages, swap_pager_full warning */
+int nswap_hiwat = 256; /* in pages, swap_pager_full warning */
+
+static __inline void swp_sizecheck __P((void));
+static void swp_pager_sync_iodone __P((struct buf *bp));
+static void swp_pager_async_iodone __P((struct buf *bp));
+
+/*
+ * Swap bitmap functions
+ */
+
+static __inline void swp_pager_freeswapspace __P((daddr_t blk, int npages));
+static __inline daddr_t swp_pager_getswapspace __P((int npages));
+
+/*
+ * Metadata functions
+ */
+
+static void swp_pager_meta_build __P((vm_object_t, daddr_t, daddr_t, int));
+static void swp_pager_meta_free __P((vm_object_t, daddr_t, daddr_t));
+static void swp_pager_meta_free_all __P((vm_object_t));
+static daddr_t swp_pager_meta_ctl __P((vm_object_t, vm_pindex_t, int));
-static int swap_pager_block_index __P((vm_pindex_t pindex));
-static int swap_pager_block_offset __P((vm_pindex_t pindex));
-static daddr_t *swap_pager_diskaddr __P((vm_object_t object,
- vm_pindex_t pindex, int *valid));
-static void swap_pager_finish __P((swp_clean_t spc));
-static void swap_pager_free_swap __P((vm_object_t object));
-static void swap_pager_freeswapspace __P((vm_object_t object,
- unsigned int from,
- unsigned int to));
-static int swap_pager_getswapspace __P((vm_object_t object,
- unsigned int amount,
- daddr_t *rtval));
-static void swap_pager_iodone __P((struct buf *));
-static void swap_pager_iodone1 __P((struct buf *bp));
-static void swap_pager_reclaim __P((void));
-static void swap_pager_ridpages __P((vm_page_t *m, int count,
- int reqpage));
-static void swap_pager_setvalid __P((vm_object_t object,
- vm_offset_t offset, int valid));
-static __inline void swapsizecheck __P((void));
-
-#define SWAPLOW (vm_swap_size < (512 * btodb(PAGE_SIZE)))
+/*
+ * SWP_SIZECHECK() - update swap_pager_full indication
+ *
+ * update the swap_pager_full indication and warn when we are
+ * about to run out of swap space.
+ *
+ * No restrictions on call
+ * This routine may not block.
+ * This routine must be called at splvm()
+ */
static __inline void
-swapsizecheck()
+swp_sizecheck()
{
- if (vm_swap_size < 128 * btodb(PAGE_SIZE)) {
+ if (vm_swap_size < nswap_lowat) {
if (swap_pager_full == 0)
printf("swap_pager: out of swap space\n");
swap_pager_full = 1;
- } else if (vm_swap_size > 192 * btodb(PAGE_SIZE))
+ } else if (vm_swap_size > nswap_hiwat) {
swap_pager_full = 0;
+ }
}
+/*
+ * SWAP_PAGER_INIT() - initialize the swap pager!
+ *
+ * Expected to be started from system init. NOTE: This code is run
+ * before much else so be careful what you depend on. Most of the VM
+ * system has yet to be initialized at this point.
+ */
+
static void
swap_pager_init()
{
- int maxsafepending;
- TAILQ_INIT(&swap_pager_object_list);
- TAILQ_INIT(&swap_pager_un_object_list);
-
/*
- * Initialize clean lists
+ * Initialize object lists
*/
- TAILQ_INIT(&swap_pager_inuse);
- TAILQ_INIT(&swap_pager_done);
- TAILQ_INIT(&swap_pager_free);
- swap_pager_free_count = 0;
+ int i;
+
+ for (i = 0; i < NOBJLISTS; ++i)
+ TAILQ_INIT(&swap_pager_object_list[i]);
+ TAILQ_INIT(&swap_pager_un_object_list);
/*
- * Calculate the swap allocation constants.
+ * Device Stripe, in PAGE_SIZE'd blocks
*/
- dmmin = PAGE_SIZE / DEV_BSIZE;
- dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2;
-
- maxsafepending = cnt.v_free_min - cnt.v_free_reserved;
- npendingio = NPENDINGIO;
- max_pageout_cluster = MAX_PAGEOUT_CLUSTER;
-
- if ((2 * NPENDINGIO * MAX_PAGEOUT_CLUSTER) > maxsafepending) {
- max_pageout_cluster = MAX_PAGEOUT_CLUSTER / 2;
- npendingio = maxsafepending / (2 * max_pageout_cluster);
- if (npendingio < 2)
- npendingio = 2;
- }
+
+ dmmax = SWB_NPAGES * 2;
+ dmmax_mask = ~(dmmax - 1);
}
+/*
+ * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
+ *
+ * Expected to be started from pageout process once, prior to entering
+ * its main loop.
+ */
+
void
swap_pager_swap_init()
{
- swp_clean_t spc;
- struct buf *bp;
- int i;
+ int n;
/*
- * kva's are allocated here so that we dont need to keep doing
- * kmem_alloc pageables at runtime
+ * Number of in-transit swap bp operations. Don't
+ * exhaust the pbufs completely. Make sure we
+ * initialize workable values (0 will work for hysteresis
+ * but it isn't very efficient).
+ *
+ * The max_pageout_cluster is constrained by the bp->b_pages[]
+ * array (MAXPHYS/PAGE_SIZE) and our locally defined
+ * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
+ * constrained by the swap device interleave stripe size.
*/
- for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) {
- spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * max_pageout_cluster);
- if (!spc->spc_kva) {
- break;
- }
- spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL);
- if (!spc->spc_bp) {
- kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE);
- break;
- }
- spc->spc_flags = 0;
- TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
- swap_pager_free_count++;
- }
-}
-int
-swap_pager_swp_alloc(object, wait)
- vm_object_t object;
- int wait;
-{
- sw_blk_t swb;
- int nblocks;
- int i, j;
-
- nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES;
- swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait);
- if (swb == NULL)
- return 1;
-
- for (i = 0; i < nblocks; i++) {
- swb[i].swb_valid = 0;
- swb[i].swb_locked = 0;
- for (j = 0; j < SWB_NPAGES; j++)
- swb[i].swb_block[j] = SWB_EMPTY;
- }
+ nsw_rcount = (nswbuf + 1) / 2;
+ nsw_wcount = (nswbuf + 3) / 4;
+ nsw_hysteresis = nsw_wcount / 2;
+ max_pageout_cluster = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
- object->un_pager.swp.swp_nblocks = nblocks;
- object->un_pager.swp.swp_allocsize = 0;
- object->un_pager.swp.swp_blocks = swb;
- object->un_pager.swp.swp_poip = 0;
+ /*
+ * Initialize our zone. Right now I'm just guessing on the number
+ * we need based on the number of pages in the system. Each swblock
+ * can hold 16 pages, so this is probably overkill.
+ */
- if (object->handle != NULL) {
- TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list);
- } else {
- TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list);
- }
+ n = cnt.v_page_count * 2;
- return 0;
+ swap_zone = zinit(
+ "SWAPMETA",
+ sizeof(struct swblock),
+ n,
+ ZONE_INTERRUPT,
+ 1
+ );
+
+ /*
+ * Initialize our meta-data hash table. The swapper does not need to
+ * be quite as efficient as the VM system, so we do not use an
+ * oversized hash table.
+ *
+ * n: size of hash table, must be power of 2
+ * swhash_mask: hash table index mask
+ */
+
+ for (n = 1; n < cnt.v_page_count / 4; n <<= 1)
+ ;
+
+ swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK);
+ bzero(swhash, sizeof(struct swblock *) * n);
+
+ swhash_mask = n - 1;
}
/*
- * Allocate an object and associated resources.
- * Note that if we are called from the pageout daemon (handle == NULL)
- * we should not wait for memory as it could resulting in deadlock.
+ * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
+ * its metadata structures.
+ *
+ * This routine is called from the mmap and fork code to create a new
+ * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
+ * and then converting it with swp_pager_meta_build().
+ *
+ * This routine may block in vm_object_allocate() and create a named
+ * object lookup race, so we must interlock. We must also run at
+ * splvm() for the object lookup to handle races with interrupts, but
+ * we do not have to maintain splvm() in between the lookup and the
+ * add because (I believe) it is not possible to attempt to create
+ * a new swap object w/handle when a default object with that handle
+ * already exists.
*/
+
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t offset)
{
vm_object_t object;
- /*
- * If this is a "named" anonymous region, look it up and use the
- * object if it exists, otherwise allocate a new one.
- */
if (handle) {
- object = vm_pager_object_lookup(&swap_pager_object_list, handle);
+ /*
+ * Reference existing named region or allocate new one. There
+ * should not be a race here against swp_pager_meta_build()
+ * as called from vm_page_remove() in regards to the lookup
+ * of the handle.
+ */
+
+ while (sw_alloc_interlock) {
+ sw_alloc_interlock = -1;
+ tsleep(&sw_alloc_interlock, PVM, "swpalc", 0);
+ }
+ sw_alloc_interlock = 1;
+
+ object = vm_pager_object_lookup(NOBJLIST(handle), handle);
+
if (object != NULL) {
vm_object_reference(object);
} else {
- /*
- * XXX - there is a race condition here. Two processes
- * can request the same named object simultaneuously,
- * and if one blocks for memory, the result is a disaster.
- * Probably quite rare, but is yet another reason to just
- * rip support of "named anonymous regions" out altogether.
- */
- object = vm_object_allocate(OBJT_SWAP,
+ object = vm_object_allocate(OBJT_DEFAULT,
OFF_TO_IDX(offset + PAGE_MASK + size));
object->handle = handle;
- (void) swap_pager_swp_alloc(object, M_WAITOK);
+
+ swp_pager_meta_build(
+ object,
+ 0,
+ SWAPBLK_NONE,
+ 0
+ );
}
+
+ if (sw_alloc_interlock < 0)
+ wakeup(&sw_alloc_interlock);
+
+ sw_alloc_interlock = 0;
} else {
- object = vm_object_allocate(OBJT_SWAP,
+ object = vm_object_allocate(OBJT_DEFAULT,
OFF_TO_IDX(offset + PAGE_MASK + size));
- (void) swap_pager_swp_alloc(object, M_WAITOK);
+
+ swp_pager_meta_build(
+ object,
+ 0,
+ SWAPBLK_NONE,
+ 0
+ );
}
return (object);
}
/*
- * returns disk block associated with pager and offset
- * additionally, as a side effect returns a flag indicating
- * if the block has been written
+ * SWAP_PAGER_DEALLOC() - remove swap metadata from object
+ *
+ * The swap backing for the object is destroyed. The code is
+ * designed such that we can reinstantiate it later, but this
+ * routine is typically called only when the entire object is
+ * about to be destroyed.
+ *
+ * This routine may block, but no longer does.
+ *
+ * The object must be locked or unreferenceable.
*/
-static __inline daddr_t *
-swap_pager_diskaddr(object, pindex, valid)
+static void
+swap_pager_dealloc(object)
vm_object_t object;
- vm_pindex_t pindex;
- int *valid;
{
- register sw_blk_t swb;
- int ix;
-
- if (valid)
- *valid = 0;
- ix = pindex / SWB_NPAGES;
- if ((ix >= object->un_pager.swp.swp_nblocks) ||
- (pindex >= object->size)) {
- return (FALSE);
+ /*
+ * Remove from list right away so lookups will fail if we block for
+ * pageout completion.
+ */
+
+ if (object->handle == NULL) {
+ TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
+ } else {
+ TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
}
- swb = &object->un_pager.swp.swp_blocks[ix];
- ix = pindex % SWB_NPAGES;
- if (valid)
- *valid = swb->swb_valid & (1 << ix);
- return &swb->swb_block[ix];
-}
-/*
- * Utility routine to set the valid (written) bit for
- * a block associated with a pager and offset
- */
-static void
-swap_pager_setvalid(object, offset, valid)
- vm_object_t object;
- vm_offset_t offset;
- int valid;
-{
- register sw_blk_t swb;
- int ix;
+ vm_object_pip_wait(object, "swpdea");
- ix = offset / SWB_NPAGES;
- if (ix >= object->un_pager.swp.swp_nblocks)
- return;
+ /*
+ * Free all remaining metadata. We only bother to free it from
+ * the swap meta data. We do not attempt to free swapblk's still
+ * associated with vm_page_t's for this object. We do not care
+ * if paging is still in progress on some objects.
+ */
- swb = &object->un_pager.swp.swp_blocks[ix];
- ix = offset % SWB_NPAGES;
- if (valid)
- swb->swb_valid |= (1 << ix);
- else
- swb->swb_valid &= ~(1 << ix);
- return;
+ swp_pager_meta_free_all(object);
}
+/************************************************************************
+ * SWAP PAGER BITMAP ROUTINES *
+ ************************************************************************/
+
/*
- * this routine allocates swap space with a fragmentation
- * minimization policy.
+ * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
+ *
+ * Allocate swap for the requested number of pages. The starting
+ * swap block number (a page index) is returned or SWAPBLK_NONE
+ * if the allocation failed.
+ *
+ * Also has the side effect of advising that somebody made a mistake
+ * when they configured swap and didn't configure enough.
+ *
+ * Must be called at splvm() to avoid races with bitmap frees from
+ * vm_page_remove() aka swap_pager_page_removed().
+ *
+ * This routine may not block
+ * This routine must be called at splvm().
*/
-static int
-swap_pager_getswapspace(object, amount, rtval)
- vm_object_t object;
- unsigned int amount;
- daddr_t *rtval;
+
+static __inline daddr_t
+swp_pager_getswapspace(npages)
+ int npages;
{
- unsigned location;
+ daddr_t blk;
- vm_swap_size -= amount;
-
- if (!rlist_alloc(&swaplist, amount, &location)) {
- vm_swap_size += amount;
- return 0;
+ if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
+ printf("swap_pager_getswapspace: failed\n");
} else {
- swapsizecheck();
- object->un_pager.swp.swp_allocsize += amount;
- *rtval = location;
- return 1;
+ vm_swap_size -= npages;
+ swp_sizecheck();
}
+ return(blk);
}
/*
- * this routine frees swap space with a fragmentation
- * minimization policy.
+ * SWP_PAGER_FREESWAPSPACE() - free raw swap space
+ *
+ * This routine returns the specified swap blocks back to the bitmap.
+ *
+ * Note: This routine may not block (it could in the old swap code),
+ * and through the use of the new blist routines it does not block.
+ *
+ * We must be called at splvm() to avoid races with bitmap frees from
+ * vm_page_remove() aka swap_pager_page_removed().
+ *
+ * This routine may not block
+ * This routine must be called at splvm().
*/
-static void
-swap_pager_freeswapspace(object, from, to)
- vm_object_t object;
- unsigned int from;
- unsigned int to;
+
+static __inline void
+swp_pager_freeswapspace(blk, npages)
+ daddr_t blk;
+ int npages;
{
- rlist_free(&swaplist, from, to);
- vm_swap_size += (to - from) + 1;
- object->un_pager.swp.swp_allocsize -= (to - from) + 1;
- swapsizecheck();
+ blist_free(swapblist, blk, npages);
+ vm_swap_size += npages;
+ swp_sizecheck();
}
+
/*
- * this routine frees swap blocks from a specified pager
+ * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
+ * range within an object.
+ *
+ * This is a globally accessible routine.
+ *
+ * This routine removes swapblk assignments from swap metadata.
+ *
+ * The external callers of this routine typically have already destroyed
+ * or renamed vm_page_t's associated with this range in the object so
+ * we should be ok.
*/
+
void
swap_pager_freespace(object, start, size)
vm_object_t object;
vm_pindex_t start;
vm_size_t size;
{
- vm_pindex_t i;
- int s;
-
- s = splvm();
- for (i = start; i < start + size; i += 1) {
- int valid;
- daddr_t *addr = swap_pager_diskaddr(object, i, &valid);
-
- if (addr && *addr != SWB_EMPTY) {
- swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1);
- if (valid) {
- swap_pager_setvalid(object, i, 0);
- }
- *addr = SWB_EMPTY;
- }
- }
- splx(s);
+ swp_pager_meta_free(object, start, size);
}
/*
- * same as freespace, but don't free, just force a DMZ next time
- */
-void
-swap_pager_dmzspace(object, start, size)
- vm_object_t object;
- vm_pindex_t start;
- vm_size_t size;
-{
- vm_pindex_t i;
- int s;
-
- s = splvm();
- for (i = start; i < start + size; i += 1) {
- int valid;
- daddr_t *addr = swap_pager_diskaddr(object, i, &valid);
-
- if (addr && *addr != SWB_EMPTY) {
- if (valid) {
- swap_pager_setvalid(object, i, 0);
- }
- }
- }
- splx(s);
-}
-
-static void
-swap_pager_free_swap(object)
- vm_object_t object;
-{
- register int i, j;
- register sw_blk_t swb;
- int first_block=0, block_count=0;
- int s;
- /*
- * Free left over swap blocks
- */
- swb = object->un_pager.swp.swp_blocks;
- if (swb == NULL) {
- return;
- }
-
- s = splvm();
- for (i = 0; i < object->un_pager.swp.swp_nblocks; i++, swb++) {
- for (j = 0; j < SWB_NPAGES; j++) {
- if (swb->swb_block[j] != SWB_EMPTY) {
- /*
- * initially the length of the run is zero
- */
- if (block_count == 0) {
- first_block = swb->swb_block[j];
- block_count = btodb(PAGE_SIZE);
- swb->swb_block[j] = SWB_EMPTY;
- /*
- * if the new block can be included into the current run
- */
- } else if (swb->swb_block[j] == first_block + block_count) {
- block_count += btodb(PAGE_SIZE);
- swb->swb_block[j] = SWB_EMPTY;
- /*
- * terminate the previous run, and start a new one
- */
- } else {
- swap_pager_freeswapspace(object, first_block,
- (unsigned) first_block + block_count - 1);
- first_block = swb->swb_block[j];
- block_count = btodb(PAGE_SIZE);
- swb->swb_block[j] = SWB_EMPTY;
- }
- }
- }
- }
-
- if (block_count) {
- swap_pager_freeswapspace(object, first_block,
- (unsigned) first_block + block_count - 1);
- }
- splx(s);
-}
-
-
-/*
- * swap_pager_reclaim frees up over-allocated space from all pagers
- * this eliminates internal fragmentation due to allocation of space
- * for segments that are never swapped to. It has been written so that
- * it does not block until the rlist_free operation occurs; it keeps
- * the queues consistant.
- */
-
-/*
- * Maximum number of blocks (pages) to reclaim per pass
- */
-#define MAXRECLAIM 128
-
-static void
-swap_pager_reclaim()
-{
- vm_object_t object;
- int i, j, k;
- int s;
- int reclaimcount;
- static struct {
- int address;
- vm_object_t object;
- } reclaims[MAXRECLAIM];
- static int in_reclaim;
-
- /*
- * allow only one process to be in the swap_pager_reclaim subroutine
- */
- s = splvm();
- if (in_reclaim) {
- tsleep(&in_reclaim, PSWP, "swrclm", 0);
- splx(s);
- return;
- }
- in_reclaim = 1;
- reclaimcount = 0;
-
- /* for each pager queue */
- for (k = 0; swp_qs[k]; k++) {
-
- object = TAILQ_FIRST(swp_qs[k]);
- while (object && (reclaimcount < MAXRECLAIM)) {
-
- /*
- * see if any blocks associated with a pager has been
- * allocated but not used (written)
- */
- if ((object->flags & OBJ_DEAD) == 0 &&
- (object->paging_in_progress == 0)) {
- for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) {
- sw_blk_t swb = &object->un_pager.swp.swp_blocks[i];
-
- if (swb->swb_locked)
- continue;
- for (j = 0; j < SWB_NPAGES; j++) {
- if (swb->swb_block[j] != SWB_EMPTY &&
- (swb->swb_valid & (1 << j)) == 0) {
- reclaims[reclaimcount].address = swb->swb_block[j];
- reclaims[reclaimcount++].object = object;
- swb->swb_block[j] = SWB_EMPTY;
- if (reclaimcount >= MAXRECLAIM)
- goto rfinished;
- }
- }
- }
- }
- object = TAILQ_NEXT(object, pager_object_list);
- }
- }
-
-rfinished:
-
- /*
- * free the blocks that have been added to the reclaim list
- */
- for (i = 0; i < reclaimcount; i++) {
- swap_pager_freeswapspace(reclaims[i].object,
- reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1);
- }
- splx(s);
- in_reclaim = 0;
- wakeup(&in_reclaim);
-}
-
-
-/*
- * swap_pager_copy copies blocks from one pager to another and
- * destroys the source pager
+ * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
+ * and destroy the source.
+ *
+ * Copy any valid swapblks from the source to the destination. In
+ * cases where both the source and destination have a valid swapblk,
+ * we keep the destination's.
+ *
+ * This routine is allowed to block. It may block allocating metadata
+ * indirectly through swp_pager_meta_build() or if paging is still in
+ * progress on the source.
+ *
+ * XXX vm_page_collapse() kinda expects us not to block because we
+ * supposedly do not need to allocate memory, but for the moment we
+ * *may* have to get a little memory from the zone allocator, but
+ * it is taken from the interrupt memory. We should be ok.
+ *
+ * The source object contains no vm_page_t's (which is just as well)
+ *
+ * The source object is of type OBJT_SWAP.
+ *
+ * The source and destination objects must be
+ * locked or inaccessible (XXX are they ???)
*/
void
-swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset,
- offset, destroysource)
+swap_pager_copy(srcobject, dstobject, offset, destroysource)
vm_object_t srcobject;
- vm_pindex_t srcoffset;
vm_object_t dstobject;
- vm_pindex_t dstoffset;
vm_pindex_t offset;
int destroysource;
{
vm_pindex_t i;
- int origsize;
- int s;
-
- if (vm_swap_size)
- no_swap_space = 0;
-
- origsize = srcobject->un_pager.swp.swp_allocsize;
/*
- * remove the source object from the swap_pager internal queue
+ * If destroysource is set, we remove the source object from the
+ * swap_pager internal queue now.
*/
+
if (destroysource) {
if (srcobject->handle == NULL) {
- TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list);
+ TAILQ_REMOVE(
+ &swap_pager_un_object_list,
+ srcobject,
+ pager_object_list
+ );
} else {
- TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list);
+ TAILQ_REMOVE(
+ NOBJLIST(srcobject->handle),
+ srcobject,
+ pager_object_list
+ );
}
}
- s = splvm();
- while (srcobject->un_pager.swp.swp_poip) {
- tsleep(srcobject, PVM, "spgout", 0);
- }
-
/*
- * clean all of the pages that are currently active and finished
+ * transfer source to destination.
*/
- if (swap_pager_free_pending)
- swap_pager_sync();
- /*
- * transfer source to destination
- */
- for (i = 0; i < dstobject->size; i += 1) {
- int srcvalid, dstvalid;
- daddr_t *srcaddrp = swap_pager_diskaddr(srcobject,
- i + offset + srcoffset, &srcvalid);
- daddr_t *dstaddrp;
+ for (i = 0; i < dstobject->size; ++i) {
+ daddr_t dstaddr;
/*
- * see if the source has space allocated
+ * Locate (without changing) the swapblk on the destination,
+ * unless it is invalid in which case free it silently, or
+ * if the destination is a resident page, in which case the
+ * source is thrown away.
*/
- if (srcaddrp && *srcaddrp != SWB_EMPTY) {
+
+ dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
+
+ if (dstaddr == SWAPBLK_NONE) {
/*
- * if the source is valid and the dest has no space,
- * then copy the allocation from the srouce to the
- * dest.
+ * Destination has no swapblk and is not resident,
+ * copy source.
*/
- if (srcvalid) {
- dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset,
- &dstvalid);
- /*
- * if the dest already has a valid block,
- * deallocate the source block without
- * copying.
- */
- if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
- swap_pager_freeswapspace(dstobject, *dstaddrp,
- *dstaddrp + btodb(PAGE_SIZE) - 1);
- *dstaddrp = SWB_EMPTY;
- }
- if (dstaddrp && *dstaddrp == SWB_EMPTY) {
- *dstaddrp = *srcaddrp;
- *srcaddrp = SWB_EMPTY;
- dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE);
- srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE);
- swap_pager_setvalid(dstobject, i + dstoffset, 1);
- }
- }
+ daddr_t srcaddr;
+
+ srcaddr = swp_pager_meta_ctl(
+ srcobject,
+ i + offset,
+ SWM_POP
+ );
+
+ if (srcaddr != SWAPBLK_NONE)
+ swp_pager_meta_build(dstobject, i, srcaddr, 1);
+ } else {
/*
- * if the source is not empty at this point, then
- * deallocate the space.
+ * Destination has valid swapblk or it is represented
+ * by a resident page. We destroy the sourceblock.
*/
- if (*srcaddrp != SWB_EMPTY) {
- swap_pager_freeswapspace(srcobject, *srcaddrp,
- *srcaddrp + btodb(PAGE_SIZE) - 1);
- *srcaddrp = SWB_EMPTY;
- }
+
+ swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
}
}
- splx(s);
/*
- * Free left over swap blocks
+ * Free left over swap blocks in source.
+ *
+ * We have to revert the type to OBJT_DEFAULT so we do not accidently
+ * double-remove the object from the swap queues.
*/
- if (destroysource) {
- swap_pager_free_swap(srcobject);
- if (srcobject->un_pager.swp.swp_allocsize) {
- printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n",
- srcobject->un_pager.swp.swp_allocsize, origsize);
- }
-
- free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA);
- srcobject->un_pager.swp.swp_blocks = NULL;
+ if (destroysource) {
+ swp_pager_meta_free_all(srcobject);
+ /*
+ * Reverting the type is not necessary, the caller is going
+ * to destroy srcobject directly, but I'm doing it here
+ * for consistancy since we've removed the object from its
+ * queues.
+ */
+ srcobject->type = OBJT_DEFAULT;
}
return;
}
-static void
-swap_pager_dealloc(object)
+/*
+ * SWAP_PAGER_HASPAGE() - determine if we have good backing store for
+ * the requested page.
+ *
+ * We determine whether good backing store exists for the requested
+ * page and return TRUE if it does, FALSE if it doesn't.
+ *
+ * If TRUE, we also try to determine how much valid, contiguous backing
+ * store exists before and after the requested page within a reasonable
+ * distance. We do not try to restrict it to the swap device stripe
+ * (that is handled in getpages/putpages). It probably isn't worth
+ * doing here.
+ */
+
+boolean_t
+swap_pager_haspage(object, pindex, before, after)
vm_object_t object;
+ vm_pindex_t pindex;
+ int *before;
+ int *after;
{
- int s;
- sw_blk_t swb;
+ daddr_t blk0;
/*
- * Remove from list right away so lookups will fail if we block for
- * pageout completion.
+ * do we have good backing store at the requested index ?
*/
- if (object->handle == NULL) {
- TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
- } else {
- TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list);
- }
- /*
- * Wait for all pageouts to finish and remove all entries from
- * cleaning list.
- */
+ blk0 = swp_pager_meta_ctl(object, pindex, 0);
- s = splvm();
- while (object->un_pager.swp.swp_poip) {
- tsleep(object, PVM, "swpout", 0);
+ if (blk0 & SWAPBLK_NONE) {
+ if (before)
+ *before = 0;
+ if (after)
+ *after = 0;
+ return (FALSE);
}
- splx(s);
-
- if (swap_pager_free_pending)
- swap_pager_sync();
/*
- * Free left over swap blocks
+ * find backwards-looking contiguous good backing store
*/
- swap_pager_free_swap(object);
- if (object->un_pager.swp.swp_allocsize) {
- printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n",
- object->un_pager.swp.swp_allocsize);
- }
- swb = object->un_pager.swp.swp_blocks;
- if (swb) {
- /*
- * Free swap management resources
- */
- free(swb, M_VMPGDATA);
- object->un_pager.swp.swp_blocks = NULL;
- }
-}
+ if (before != NULL) {
+ int i;
-static __inline int
-swap_pager_block_index(pindex)
- vm_pindex_t pindex;
-{
- return (pindex / SWB_NPAGES);
-}
-
-static __inline int
-swap_pager_block_offset(pindex)
- vm_pindex_t pindex;
-{
- return (pindex % SWB_NPAGES);
-}
+ for (i = 1; i < (SWB_NPAGES/2); ++i) {
+ daddr_t blk;
-/*
- * swap_pager_haspage returns TRUE if the pager has data that has
- * been written out.
- */
-static boolean_t
-swap_pager_haspage(object, pindex, before, after)
- vm_object_t object;
- vm_pindex_t pindex;
- int *before;
- int *after;
-{
- register sw_blk_t swb;
- int ix;
-
- if (before != NULL)
- *before = 0;
- if (after != NULL)
- *after = 0;
- ix = pindex / SWB_NPAGES;
- if (ix >= object->un_pager.swp.swp_nblocks) {
- return (FALSE);
+ if (i > pindex)
+ break;
+ blk = swp_pager_meta_ctl(object, pindex - i, 0);
+ if (blk & SWAPBLK_NONE)
+ break;
+ if (blk != blk0 - i)
+ break;
+ }
+ *before = (i - 1);
}
- swb = &object->un_pager.swp.swp_blocks[ix];
- ix = pindex % SWB_NPAGES;
-
- if (swb->swb_block[ix] != SWB_EMPTY) {
-
- if (swb->swb_valid & (1 << ix)) {
- int tix;
- if (before) {
- for(tix = ix - 1; tix >= 0; --tix) {
- if ((swb->swb_valid & (1 << tix)) == 0)
- break;
- if ((swb->swb_block[tix] +
- (ix - tix) * (PAGE_SIZE/DEV_BSIZE)) !=
- swb->swb_block[ix])
- break;
- (*before)++;
- }
- }
- if (after) {
- for(tix = ix + 1; tix < SWB_NPAGES; tix++) {
- if ((swb->swb_valid & (1 << tix)) == 0)
- break;
- if ((swb->swb_block[tix] -
- (tix - ix) * (PAGE_SIZE/DEV_BSIZE)) !=
- swb->swb_block[ix])
- break;
- (*after)++;
- }
- }
+ /*
+ * find forward-looking contiguous good backing store
+ */
- return TRUE;
+ if (after != NULL) {
+ int i;
+
+ for (i = 1; i < (SWB_NPAGES/2); ++i) {
+ daddr_t blk;
+
+ blk = swp_pager_meta_ctl(object, pindex + i, 0);
+ if (blk & SWAPBLK_NONE)
+ break;
+ if (blk != blk0 + i)
+ break;
}
+ *after = (i - 1);
}
- return (FALSE);
-}
-/*
- * Wakeup based upon spc state
- */
-static void
-spc_wakeup(void)
-{
- if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) {
- swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT;
- wakeup(&swap_pager_needflags);
- } else if ((swap_pager_needflags & SWAP_FREE_NEEDED) &&
- swap_pager_free_count >= ((2 * npendingio) / 3)) {
- swap_pager_needflags &= ~SWAP_FREE_NEEDED;
- wakeup(&swap_pager_free);
- }
+ return (TRUE);
}
/*
- * Free an spc structure
+ * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
+ *
+ * This removes any associated swap backing store, whether valid or
+ * not, from the page.
+ *
+ * This routine is typically called when a page is made dirty, at
+ * which point any associated swap can be freed. MADV_FREE also
+ * calls us in a special-case situation
+ *
+ * NOTE!!! If the page is clean and the swap was valid, the caller
+ * should make the page dirty before calling this routine. This routine
+ * does NOT change the m->dirty status of the page. Also: MADV_FREE
+ * depends on it.
+ *
+ * This routine may not block
*/
-static void
-spc_free(spc)
- swp_clean_t spc;
-{
- spc->spc_flags = 0;
- TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
- swap_pager_free_count++;
- if (swap_pager_needflags) {
- spc_wakeup();
- }
-}
-/*
- * swap_pager_ridpages is a convienience routine that deallocates all
- * but the required page. this is usually used in error returns that
- * need to invalidate the "extra" readahead pages.
- */
static void
-swap_pager_ridpages(m, count, reqpage)
- vm_page_t *m;
- int count;
- int reqpage;
+swap_pager_unswapped(m)
+ vm_page_t m;
{
- int i;
-
- for (i = 0; i < count; i++) {
- if (i != reqpage) {
- vm_page_free(m[i]);
- }
- }
+ swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
}
/*
- * swap_pager_iodone1 is the completion routine for both reads and async writes
+ * SWAP_PAGER_GETPAGES() - bring pages in from swap
+ *
+ * Attempt to retrieve (m, count) pages from backing store, but make
+ * sure we retrieve at least m[reqpage]. We try to load in as large
+ * a chunk surrounding m[reqpage] as is contiguous in swap and which
+ * belongs to the same object.
+ *
+ * The code is designed for asynchronous operation and
+ * immediate-notification of 'reqpage' but tends not to be
+ * used that way. Please do not optimize-out this algorithmic
+ * feature, I intend to improve on it in the future.
+ *
+ * The parent has a single vm_object_pip_add() reference prior to
+ * calling us and we should return with the same.
+ *
+ * The parent has BUSY'd the pages. We should return with 'm'
+ * left busy, but the others adjusted.
*/
-static void
-swap_pager_iodone1(bp)
- struct buf *bp;
-{
- bp->b_flags |= B_DONE;
- bp->b_flags &= ~B_ASYNC;
- wakeup(bp);
-}
static int
swap_pager_getpages(object, m, count, reqpage)
@@ -926,208 +745,235 @@ swap_pager_getpages(object, m, count, reqpage)
vm_page_t *m;
int count, reqpage;
{
- register struct buf *bp;
- sw_blk_t swb[count];
- register int s;
+ struct buf *bp;
+ vm_page_t mreq;
+ int s;
int i;
- boolean_t rv;
- vm_offset_t kva, off[count];
- vm_pindex_t paging_offset;
- int reqaddr[count];
- int sequential;
-
- int first, last;
- int failed;
- int reqdskregion;
-
- object = m[reqpage]->object;
- paging_offset = OFF_TO_IDX(object->paging_offset);
- sequential = (m[reqpage]->pindex == (object->last_read + 1));
-
- for (i = 0; i < count; i++) {
- vm_pindex_t fidx = m[i]->pindex + paging_offset;
- int ix = swap_pager_block_index(fidx);
-
- if (ix >= object->un_pager.swp.swp_nblocks) {
- int j;
-
- if (i <= reqpage) {
- swap_pager_ridpages(m, count, reqpage);
- return (VM_PAGER_FAIL);
- }
- for (j = i; j < count; j++) {
- vm_page_free(m[j]);
- }
- count = i;
+ int j;
+ daddr_t blk;
+ vm_offset_t kva;
+ vm_pindex_t lastpindex;
+
+ mreq = m[reqpage];
+
+#if !defined(MAX_PERF)
+ if (mreq->object != object) {
+ panic("swap_pager_getpages: object mismatch %p/%p",
+ object,
+ mreq->object
+ );
+ }
+#endif
+ /*
+ * Calculate range to retrieve. The pages have already been assigned
+ * their swapblks. We require a *contiguous* range that falls entirely
+ * within a single device stripe. If we do not supply it, bad things
+ * happen.
+ */
+
+
+ blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
+
+ for (i = reqpage - 1; i >= 0; --i) {
+ daddr_t iblk;
+
+ iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
+ if (iblk & SWAPBLK_NONE)
+ break;
+
+ if ((blk ^ iblk) & dmmax_mask)
+ break;
+
+ if (blk != iblk + (reqpage - i))
break;
- }
- swb[i] = &object->un_pager.swp.swp_blocks[ix];
- off[i] = swap_pager_block_offset(fidx);
- reqaddr[i] = swb[i]->swb_block[off[i]];
}
+ ++i;
- /* make sure that our required input request is existant */
+ for (j = reqpage + 1; j < count; ++j) {
+ daddr_t jblk;
- if (reqaddr[reqpage] == SWB_EMPTY ||
- (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
- swap_pager_ridpages(m, count, reqpage);
- return (VM_PAGER_FAIL);
+ jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
+ if (jblk & SWAPBLK_NONE)
+ break;
+
+ if ((blk ^ jblk) & dmmax_mask)
+ break;
+
+ if (blk != jblk - (j - reqpage))
+ break;
}
- reqdskregion = reqaddr[reqpage] / dmmax;
/*
- * search backwards for the first contiguous page to transfer
+ * If blk itself is bad, well, we can't do any I/O. This should
+ * already be covered as a side effect, but I'm making sure.
*/
- failed = 0;
- first = 0;
- for (i = reqpage - 1; i >= 0; --i) {
- if (sequential || failed || (reqaddr[i] == SWB_EMPTY) ||
- (swb[i]->swb_valid & (1 << off[i])) == 0 ||
- (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
- ((reqaddr[i] / dmmax) != reqdskregion)) {
- failed = 1;
- vm_page_free(m[i]);
- if (first == 0)
- first = i + 1;
- }
+
+ if (blk & SWAPBLK_NONE) {
+ i = reqpage;
+ j = reqpage + 1;
}
+
/*
- * search forwards for the last contiguous page to transfer
+ * free pages outside our collection range. Note: we never free
+ * mreq, it must remain busy throughout.
*/
- failed = 0;
- last = count;
- for (i = reqpage + 1; i < count; i++) {
- if (failed || (reqaddr[i] == SWB_EMPTY) ||
- (swb[i]->swb_valid & (1 << off[i])) == 0 ||
- (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
- ((reqaddr[i] / dmmax) != reqdskregion)) {
- failed = 1;
- vm_page_free(m[i]);
- if (last == count)
- last = i;
- }
- }
- count = last;
- if (first != 0) {
- for (i = first; i < count; i++) {
- m[i - first] = m[i];
- reqaddr[i - first] = reqaddr[i];
- off[i - first] = off[i];
+ {
+ int k;
+
+ for (k = 0; k < i; ++k) {
+ vm_page_free(m[k]);
+ }
+ for (k = j; k < count; ++k) {
+ vm_page_free(m[k]);
}
- count -= first;
- reqpage -= first;
}
- ++swb[reqpage]->swb_locked;
/*
- * at this point: "m" is a pointer to the array of vm_page_t for
- * paging I/O "count" is the number of vm_page_t entries represented
- * by "m" "object" is the vm_object_t for I/O "reqpage" is the index
- * into "m" for the page actually faulted
+ * Return VM_PAGER_FAIL if we have nothing
+ * to do. Return mreq still busy, but the
+ * others unbusied.
*/
+ if (blk & SWAPBLK_NONE)
+ return(VM_PAGER_FAIL);
+
+
/*
* Get a swap buffer header to perform the IO
*/
- bp = getpbuf();
+
+ bp = getpbuf(&nsw_rcount);
kva = (vm_offset_t) bp->b_data;
/*
* map our page(s) into kva for input
+ *
+ * NOTE: B_PAGING is set by pbgetvp()
*/
- pmap_qenter(kva, m, count);
- bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING;
- bp->b_iodone = swap_pager_iodone1;
+ pmap_qenter(kva, m + i, j - i);
+
+ bp->b_flags = B_BUSY | B_READ | B_CALL;
+ bp->b_iodone = swp_pager_async_iodone;
bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */
bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
crhold(bp->b_rcred);
crhold(bp->b_wcred);
bp->b_data = (caddr_t) kva;
- bp->b_blkno = reqaddr[0];
- bp->b_bcount = PAGE_SIZE * count;
- bp->b_bufsize = PAGE_SIZE * count;
+ /*
+ * b_blkno is in page-sized chunks. swapblk is valid, too, so
+ * we don't have to mask it against SWAPBLK_MASK.
+ */
+ bp->b_blkno = blk - (reqpage - i);
+ bp->b_bcount = PAGE_SIZE * (j - i);
+ bp->b_bufsize = PAGE_SIZE * (j - i);
+ bp->b_pager.pg_reqpage = reqpage - i;
+
+ {
+ int k;
+
+ for (k = i; k < j; ++k) {
+ bp->b_pages[k - i] = m[k];
+ vm_page_flag_set(m[k], PG_SWAPINPROG);
+ }
+ }
+ bp->b_npages = j - i;
pbgetvp(swapdev_vp, bp);
cnt.v_swapin++;
- cnt.v_swappgsin += count;
+ cnt.v_swappgsin += bp->b_npages;
+
+ /*
+ * We still hold the lock on mreq, and our automatic completion routine
+ * does not remove it.
+ */
+
+ vm_object_pip_add(mreq->object, bp->b_npages);
+ lastpindex = m[j-1]->pindex;
+
/*
- * perform the I/O
+ * perform the I/O. NOTE!!! bp cannot be considered valid after
+ * this point because we automatically release it on completion.
+ * Instead, we look at the one page we are interested in which we
+ * still hold a lock on even through the I/O completion.
+ *
+ * The other pages in our m[] array are also released on completion,
+ * so we cannot assume they are valid anymore either.
+ *
+ * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
*/
+
VOP_STRATEGY(bp->b_vp, bp);
/*
- * wait for the sync I/O to complete
+ * wait for the page we want to complete. PG_SWAPINPROG is always
+ * cleared on completion. If an I/O error occurs, SWAPBLK_NONE
+ * is set in the meta-data.
*/
+
s = splvm();
- while ((bp->b_flags & B_DONE) == 0) {
- if (tsleep(bp, PVM, "swread", hz*20)) {
+
+ while ((mreq->flags & PG_SWAPINPROG) != 0) {
+ vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
+ cnt.v_intrans++;
+ if (tsleep(mreq, PSWP, "swread", hz*20)) {
printf(
-"swap_pager: indefinite wait buffer: device: %#lx, blkno: %ld, size: %ld\n",
+ "swap_pager: indefinite wait buffer: device:"
+ " %#lx, blkno: %ld, size: %ld\n",
(u_long)bp->b_dev, (long)bp->b_blkno,
- (long)bp->b_bcount);
+ (long)bp->b_bcount
+ );
}
}
- if (bp->b_flags & B_ERROR) {
- printf(
-"swap_pager: I/O error - pagein failed; blkno %ld, size %ld, error %d\n",
- (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error);
- rv = VM_PAGER_ERROR;
- } else {
- rv = VM_PAGER_OK;
- }
-
splx(s);
- swb[reqpage]->swb_locked--;
-
- /*
- * remove the mapping for kernel virtual
- */
- pmap_qremove(kva, count);
/*
- * release the physical I/O buffer
- */
- relpbuf(bp);
- /*
- * finish up input if everything is ok
+ * mreq is left busied after completion, but all the other pages
+ * are freed. If we had an unrecoverable read error the page will
+ * not be valid.
*/
- if (rv == VM_PAGER_OK) {
- for (i = 0; i < count; i++) {
- m[i]->dirty = 0;
- vm_page_flag_clear(m[i], PG_ZERO);
- if (i != reqpage) {
- /*
- * whether or not to leave the page
- * activated is up in the air, but we
- * should put the page on a page queue
- * somewhere. (it already is in the
- * object). After some emperical
- * results, it is best to deactivate
- * the readahead pages.
- */
- vm_page_deactivate(m[i]);
- /*
- * just in case someone was asking for
- * this page we now tell them that it
- * is ok to use
- */
- m[i]->valid = VM_PAGE_BITS_ALL;
- vm_page_wakeup(m[i]);
- }
- }
-
- m[reqpage]->object->last_read = m[count-1]->pindex;
+ if (mreq->valid != VM_PAGE_BITS_ALL) {
+ return(VM_PAGER_ERROR);
} else {
- swap_pager_ridpages(m, count, reqpage);
+ mreq->object->last_read = lastpindex;
+ return(VM_PAGER_OK);
}
- return (rv);
+
+ /*
+ * A final note: in a low swap situation, we cannot deallocate swap
+ * and mark a page dirty here because the caller is likely to mark
+ * the page clean when we return, causing the page to possibly revert
+ * to all-zero's later.
+ */
}
+/*
+ * swap_pager_putpages:
+ *
+ * Assign swap (if necessary) and initiate I/O on the specified pages.
+ *
+ * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
+ * are automatically converted to SWAP objects.
+ *
+ * In a low memory situation we may block in VOP_STRATEGY(), but the new
+ * vm_page reservation system coupled with properly written VFS devices
+ * should ensure that no low-memory deadlock occurs. This is an area
+ * which needs work.
+ *
+ * The parent has N vm_object_pip_add() references prior to
+ * calling us and will remove references for rtvals[] that are
+ * not set to VM_PAGER_PEND. We need to remove the rest on I/O
+ * completion.
+ *
+ * The parent has soft-busy'd the pages it passes us and will unbusy
+ * those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
+ * We need to unbusy the rest on I/O completion.
+ */
+
int
swap_pager_putpages(object, m, count, sync, rtvals)
vm_object_t object;
@@ -1136,534 +982,749 @@ swap_pager_putpages(object, m, count, sync, rtvals)
boolean_t sync;
int *rtvals;
{
- register struct buf *bp;
- sw_blk_t swb[count];
- register int s;
- int i, j, ix, firstidx, lastidx;
- boolean_t rv;
- vm_offset_t kva, off, fidx;
- swp_clean_t spc;
- vm_pindex_t paging_pindex;
- int reqaddr[count];
- int failed;
-
- if (vm_swap_size)
- no_swap_space = 0;
-
- if (no_swap_space) {
- for (i = 0; i < count; i++)
- rtvals[i] = VM_PAGER_FAIL;
- return VM_PAGER_FAIL;
+ int i;
+ int n = 0;
+ int grv = VM_PAGER_OK;
+
+#if !defined(MAX_PERF)
+ if (count && m[0]->object != object) {
+ panic("swap_pager_getpages: object mismatch %p/%p",
+ object,
+ m[0]->object
+ );
+ }
+#endif
+ /*
+ * Step 1
+ *
+ * Turn object into OBJT_SWAP
+ * check for bogus sysops
+ * force sync if not pageout process
+ */
+
+ if (object->type != OBJT_SWAP) {
+ swp_pager_meta_build(object, 0, SWAPBLK_NONE, 0);
}
if (curproc != pageproc)
sync = TRUE;
- object = m[0]->object;
- paging_pindex = OFF_TO_IDX(object->paging_offset);
-
- failed = 0;
- for (j = 0; j < count; j++) {
- fidx = m[j]->pindex + paging_pindex;
- ix = swap_pager_block_index(fidx);
- swb[j] = 0;
- if (ix >= object->un_pager.swp.swp_nblocks) {
- rtvals[j] = VM_PAGER_FAIL;
- failed = 1;
- continue;
- } else {
- rtvals[j] = VM_PAGER_OK;
- }
- swb[j] = &object->un_pager.swp.swp_blocks[ix];
- swb[j]->swb_locked++;
- if (failed) {
- rtvals[j] = VM_PAGER_FAIL;
- continue;
- }
- off = swap_pager_block_offset(fidx);
- reqaddr[j] = swb[j]->swb_block[off];
- if (reqaddr[j] == SWB_EMPTY) {
- daddr_t blk;
- int tries;
- int ntoget;
+ /*
+ * Step 2
+ *
+ * Assign swap blocks and issue I/O. We reallocate swap on the fly.
+ * The page is left dirty until the pageout operation completes
+ * successfully.
+ */
- tries = 0;
- s = splvm();
+ for (i = 0; i < count; i += n) {
+ int s;
+ int j;
+ struct buf *bp;
+ daddr_t blk;
- /*
- * if any other pages have been allocated in this
- * block, we only try to get one page.
- */
- for (i = 0; i < SWB_NPAGES; i++) {
- if (swb[j]->swb_block[i] != SWB_EMPTY)
- break;
- }
+ /*
+ * Maximum I/O size is limited by a number of factors.
+ */
- ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
- /*
- * this code is alittle conservative, but works (the
- * intent of this code is to allocate small chunks for
- * small objects)
- */
- if ((off == 0) && ((fidx + ntoget) > object->size)) {
- ntoget = object->size - fidx;
- }
- retrygetspace:
- if (!swap_pager_full && ntoget > 1 &&
- swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE),
- &blk)) {
-
- for (i = 0; i < ntoget; i++) {
- swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
- swb[j]->swb_valid = 0;
- }
+ n = min(BLIST_MAX_ALLOC, count - i);
+ n = min(n, max_pageout_cluster);
- reqaddr[j] = swb[j]->swb_block[off];
- } else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE),
- &swb[j]->swb_block[off])) {
- /*
- * if the allocation has failed, we try to
- * reclaim space and retry.
- */
- if (++tries == 1) {
- swap_pager_reclaim();
- goto retrygetspace;
- }
- rtvals[j] = VM_PAGER_AGAIN;
- failed = 1;
- swap_pager_full = 1;
- } else {
- reqaddr[j] = swb[j]->swb_block[off];
- swb[j]->swb_valid &= ~(1 << off);
+ /*
+ * Get biggest block of swap we can. If we fail, fall
+ * back and try to allocate a smaller block. Don't go
+ * overboard trying to allocate space if it would overly
+ * fragment swap.
+ */
+ while (
+ (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
+ n > 4
+ ) {
+ n >>= 1;
+ }
+ if (blk == SWAPBLK_NONE) {
+ for (j = 0; j < n; ++j) {
+ rtvals[i+j] = VM_PAGER_FAIL;
}
- splx(s);
+ grv = VM_PAGER_FAIL;
+ continue;
}
- }
- /*
- * search forwards for the last contiguous page to transfer
- */
- failed = 0;
- for (i = 0; i < count; i++) {
- if (failed ||
- (reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) ||
- ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) ||
- (rtvals[i] != VM_PAGER_OK)) {
- failed = 1;
- if (rtvals[i] == VM_PAGER_OK)
- rtvals[i] = VM_PAGER_AGAIN;
+ /*
+ * Oops, too big if it crosses a stripe
+ *
+ * 1111000000
+ * 111111
+ * 1000001
+ */
+ if ((blk ^ (blk + n)) & dmmax_mask) {
+ j = ((blk + dmmax) & dmmax_mask) - blk;
+ swp_pager_freeswapspace(blk + j, n - j);
+ n = j;
}
- }
- ix = 0;
- firstidx = -1;
- for (i = 0; i < count; i++) {
- if (rtvals[i] == VM_PAGER_OK) {
- ix++;
- if (firstidx == -1) {
- firstidx = i;
- }
- } else if (firstidx >= 0) {
- break;
- }
- }
+ /*
+ * All I/O parameters have been satisfied, build the I/O
+ * request and assign the swap space.
+ *
+ * NOTE: B_PAGING is set by pbgetvp()
+ */
- if (firstidx == -1) {
- for (i = 0; i < count; i++) {
- if (rtvals[i] == VM_PAGER_OK)
- rtvals[i] = VM_PAGER_AGAIN;
- }
- return VM_PAGER_AGAIN;
- }
+ bp = getpbuf(&nsw_wcount);
+ bp->b_spc = NULL; /* not used, but NULL-out anyway */
- lastidx = firstidx + ix;
+ pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
- if (ix > max_pageout_cluster) {
- for (i = firstidx + max_pageout_cluster; i < lastidx; i++) {
- if (rtvals[i] == VM_PAGER_OK)
- rtvals[i] = VM_PAGER_AGAIN;
- }
- ix = max_pageout_cluster;
- lastidx = firstidx + ix;
- }
+ bp->b_flags = B_BUSY | B_ASYNC;
+ bp->b_proc = &proc0; /* XXX (but without B_PHYS this is ok) */
+ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
- for (i = 0; i < firstidx; i++) {
- if (swb[i])
- swb[i]->swb_locked--;
- }
+ if (bp->b_rcred != NOCRED)
+ crhold(bp->b_rcred);
+ if (bp->b_wcred != NOCRED)
+ crhold(bp->b_wcred);
+ pbgetvp(swapdev_vp, bp);
- for (i = lastidx; i < count; i++) {
- if (swb[i])
- swb[i]->swb_locked--;
- }
+ bp->b_bcount = PAGE_SIZE * n;
+ bp->b_bufsize = PAGE_SIZE * n;
+ bp->b_blkno = blk;
-#ifdef INVARIANTS
- for (i = firstidx; i < lastidx; i++) {
- if (reqaddr[i] == SWB_EMPTY) {
- printf("I/O to empty block???? -- pindex: %d, i: %d\n",
- m[i]->pindex, i);
- }
- }
-#endif
+ s = splvm();
- /*
- * Clean up all completed async pageouts.
- */
- if (swap_pager_free_pending)
- swap_pager_sync();
+ for (j = 0; j < n; ++j) {
+ vm_page_t mreq = m[i+j];
- /*
- * get a swap pager clean data structure, block until we get it
- */
- if (curproc == pageproc) {
- if (swap_pager_free_count == 0) {
- s = splvm();
- while (swap_pager_free_count == 0) {
- swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT;
- /*
- * if it does not get one within a short time, then
- * there is a potential deadlock, so we go-on trying
- * to free pages. It is important to block here as opposed
- * to returning, thereby allowing the pageout daemon to continue.
- * It is likely that pageout daemon will start suboptimally
- * reclaiming vnode backed pages if we don't block. Since the
- * I/O subsystem is probably already fully utilized, might as
- * well wait.
- */
- if (tsleep(&swap_pager_needflags, PVM-1, "swpfre", hz/2)) {
- if (swap_pager_free_pending)
- swap_pager_sync();
- if (swap_pager_free_count == 0) {
- for (i = firstidx; i < lastidx; i++) {
- rtvals[i] = VM_PAGER_AGAIN;
- }
- splx(s);
- return VM_PAGER_AGAIN;
- }
- } else {
- swap_pager_sync();
- }
- }
- splx(s);
+ swp_pager_meta_build(
+ mreq->object,
+ mreq->pindex,
+ blk + j,
+ 0
+ );
+ mreq->dirty = VM_PAGE_BITS_ALL;
+ rtvals[i+j] = VM_PAGER_OK;
+
+ vm_page_flag_set(mreq, PG_SWAPINPROG);
+ bp->b_pages[j] = mreq;
}
+ bp->b_flags |= B_CALL;
+ bp->b_npages = n;
- spc = TAILQ_FIRST(&swap_pager_free);
- KASSERT(spc != NULL,
- ("swap_pager_putpages: free queue is empty, %d expected\n",
- swap_pager_free_count));
- TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
- swap_pager_free_count--;
-
- kva = spc->spc_kva;
- bp = spc->spc_bp;
- bzero(bp, sizeof *bp);
- bp->b_spc = spc;
- bp->b_xflags = 0;
- bp->b_data = (caddr_t) kva;
- } else {
- spc = NULL;
- bp = getpbuf();
- kva = (vm_offset_t) bp->b_data;
- bp->b_spc = NULL;
- }
+ cnt.v_swapout++;
+ cnt.v_swappgsout += bp->b_npages;
+ swapdev_vp->v_numoutput++;
- /*
- * map our page(s) into kva for I/O
- */
- pmap_qenter(kva, &m[firstidx], ix);
+ /*
+ * asynchronous
+ *
+ * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
+ */
+
+ if (sync == FALSE) {
+ bp->b_iodone = swp_pager_async_iodone;
+ bp->b_dirtyoff = 0;
+ bp->b_dirtyend = bp->b_bcount;
+ VOP_STRATEGY(bp->b_vp, bp);
+
+ for (j = 0; j < n; ++j)
+ rtvals[i+j] = VM_PAGER_PEND;
+
+ splx(s);
+ grv = VM_PAGER_PEND;
+ continue;
+ }
- /*
- * get the base I/O offset into the swap file
- */
- for (i = firstidx; i < lastidx ; i++) {
- fidx = m[i]->pindex + paging_pindex;
- off = swap_pager_block_offset(fidx);
/*
- * set the valid bit
+ * synchronous
+ *
+ * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
*/
- swb[i]->swb_valid |= (1 << off);
+
+ bp->b_iodone = swp_pager_sync_iodone;
+ VOP_STRATEGY(bp->b_vp, bp);
+
/*
- * and unlock the data structure
+ * Wait for the sync I/O to complete, then update rtvals.
+ * We just set the rtvals[] to VM_PAGER_PEND so we can call
+ * our async completion routine at the end, thus avoiding a
+ * double-free.
*/
- swb[i]->swb_locked--;
- }
+ while ((bp->b_flags & B_DONE) == 0) {
+ tsleep(bp, PVM, "swwrt", 0);
+ }
- bp->b_flags = B_BUSY | B_PAGING;
- bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */
- bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
- if (bp->b_rcred != NOCRED)
- crhold(bp->b_rcred);
- if (bp->b_wcred != NOCRED)
- crhold(bp->b_wcred);
- bp->b_blkno = reqaddr[firstidx];
- pbgetvp(swapdev_vp, bp);
+ if (bp->b_flags & B_ERROR) {
+ grv = VM_PAGER_ERROR;
+ }
- bp->b_bcount = PAGE_SIZE * ix;
- bp->b_bufsize = PAGE_SIZE * ix;
+ for (j = 0; j < n; ++j)
+ rtvals[i+j] = VM_PAGER_PEND;
- s = splvm();
- swapdev_vp->v_numoutput++;
+ if (bp->b_flags & B_ERROR) {
+ grv = VM_PAGER_ERROR;
+ }
- /*
- * If this is an async write we set up additional buffer fields and
- * place a "cleaning" entry on the inuse queue.
- */
- object->un_pager.swp.swp_poip++;
-
- if (spc) {
- spc->spc_flags = 0;
- spc->spc_object = object;
- bp->b_npages = ix;
- for (i = firstidx; i < lastidx; i++) {
- spc->spc_m[i] = m[i];
- bp->b_pages[i - firstidx] = m[i];
- vm_page_protect(m[i], VM_PROT_READ);
- pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
- m[i]->dirty = 0;
- }
- spc->spc_first = firstidx;
- spc->spc_count = ix;
/*
- * the completion routine for async writes
+ * Now that we are through with the bp, we can call the
+ * normal async completion, which frees everything up.
*/
- bp->b_flags |= B_CALL;
- bp->b_iodone = swap_pager_iodone;
- bp->b_dirtyoff = 0;
- bp->b_dirtyend = bp->b_bcount;
- TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
- } else {
- bp->b_flags |= B_CALL;
- bp->b_iodone = swap_pager_iodone1;
- bp->b_npages = ix;
- for (i = firstidx; i < lastidx; i++)
- bp->b_pages[i - firstidx] = m[i];
- }
- cnt.v_swapout++;
- cnt.v_swappgsout += ix;
+ swp_pager_async_iodone(bp);
- /*
- * perform the I/O
- */
- VOP_STRATEGY(bp->b_vp, bp);
- if (sync == FALSE) {
- if (swap_pager_free_pending) {
- swap_pager_sync();
- }
- for (i = firstidx; i < lastidx; i++) {
- rtvals[i] = VM_PAGER_PEND;
- }
splx(s);
- return VM_PAGER_PEND;
}
+ return(grv);
+}
+
+/*
+ * swap_pager_sync_iodone:
+ *
+ * Completion routine for synchronous reads and writes from/to swap.
+ * We just mark the bp is complete and wake up anyone waiting on it.
+ *
+ * This routine may not block.
+ */
+
+static void
+swp_pager_sync_iodone(bp)
+ struct buf *bp;
+{
+ bp->b_flags |= B_DONE;
+ bp->b_flags &= ~B_ASYNC;
+ wakeup(bp);
+}
+
+/*
+ * swp_pager_async_iodone:
+ *
+ * Completion routine for asynchronous reads and writes from/to swap.
+ * Also called manually by synchronous code to finish up a bp.
+ *
+ * WARNING! This routine may be called from an interrupt. We cannot
+ * mess with swap metadata unless we want to run all our other routines
+ * at splbio() too, which I'd rather not do. We up ourselves
+ * to splvm() because we may call vm_page_free(), which can unlink a
+ * page from an object.
+ *
+ * XXX currently I do not believe any object routines protect
+ * object->memq at splvm(). The code must be gone over to determine
+ * the actual state of the problem.
+ *
+ * For READ operations, the pages are PG_BUSY'd. For WRITE operations,
+ * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY
+ * unbusy all pages except the 'main' request page. For WRITE
+ * operations, we vm_page_t->busy'd unbusy all pages ( we can do this
+ * because we marked them all VM_PAGER_PEND on return from putpages ).
+ *
+ * This routine may not block.
+ * This routine is called at splbio()
+ */
+
+static void
+swp_pager_async_iodone(bp)
+ register struct buf *bp;
+{
+ int s;
+ int i;
+ vm_object_t object = NULL;
+
+ s = splvm();
+
+ bp->b_flags |= B_DONE;
+
/*
- * wait for the sync I/O to complete
+ * report error
*/
- while ((bp->b_flags & B_DONE) == 0) {
- tsleep(bp, PVM, "swwrt", 0);
- }
if (bp->b_flags & B_ERROR) {
printf(
-"swap_pager: I/O error - pageout failed; blkno %ld, size %ld, error %d\n",
- (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error);
- rv = VM_PAGER_ERROR;
- } else {
- rv = VM_PAGER_OK;
+ "swap_pager: I/O error - %s failed; blkno %ld,"
+ "size %ld, error %d\n",
+ ((bp->b_flags & B_READ) ? "pagein" : "pageout"),
+ (long)bp->b_blkno,
+ (long)bp->b_bcount,
+ bp->b_error
+ );
}
- object->un_pager.swp.swp_poip--;
- if (object->un_pager.swp.swp_poip == 0)
- wakeup(object);
-
- if (bp->b_vp)
- pbrelvp(bp);
+ /*
+ * set object.
+ */
- splx(s);
+ if (bp->b_npages)
+ object = bp->b_pages[0]->object;
/*
* remove the mapping for kernel virtual
*/
- pmap_qremove(kva, ix);
+
+ pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
/*
- * if we have written the page, then indicate that the page is clean.
+ * cleanup pages. If an error occurs writing to swap, we are in
+ * very serious trouble. If it happens to be a disk error, though,
+ * we may be able to recover by reassigning the swap later on. So
+ * in this case we remove the m->swapblk assignment for the page
+ * but do not free it in the rlist. The erroneous block(s) are thus
+ * never reallocated as swap. Redirty the page and continue.
*/
- if (rv == VM_PAGER_OK) {
- for (i = firstidx; i < lastidx; i++) {
- if (rtvals[i] == VM_PAGER_OK) {
- pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
- m[i]->dirty = 0;
+
+ for (i = 0; i < bp->b_npages; ++i) {
+ vm_page_t m = bp->b_pages[i];
+
+ vm_page_flag_clear(m, PG_SWAPINPROG);
+
+ if (bp->b_flags & B_ERROR) {
+ /*
+ * If an error occurs I'd love to throw the swapblk
+ * away without freeing it back to swapspace, so it
+ * can never be used again. But I can't from an
+ * interrupt.
+ */
+
+ if (bp->b_flags & B_READ) {
/*
- * optimization, if a page has been read
- * during the pageout process, we activate it.
+ * When reading, reqpage needs to stay
+ * locked for the parent, but all other
+ * pages can be freed. We still want to
+ * wakeup the parent waiting on the page,
+ * though. ( also: pg_reqpage can be -1 and
+ * not match anything ).
+ *
+ * We have to wake specifically requested pages
+ * up too because we cleared PG_SWAPINPROG and
+ * someone may be waiting for that.
+ *
+ * NOTE: for reads, m->dirty will probably
+ * be overridden by the original caller of
+ * getpages so don't play cute tricks here.
+ *
+ * XXX it may not be legal to free the page
+ * here as this messes with the object->memq's.
*/
- if (((m[i]->flags & (PG_WANTED|PG_REFERENCED)) ||
- pmap_ts_referenced(VM_PAGE_TO_PHYS(m[i])))) {
- vm_page_activate(m[i]);
- }
+
+ m->valid = 0;
+ vm_page_flag_clear(m, PG_ZERO);
+
+ if (i != bp->b_pager.pg_reqpage)
+ vm_page_free(m);
+ else
+ vm_page_flash(m);
+ /*
+ * If i == bp->b_pager.pg_reqpage, do not wake
+ * the page up. The caller needs to.
+ */
+ } else {
+ /*
+ * If a write error occurs, reactivate page
+ * so it doesn't clog the inactive list,
+ * then finish the I/O.
+ */
+ m->dirty = VM_PAGE_BITS_ALL;
+ vm_page_activate(m);
+ vm_page_io_finish(m);
}
- }
- } else {
- for (i = firstidx; i < lastidx; i++) {
- rtvals[i] = rv;
+ } else if (bp->b_flags & B_READ) {
+ /*
+ * For read success, clear dirty bits. Nobody should
+ * have this page mapped but don't take any chances,
+ * make sure the pmap modify bits are also cleared.
+ *
+ * NOTE: for reads, m->dirty will probably be
+ * overridden by the original caller of getpages so
+ * we cannot set them in order to free the underlying
+ * swap in a low-swap situation. I don't think we'd
+ * want to do that anyway, but it was an optimization
+ * that existed in the old swapper for a time before
+ * it got ripped out due to precisely this problem.
+ *
+ * clear PG_ZERO in page.
+ *
+ * If not the requested page then deactivate it.
+ *
+ * Note that the requested page, reqpage, is left
+ * busied, but we still have to wake it up. The
+ * other pages are released (unbusied) by
+ * vm_page_wakeup(). We do not set reqpage's
+ * valid bits here, it is up to the caller.
+ */
+
+ pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+ m->valid = VM_PAGE_BITS_ALL;
+ m->dirty = 0;
+ vm_page_flag_clear(m, PG_ZERO);
+
+ /*
+ * We have to wake specifically requested pages
+ * up too because we cleared PG_SWAPINPROG and
+ * could be waiting for it in getpages. However,
+ * be sure to not unbusy getpages specifically
+ * requested page - getpages expects it to be
+ * left busy.
+ */
+ if (i != bp->b_pager.pg_reqpage) {
+ vm_page_deactivate(m);
+ vm_page_wakeup(m);
+ } else {
+ vm_page_flash(m);
+ }
+ } else {
+ /*
+ * For write success, clear the modify and dirty
+ * status, then finish the I/O ( which decrements the
+ * busy count and possibly wakes waiter's up ).
+ */
+ vm_page_protect(m, VM_PROT_READ);
+ pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+ m->dirty = 0;
+ vm_page_io_finish(m);
}
}
- if (spc != NULL) {
- if (bp->b_rcred != NOCRED)
- crfree(bp->b_rcred);
- if (bp->b_wcred != NOCRED)
- crfree(bp->b_wcred);
- spc_free(spc);
- } else
- relpbuf(bp);
- if (swap_pager_free_pending)
- swap_pager_sync();
-
- return (rv);
+ /*
+ * adjust pip. NOTE: the original parent may still have its own
+ * pip refs on the object.
+ */
+
+ if (object)
+ vm_object_pip_wakeupn(object, bp->b_npages);
+
+ /*
+ * release the physical I/O buffer
+ */
+
+ relpbuf(bp, ((bp->b_flags & B_READ) ? &nsw_rcount : &nsw_wcount));
+
+ splx(s);
}
-void
-swap_pager_sync()
+/************************************************************************
+ * SWAP META DATA *
+ ************************************************************************
+ *
+ * These routines manipulate the swap metadata stored in the
+ * OBJT_SWAP object.
+ *
+ * In fact, we just have a few counters in the vm_object_t. The
+ * metadata is actually stored in a hash table.
+ */
+
+/*
+ * SWP_PAGER_HASH() - hash swap meta data
+ *
+ * This is an inline helper function which hashes the swapblk given
+ * the object and page index. It returns a pointer to a pointer
+ * to the swblock structure, or a pointer to a NULL pointer if it
+ * could not find a swapblk.
+ */
+
+static __inline struct swblock **
+swp_pager_hash(vm_object_t object, daddr_t index)
{
- swp_clean_t spc;
+ struct swblock **pswap;
+ struct swblock *swap;
+
+ index &= ~SWAP_META_MASK;
+ pswap = &swhash[(index ^ (int)(long)object) & swhash_mask];
- while (spc = TAILQ_FIRST(&swap_pager_done)) {
- swap_pager_finish(spc);
+ while ((swap = *pswap) != NULL) {
+ if (swap->swb_object == object &&
+ swap->swb_index == index
+ ) {
+ break;
+ }
+ pswap = &swap->swb_hnext;
}
- return;
+ return(pswap);
}
+/*
+ * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
+ *
+ * We first convert the object to a swap object if it is a default
+ * object.
+ *
+ * The specified swapblk is added to the object's swap metadata. If
+ * the swapblk is not valid, it is freed instead. Any previously
+ * assigned swapblk is freed.
+ */
+
static void
-swap_pager_finish(spc)
- register swp_clean_t spc;
-{
- int i, s, lastidx;
- vm_object_t object;
- vm_page_t *ma;
+swp_pager_meta_build(
+ vm_object_t object,
+ daddr_t index,
+ daddr_t swapblk,
+ int waitok
+) {
+ struct swblock *swap;
+ struct swblock **pswap;
- ma = spc->spc_m;
- object = spc->spc_object;
- lastidx = spc->spc_first + spc->spc_count;
+ /*
+ * Convert default object to swap object if necessary
+ */
- s = splvm();
- TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
- splx(s);
+ if (object->type != OBJT_SWAP) {
+ object->type = OBJT_SWAP;
+ object->un_pager.swp.swp_bcount = 0;
+
+ if (object->handle != NULL) {
+ TAILQ_INSERT_TAIL(
+ NOBJLIST(object->handle),
+ object,
+ pager_object_list
+ );
+ } else {
+ TAILQ_INSERT_TAIL(
+ &swap_pager_un_object_list,
+ object,
+ pager_object_list
+ );
+ }
+ }
+
+ /*
+ * Wait for free memory when waitok is TRUE prior to calling the
+ * zone allocator.
+ */
- pmap_qremove(spc->spc_kva, spc->spc_count);
+ while (waitok && cnt.v_free_count == 0) {
+ VM_WAIT;
+ }
/*
- * If no error, mark as clean and inform the pmap system. If error,
- * mark as dirty so we will try again. (XXX could get stuck doing
- * this, should give up after awhile)
+ * If swapblk being added is invalid, just free it.
*/
- if (spc->spc_flags & SPC_ERROR) {
- for (i = spc->spc_first; i < lastidx; i++) {
- printf("swap_pager_finish: I/O error, clean of page %lx failed\n",
- (u_long) VM_PAGE_TO_PHYS(ma[i]));
- ma[i]->dirty = VM_PAGE_BITS_ALL;
- vm_page_io_finish(ma[i]);
+ if (swapblk & SWAPBLK_NONE) {
+ if (swapblk != SWAPBLK_NONE) {
+ swp_pager_freeswapspace(
+ index,
+ 1
+ );
+ swapblk = SWAPBLK_NONE;
}
+ }
- vm_object_pip_subtract(object, spc->spc_count);
- if ((object->paging_in_progress == 0) &&
- (object->flags & OBJ_PIPWNT)) {
- vm_object_clear_flag(object, OBJ_PIPWNT);
- wakeup(object);
- }
+ /*
+ * Locate hash entry. If not found create, but if we aren't adding
+ * anything just return.
+ */
- } else {
- for (i = spc->spc_first; i < lastidx; i++) {
- if ((ma[i]->queue != PQ_ACTIVE) &&
- ((ma[i]->flags & PG_WANTED) ||
- pmap_ts_referenced(VM_PAGE_TO_PHYS(ma[i])))) {
- vm_page_activate(ma[i]);
- }
- }
+ pswap = swp_pager_hash(object, index);
+
+ if ((swap = *pswap) == NULL) {
+ int i;
+
+ if (swapblk == SWAPBLK_NONE)
+ return;
+
+ swap = *pswap = zalloc(swap_zone);
+
+ swap->swb_hnext = NULL;
+ swap->swb_object = object;
+ swap->swb_index = index & ~SWAP_META_MASK;
+ swap->swb_count = 0;
+
+ ++object->un_pager.swp.swp_bcount;
+
+ for (i = 0; i < SWAP_META_PAGES; ++i)
+ swap->swb_pages[i] = SWAPBLK_NONE;
}
- nswiodone -= spc->spc_count;
- swap_pager_free_pending--;
- spc_free(spc);
+ /*
+ * Delete prior contents of metadata
+ */
- return;
+ index &= SWAP_META_MASK;
+
+ if (swap->swb_pages[index] != SWAPBLK_NONE) {
+ swp_pager_freeswapspace(
+ swap->swb_pages[index] & SWAPBLK_MASK,
+ 1
+ );
+ --swap->swb_count;
+ }
+
+ /*
+ * Enter block into metadata
+ */
+
+ swap->swb_pages[index] = swapblk;
+ ++swap->swb_count;
}
/*
- * swap_pager_iodone
+ * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
+ *
+ * The requested range of blocks is freed, with any associated swap
+ * returned to the swap bitmap.
+ *
+ * This routine will free swap metadata structures as they are cleaned
+ * out. This routine does *NOT* operate on swap metadata associated
+ * with resident pages.
+ *
+ * This routine must be called at splvm()
*/
+
static void
-swap_pager_iodone(bp)
- register struct buf *bp;
+swp_pager_meta_free(vm_object_t object, daddr_t index, daddr_t count)
{
- int i, s, lastidx;
- register swp_clean_t spc;
- vm_object_t object;
- vm_page_t *ma;
+ if (object->type != OBJT_SWAP)
+ return;
+ while (count > 0) {
+ struct swblock **pswap;
+ struct swblock *swap;
- s = splvm();
- spc = (swp_clean_t) bp->b_spc;
- TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
- TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
+ pswap = swp_pager_hash(object, index);
- object = spc->spc_object;
+ if ((swap = *pswap) != NULL) {
+ daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
-#if defined(DIAGNOSTIC)
- if (object->paging_in_progress < spc->spc_count)
- printf("swap_pager_iodone: paging_in_progress(%d) < spc_count(%d)\n",
- object->paging_in_progress, spc->spc_count);
-#endif
-
- if (bp->b_flags & B_ERROR) {
- spc->spc_flags |= SPC_ERROR;
- printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n",
- (bp->b_flags & B_READ) ? "pagein" : "pageout",
- (u_long) bp->b_blkno, bp->b_bcount, bp->b_error);
- } else {
- vm_object_pip_subtract(object, spc->spc_count);
- if ((object->paging_in_progress == 0) &&
- (object->flags & OBJ_PIPWNT)) {
- vm_object_clear_flag(object, OBJ_PIPWNT);
- wakeup(object);
- }
- ma = spc->spc_m;
- lastidx = spc->spc_first + spc->spc_count;
- for (i = spc->spc_first; i < lastidx; i++) {
- /*
- * we wakeup any processes that are waiting on these pages.
- */
- vm_page_io_finish(ma[i]);
+ if (v != SWAPBLK_NONE) {
+ swp_pager_freeswapspace(v, 1);
+ swap->swb_pages[index & SWAP_META_MASK] =
+ SWAPBLK_NONE;
+ if (--swap->swb_count == 0) {
+ *pswap = swap->swb_hnext;
+ zfree(swap_zone, swap);
+ --object->un_pager.swp.swp_bcount;
+ }
+ }
+ --count;
+ ++index;
+ } else {
+ daddr_t n = SWAP_META_PAGES - (index & SWAP_META_MASK);
+ count -= n;
+ index += n;
}
}
+}
+
+/*
+ * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
+ *
+ * This routine locates and destroys all swap metadata associated with
+ * an object.
+ */
+
+static void
+swp_pager_meta_free_all(vm_object_t object)
+{
+ daddr_t index = 0;
- if (bp->b_vp)
- pbrelvp(bp);
+ if (object->type != OBJT_SWAP)
+ return;
- if (bp->b_rcred != NOCRED)
- crfree(bp->b_rcred);
- if (bp->b_wcred != NOCRED)
- crfree(bp->b_wcred);
+ while (object->un_pager.swp.swp_bcount) {
+ struct swblock **pswap;
+ struct swblock *swap;
- nswiodone += spc->spc_count;
- swap_pager_free_pending++;
- if (--spc->spc_object->un_pager.swp.swp_poip == 0) {
- wakeup(spc->spc_object);
- }
+ pswap = swp_pager_hash(object, index);
+ if ((swap = *pswap) != NULL) {
+ int i;
- if (swap_pager_needflags &&
- ((swap_pager_free_count + swap_pager_free_pending) > (npendingio / 2))) {
- spc_wakeup();
+ for (i = 0; i < SWAP_META_PAGES; ++i) {
+ daddr_t v = swap->swb_pages[i];
+ if (v != SWAPBLK_NONE) {
+#if !defined(MAX_PERF)
+ --swap->swb_count;
+#endif
+ swp_pager_freeswapspace(
+ v,
+ 1
+ );
+ }
+ }
+#if !defined(MAX_PERF)
+ if (swap->swb_count != 0)
+ panic("swap_pager_meta_free_all: swb_count != 0");
+#endif
+ *pswap = swap->swb_hnext;
+ zfree(swap_zone, swap);
+ --object->un_pager.swp.swp_bcount;
+ }
+ index += SWAP_META_PAGES;
+#if !defined(MAX_PERF)
+ if (index > 0x20000000)
+ panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
+#endif
}
+}
- if ((TAILQ_FIRST(&swap_pager_inuse) == NULL) &&
- vm_pageout_pages_needed) {
- wakeup(&vm_pageout_pages_needed);
- vm_pageout_pages_needed = 0;
+/*
+ * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data.
+ *
+ * This routine is capable of looking up, popping, or freeing
+ * swapblk assignments in the swap meta data or in the vm_page_t.
+ * The routine typically returns the swapblk being looked-up, or popped,
+ * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
+ * was invalid. This routine will automatically free any invalid
+ * meta-data swapblks.
+ *
+ * It is not possible to store invalid swapblks in the swap meta data
+ * (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
+ *
+ * When acting on a busy resident page and paging is in progress, we
+ * have to wait until paging is complete but otherwise can act on the
+ * busy page.
+ *
+ * SWM_FREE remove and free swap block from metadata
+ *
+ * SWM_POP remove from meta data but do not free.. pop it out
+ */
+
+static daddr_t
+swp_pager_meta_ctl(
+ vm_object_t object,
+ vm_pindex_t index,
+ int flags
+) {
+ /*
+ * The meta data only exists if the object is OBJT_SWAP
+ * and even then might not be allocated yet.
+ */
+
+ if (
+ object->type != OBJT_SWAP ||
+ object->un_pager.swp.swp_bcount == 0
+ ) {
+ return(SWAPBLK_NONE);
}
- splx(s);
+ {
+ struct swblock **pswap;
+ struct swblock *swap;
+ daddr_t r1 = SWAPBLK_NONE;
+
+ pswap = swp_pager_hash(object, index);
+
+ index &= SWAP_META_MASK;
+
+ if ((swap = *pswap) != NULL) {
+ r1 = swap->swb_pages[index];
+
+ if (r1 != SWAPBLK_NONE) {
+ if (flags & SWM_FREE) {
+ swp_pager_freeswapspace(
+ r1,
+ 1
+ );
+ r1 = SWAPBLK_NONE;
+ }
+ if (flags & (SWM_FREE|SWM_POP)) {
+ swap->swb_pages[index] = SWAPBLK_NONE;
+ if (--swap->swb_count == 0) {
+ *pswap = swap->swb_hnext;
+ zfree(swap_zone, swap);
+ --object->un_pager.swp.swp_bcount;
+ }
+ }
+ }
+ }
+
+ return(r1);
+ }
+ /* not reached */
}
+