Diffstat (limited to 'sys/vm/swap_pager.c')
-rw-r--r--   sys/vm/swap_pager.c   511
1 files changed, 503 insertions, 8 deletions
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 752974c0e01c..faf90759e33d 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -2,7 +2,7 @@
  * Copyright (c) 1998 Matthew Dillon,
  * Copyright (c) 1994 John S. Dyson
  * Copyright (c) 1990 University of Utah.
- * Copyright (c) 1991, 1993
+ * Copyright (c) 1982, 1986, 1989, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
@@ -63,11 +63,16 @@
  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
  *
  *      @(#)swap_pager.c        8.9 (Berkeley) 3/21/94
+ *      @(#)vm_swap.c   8.5 (Berkeley) 2/17/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_mac.h"
+#include "opt_swap.h"
+#include "opt_vm.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
@@ -75,15 +80,20 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
 #include <sys/vnode.h>
+#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
+#include <sys/sysproto.h>
 #include <sys/blist.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 
-#include "opt_swap.h"
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
@@ -92,10 +102,35 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
+#include <vm/vm_param.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
+#ifndef NSWAPDEV
+#define NSWAPDEV        4
+#endif
+
+typedef int32_t swblk_t;        /* swap offset */
+
+struct swblock {
+        struct swblock  *swb_hnext;
+        vm_object_t     swb_object;
+        vm_pindex_t     swb_index;
+        int             swb_count;
+        daddr_t         swb_pages[SWAP_META_PAGES];
+};
+
+static struct swdevt should_be_malloced[NSWAPDEV];
+static struct swdevt *swdevt = should_be_malloced;
+static int nswap;               /* first block after the interleaved devs */
+static int nswdev = NSWAPDEV;
+int vm_swap_size;
+static int swdev_syscall_active = 0; /* serialize swap(on|off) */
+
+static int swapdev_strategy(struct vop_strategy_args *ap);
+static struct vnode *swapdev_vp;
+
 #define SWM_FREE        0x02    /* free, period                 */
 #define SWM_POP         0x04    /* pop out                      */
 
@@ -131,8 +166,8 @@ SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
 static struct mtx sw_alloc_mtx;        /* protect list manipulation */
 static struct pagerlst  swap_pager_object_list[NOBJLISTS];
-struct pagerlst         swap_pager_un_object_list;
-uma_zone_t              swap_zone;
+static struct pagerlst  swap_pager_un_object_list;
+static uma_zone_t       swap_zone;
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
@@ -149,6 +184,7 @@ static boolean_t
 static void     swap_pager_init(void);
 static void     swap_pager_unswapped(vm_page_t);
 static void     swap_pager_strategy(vm_object_t, struct bio *);
+static void     swap_pager_swapoff(int devidx, int *sw_used);
 
 struct pagerops swappagerops = {
         swap_pager_init,        /* early system initialization of pager */
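The struct swblock added above is the swap pager's per-object metadata: each entry covers SWAP_META_PAGES consecutive page indices of one object, records one swap block address per page, and is chained through swb_hnext in a hash table. As a rough, standalone illustration of that mapping only (the hash function, the SWAPBLK_NONE sentinel and every concrete value below are stand-ins for this sketch, not definitions taken from this change):

#include <stdio.h>
#include <stdint.h>

#define SWAP_META_PAGES 16                      /* pages covered per entry */
#define SWAPBLK_NONE    ((int64_t)-1)           /* "no swap assigned" sentinel */
#define HASH_SIZE       64

struct swblock_model {
        struct swblock_model *swb_hnext;        /* hash chain */
        uintptr_t swb_object;                   /* owning object, as an opaque id */
        uint64_t swb_index;                     /* first page index covered */
        int swb_count;                          /* live entries in swb_pages[] */
        int64_t swb_pages[SWAP_META_PAGES];     /* swap block per page */
};

static struct swblock_model *hash_table[HASH_SIZE];

static unsigned
swb_hash(uintptr_t object, uint64_t base)
{
        return (unsigned)((object ^ base) % HASH_SIZE);
}

/* Resolve (object, pindex) to a swap block, or SWAPBLK_NONE if unassigned. */
static int64_t
swb_lookup(uintptr_t object, uint64_t pindex)
{
        uint64_t base = pindex & ~(uint64_t)(SWAP_META_PAGES - 1);
        struct swblock_model *swb;

        for (swb = hash_table[swb_hash(object, base)]; swb != NULL;
            swb = swb->swb_hnext)
                if (swb->swb_object == object && swb->swb_index == base)
                        return (swb->swb_pages[pindex - base]);
        return (SWAPBLK_NONE);
}

int
main(void)
{
        static struct swblock_model swb = {
                .swb_object = 1, .swb_index = 0, .swb_count = 1 };
        int i;

        for (i = 0; i < SWAP_META_PAGES; i++)
                swb.swb_pages[i] = SWAPBLK_NONE;
        swb.swb_pages[5] = 1234;                /* page 5 lives at swap block 1234 */
        hash_table[swb_hash(1, 0)] = &swb;

        printf("page 5 -> %lld, page 6 -> %lld\n",
            (long long)swb_lookup(1, 5), (long long)swb_lookup(1, 6));
        return (0);
}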
@@ -172,9 +208,9 @@ static void waitchainbuf(struct bio *bp, int count, int done);
  * swap_*() routines are externally accessible.  swp_*() routines are
  * internal.
  */
-int dmmax, dmmax_mask;
-int nswap_lowat = 128;          /* in pages, swap_pager_almost_full warn */
-int nswap_hiwat = 512;          /* in pages, swap_pager_almost_full warn */
+static int dmmax, dmmax_mask;
+static int nswap_lowat = 128;   /* in pages, swap_pager_almost_full warn */
+static int nswap_hiwat = 512;   /* in pages, swap_pager_almost_full warn */
 
 SYSCTL_INT(_vm, OID_AUTO, dmmax,
         CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
@@ -1738,7 +1774,7 @@ swp_pager_force_pagein(struct swblock *swap, int idx)
  *
  *      This routine may block.
  */
-void
+static void
 swap_pager_swapoff(int devidx, int *sw_used)
 {
         struct swblock **pswap;
@@ -2205,3 +2241,462 @@ waitchainbuf(struct bio *bp, int limit, int done)
         splx(s);
 }
 
+/*
+ * swapdev_strategy:
+ *
+ *      VOP_STRATEGY() for swapdev_vp.
+ *      Perform swap strategy interleave device selection.
+ *
+ *      The bp is expected to be locked and *not* B_DONE on call.
+ */
+static int
+swapdev_strategy(ap)
+        struct vop_strategy_args /* {
+                struct vnode *a_vp;
+                struct buf *a_bp;
+        } */ *ap;
+{
+        int s, sz, off, seg, index;
+        struct swdevt *sp;
+        struct vnode *vp;
+        struct buf *bp;
+
+        KASSERT(ap->a_vp == ap->a_bp->b_vp, ("%s(%p != %p)",
+            __func__, ap->a_vp, ap->a_bp->b_vp));
+        bp = ap->a_bp;
+        sz = howmany(bp->b_bcount, PAGE_SIZE);
+
+        /*
+         * Convert interleaved swap into per-device swap.  Note that
+         * the block size is left in PAGE_SIZE'd chunks (for the newswap)
+         * here.
+         */
+        if (nswdev > 1) {
+                off = bp->b_blkno % dmmax;
+                if (off + sz > dmmax) {
+                        bp->b_error = EINVAL;
+                        bp->b_ioflags |= BIO_ERROR;
+                        bufdone(bp);
+                        return 0;
+                }
+                seg = bp->b_blkno / dmmax;
+                index = seg % nswdev;
+                seg /= nswdev;
+                bp->b_blkno = seg * dmmax + off;
+        } else {
+                index = 0;
+        }
+        sp = &swdevt[index];
+        if (bp->b_blkno + sz > sp->sw_nblks) {
+                bp->b_error = EINVAL;
+                bp->b_ioflags |= BIO_ERROR;
+                bufdone(bp);
+                return 0;
+        }
+        bp->b_dev = sp->sw_device;
+        if (sp->sw_vp == NULL) {
+                bp->b_error = ENODEV;
+                bp->b_ioflags |= BIO_ERROR;
+                bufdone(bp);
+                return 0;
+        }
+
+        /*
+         * Convert from PAGE_SIZE'd to DEV_BSIZE'd chunks for the actual I/O
+         */
+        bp->b_blkno = ctodb(bp->b_blkno);
+
+        vhold(sp->sw_vp);
+        s = splvm();
+        if (bp->b_iocmd == BIO_WRITE) {
+                vp = bp->b_vp;
+                if (vp) {
+                        VI_LOCK(vp);
+                        vp->v_numoutput--;
+                        if ((vp->v_iflag & VI_BWAIT) && vp->v_numoutput <= 0) {
+                                vp->v_iflag &= ~VI_BWAIT;
+                                wakeup(&vp->v_numoutput);
+                        }
+                        VI_UNLOCK(vp);
+                }
+                VI_LOCK(sp->sw_vp);
+                sp->sw_vp->v_numoutput++;
+                VI_UNLOCK(sp->sw_vp);
+        }
+        bp->b_vp = sp->sw_vp;
+        splx(s);
+        if (bp->b_vp->v_type == VCHR)
+                VOP_SPECSTRATEGY(bp->b_vp, bp);
+        else
+                VOP_STRATEGY(bp->b_vp, bp);
+        return 0;
+}
+
+/*
+ * Create a special vnode op vector for swapdev_vp - we only use
+ * VOP_STRATEGY() and reclaim; everything else returns an error.
+ */
+vop_t **swapdev_vnodeop_p;
+static struct vnodeopv_entry_desc swapdev_vnodeop_entries[] = {
+        { &vop_default_desc,            (vop_t *) vop_defaultop },
+        { &vop_reclaim_desc,            (vop_t *) vop_null },
+        { &vop_strategy_desc,           (vop_t *) swapdev_strategy },
+        { NULL, NULL }
+};
+static struct vnodeopv_desc swapdev_vnodeop_opv_desc =
+        { &swapdev_vnodeop_p, swapdev_vnodeop_entries };
+
+VNODEOP_SET(swapdev_vnodeop_opv_desc);
+
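swapdev_strategy() above turns a block number in the global, interleaved swap space into a device index plus a block number local to that device: dmmax-page stripes rotate round-robin over the nswdev devices, and the off + sz > dmmax check rejects any I/O that would cross a stripe. A standalone sketch of just that arithmetic, with illustrative values for dmmax and nswdev (the real ones are kernel tunables, dmmax being a power of two):

#include <stdio.h>

#define DMMAX   32      /* stripe size in pages (dmmax in the driver); assumed */
#define NSWDEV  4       /* number of interleaved swap devices; assumed */

/* Map a global page-sized swap block number to (device, per-device block). */
static void
interleave(long blkno, int *devindex, long *devblk)
{
        long off = blkno % DMMAX;       /* offset within one stripe */
        long seg = blkno / DMMAX;       /* which stripe, globally */

        *devindex = seg % NSWDEV;               /* stripes rotate over devices */
        *devblk = (seg / NSWDEV) * DMMAX + off; /* block within that device */
}

int
main(void)
{
        long blkno, blk;
        int dev;

        for (blkno = 0; blkno < 6 * DMMAX; blkno += DMMAX) {
                interleave(blkno, &dev, &blk);
                printf("global %4ld -> device %d, local %4ld\n", blkno, dev, blk);
        }
        return (0);
}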
+/*
+ * System call swapon(name) enables swapping on device name,
+ * which must be in the swdevsw.  Return EBUSY
+ * if already swapping on this device.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct swapon_args {
+        char *name;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+swapon(td, uap)
+        struct thread *td;
+        struct swapon_args *uap;
+{
+        struct vattr attr;
+        struct vnode *vp;
+        struct nameidata nd;
+        int error;
+
+        mtx_lock(&Giant);
+        error = suser(td);
+        if (error)
+                goto done2;
+
+        while (swdev_syscall_active)
+            tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
+        swdev_syscall_active = 1;
+
+        /*
+         * Swap metadata may not fit in the KVM if we have physical
+         * memory of >1GB.
+         */
+        if (swap_zone == NULL) {
+                error = ENOMEM;
+                goto done;
+        }
+
+        NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
+        error = namei(&nd);
+        if (error)
+                goto done;
+
+        NDFREE(&nd, NDF_ONLY_PNBUF);
+        vp = nd.ni_vp;
+
+        if (vn_isdisk(vp, &error))
+                error = swaponvp(td, vp, vp->v_rdev, 0);
+        else if (vp->v_type == VREG &&
+            (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
+            (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) {
+                /*
+                 * Allow direct swapping to NFS regular files in the same
+                 * way that nfs_mountroot() sets up diskless swapping.
+                 */
+                error = swaponvp(td, vp, NODEV, attr.va_size / DEV_BSIZE);
+        }
+
+        if (error)
+                vrele(vp);
+done:
+        swdev_syscall_active = 0;
+        wakeup_one(&swdev_syscall_active);
+done2:
+        mtx_unlock(&Giant);
+        return (error);
+}
+
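For reference, the syscall implemented above is reached from userland through the ordinary libc stub. A minimal caller sketch, assuming the classic single-argument prototype declared in <unistd.h> (see swapon(2) for the exact release) and a hypothetical device path:

#include <stdio.h>
#include <unistd.h>     /* swapon()/swapoff() prototypes on FreeBSD */

int
main(int argc, char **argv)
{
        /* Hypothetical default; pass the real swap partition as argv[1]. */
        const char *special = argc > 1 ? argv[1] : "/dev/da0s1b";

        if (swapon(special) != 0) {
                perror("swapon");       /* e.g. EBUSY if already swapping here */
                return (1);
        }
        printf("swapping enabled on %s\n", special);
        return (0);
}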
+/*
+ * Swfree(index) frees the index'th portion of the swap map.
+ * Each of the nswdev devices provides 1/nswdev'th of the swap
+ * space, which is laid out with blocks of dmmax pages circularly
+ * among the devices.
+ *
+ * The new swap code uses page-sized blocks.  The old swap code used
+ * DEV_BSIZE'd chunks.
+ */
+int
+swaponvp(td, vp, dev, nblks)
+        struct thread *td;
+        struct vnode *vp;
+        dev_t dev;
+        u_long nblks;
+{
+        int index;
+        struct swdevt *sp;
+        swblk_t vsbase;
+        long blk;
+        swblk_t dvbase;
+        int error;
+        u_long aligned_nblks;
+        off_t mediasize;
+
+        if (!swapdev_vp) {
+                error = getnewvnode("none", NULL, swapdev_vnodeop_p,
+                    &swapdev_vp);
+                if (error)
+                        panic("Cannot get vnode for swapdev");
+                swapdev_vp->v_type = VNON;      /* Untyped */
+        }
+
+        ASSERT_VOP_UNLOCKED(vp, "swaponvp");
+        for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {
+                if (sp->sw_vp == vp)
+                        return EBUSY;
+                if (!sp->sw_vp)
+                        goto found;
+
+        }
+        return EINVAL;
+    found:
+        (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+#ifdef MAC
+        error = mac_check_system_swapon(td->td_ucred, vp);
+        if (error == 0)
+#endif
+                error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td);
+        (void) VOP_UNLOCK(vp, 0, td);
+        if (error)
+                return (error);
+
+        if (nblks == 0) {
+                error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
+                    FREAD, td->td_ucred, td);
+                if (error == 0)
+                        nblks = mediasize / DEV_BSIZE;
+        }
+        /*
+         * XXX: We should also check that the sectorsize makes sense
+         * XXX: it should be a power of two, no larger than the page size.
+         */
+        if (nblks == 0) {
+                (void) VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+                return (ENXIO);
+        }
+
+        /*
+         * If we go beyond this, we get overflows in the radix
+         * tree bitmap code.
+         */
+        if (nblks > 0x40000000 / BLIST_META_RADIX / nswdev) {
+                printf("exceeded maximum of %d blocks per swap unit\n",
+                        0x40000000 / BLIST_META_RADIX / nswdev);
+                (void) VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+                return (ENXIO);
+        }
+        /*
+         * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
+         * First chop nblks off to page-align it, then convert.
+         *
+         * sw->sw_nblks is in page-sized chunks now too.
+         */
+        nblks &= ~(ctodb(1) - 1);
+        nblks = dbtoc(nblks);
+
+        sp->sw_vp = vp;
+        sp->sw_dev = dev2udev(dev);
+        sp->sw_device = dev;
+        sp->sw_flags = SW_FREED;
+        sp->sw_nblks = nblks;
+        sp->sw_used = 0;
+
+        /*
+         * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not
+         * DEV_BSIZE'd.  aligned_nblks is used to calculate the
+         * size of the swap bitmap, taking into account the stripe size.
+         */
+        aligned_nblks = (nblks + (dmmax -1)) & ~(u_long)(dmmax -1);
+
+        if (aligned_nblks * nswdev > nswap)
+                nswap = aligned_nblks * nswdev;
+
+        if (swapblist == NULL)
+                swapblist = blist_create(nswap);
+        else
+                blist_resize(&swapblist, nswap, 0);
+
+        for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
+                blk = min(nblks - dvbase, dmmax);
+                vsbase = index * dmmax + dvbase * nswdev;
+                blist_free(swapblist, vsbase, blk);
+                vm_swap_size += blk;
+        }
+
+        swap_pager_full = 0;
+
+        return (0);
+}
+
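The size arithmetic in swaponvp() is easy to check in isolation: nblks arrives in DEV_BSIZE sectors, gets chopped down to a whole number of pages, is converted to page-sized blocks, and is then rounded up to the dmmax stripe when sizing the bitmap. A small standalone model of those steps, with assumed values for PAGE_SIZE, DEV_BSIZE and dmmax standing in for the kernel's ctodb()/dbtoc() macros:

#include <stdio.h>

#define DEV_BSIZE       512UL
#define PAGE_SIZE       4096UL                          /* assumed 4 KB pages */
#define CTODB(x)        ((x) * (PAGE_SIZE / DEV_BSIZE)) /* pages -> sectors */
#define DBTOC(x)        ((x) / (PAGE_SIZE / DEV_BSIZE)) /* sectors -> pages */
#define DMMAX           32UL                            /* stripe, in pages; assumed */

int
main(void)
{
        unsigned long nblks = 1048573;  /* sectors reported by the device */
        unsigned long aligned_nblks;

        nblks &= ~(CTODB(1) - 1);       /* chop to a page boundary */
        nblks = DBTOC(nblks);           /* now counted in pages */
        aligned_nblks = (nblks + DMMAX - 1) & ~(DMMAX - 1); /* round up to stripe */

        printf("pages = %lu, bitmap sized for %lu pages per device\n",
            nblks, aligned_nblks);
        return (0);
}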
+/*
+ * SYSCALL: swapoff(devname)
+ *
+ * Disable swapping on the given device.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct swapoff_args {
+        char *name;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+swapoff(td, uap)
+        struct thread *td;
+        struct swapoff_args *uap;
+{
+        struct vnode *vp;
+        struct nameidata nd;
+        struct swdevt *sp;
+        swblk_t dvbase, vsbase;
+        u_long nblks, aligned_nblks, blk;
+        int error, index;
+
+        mtx_lock(&Giant);
+
+        error = suser(td);
+        if (error)
+                goto done2;
+
+        while (swdev_syscall_active)
+            tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
+        swdev_syscall_active = 1;
+
+        NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
+        error = namei(&nd);
+        if (error)
+                goto done;
+        NDFREE(&nd, NDF_ONLY_PNBUF);
+        vp = nd.ni_vp;
+
+        for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {
+                if (sp->sw_vp == vp)
+                        goto found;
+        }
+        error = EINVAL;
+        goto done;
+found:
+#ifdef MAC
+        (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+        error = mac_check_system_swapoff(td->td_ucred, vp);
+        (void) VOP_UNLOCK(vp, 0, td);
+        if (error != 0)
+                goto done;
+#endif
+
+        nblks = sp->sw_nblks;
+
+        /*
+         * We can turn off this swap device safely only if the
+         * available virtual memory in the system will fit the amount
+         * of data we will have to page back in, plus an epsilon so
+         * the system doesn't become critically low on swap space.
+         */
+        if (cnt.v_free_count + cnt.v_cache_count + vm_swap_size <
+            nblks + nswap_lowat) {
+                error = ENOMEM;
+                goto done;
+        }
+
+        /*
+         * Prevent further allocations on this device.
+         */
+        sp->sw_flags |= SW_CLOSING;
+        for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
+                blk = min(nblks - dvbase, dmmax);
+                vsbase = index * dmmax + dvbase * nswdev;
+                vm_swap_size -= blist_fill(swapblist, vsbase, blk);
+        }
+
+        /*
+         * Page in the contents of the device and close it.
+         */
+#ifndef NO_SWAPPING
+        vm_proc_swapin_all(index);
+#endif /* !NO_SWAPPING */
+        swap_pager_swapoff(index, &sp->sw_used);
+
+        VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+        vrele(vp);
+        sp->sw_vp = NULL;
+
+        /*
+         * Resize the bitmap based on the new largest swap device,
+         * or free the bitmap if there are no more devices.
+         */
+        for (sp = swdevt, nblks = 0; sp < swdevt + nswdev; sp++) {
+                if (sp->sw_vp == NULL)
+                        continue;
+                nblks = max(nblks, sp->sw_nblks);
+        }
+
+        aligned_nblks = (nblks + (dmmax -1)) & ~(u_long)(dmmax -1);
+        nswap = aligned_nblks * nswdev;
+
+        if (nswap == 0) {
+                blist_destroy(swapblist);
+                swapblist = NULL;
+                vrele(swapdev_vp);
+                swapdev_vp = NULL;
+        } else
+                blist_resize(&swapblist, nswap, 0);
+
+done:
+        swdev_syscall_active = 0;
+        wakeup_one(&swdev_syscall_active);
+done2:
+        mtx_unlock(&Giant);
+        return (error);
+}
+
+static int
+sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
+{
+        int     *name = (int *)arg1;
+        int     error, i, n;
+        struct xswdev xs;
+        struct swdevt *sp;
+
+        if (arg2 != 1) /* name length */
+                return (EINVAL);
+
+        for (sp = swdevt, i = 0, n = 0 ; i < nswdev; i++, sp++) {
+                if (sp->sw_vp) {
+                        if (n == *name) {
+                                xs.xsw_version = XSWDEV_VERSION;
+                                xs.xsw_dev = sp->sw_dev;
+                                xs.xsw_flags = sp->sw_flags;
+                                xs.xsw_nblks = sp->sw_nblks;
+                                xs.xsw_used = sp->sw_used;
+
+                                error = SYSCTL_OUT(req, &xs, sizeof(xs));
+                                return (error);
+                        }
+                        n++;
+                }
+
+        }
+        return (ENOENT);
+}
+
+SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswdev, 0,
+    "Number of swap devices");
+SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
+    "Swap statistics by device");
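The vm.swap_info node added above is what per-device swap reporting builds on from userland: resolve the node with sysctlnametomib(3), append one extra name (the device index, matching the arg2 != 1 check in the handler), and read struct xswdev records until the handler returns ENOENT. A sketch of that consumer, assuming struct xswdev and XSWDEV_VERSION are reachable via <vm/vm_param.h> as on FreeBSD:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <vm/vm_param.h>        /* struct xswdev, XSWDEV_VERSION */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        int mib[16];
        size_t miblen = sizeof(mib) / sizeof(mib[0]) - 1;
        struct xswdev xsw;
        size_t len;
        int dev;

        if (sysctlnametomib("vm.swap_info", mib, &miblen) != 0) {
                perror("sysctlnametomib");
                return (1);
        }
        for (dev = 0; ; dev++) {
                mib[miblen] = dev;      /* one extra name: the device index */
                len = sizeof(xsw);
                if (sysctl(mib, (u_int)(miblen + 1), &xsw, &len, NULL, 0) != 0) {
                        if (errno == ENOENT)
                                break;  /* ran past the last configured device */
                        perror("sysctl");
                        return (1);
                }
                printf("swap device %d: %ju blocks, %ju used\n",
                    dev, (uintmax_t)xsw.xsw_nblks, (uintmax_t)xsw.xsw_used);
        }
        return (0);
}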
