author     Kirk McKusick <mckusick@FreeBSD.org>   1999-07-08 06:06:00 +0000
committer  Kirk McKusick <mckusick@FreeBSD.org>   1999-07-08 06:06:00 +0000
commit     ad8ac923fa48ba09c5aa52201f3e1e5a81e30eeb (patch)
tree       2284f4f780430039d2d5e44edfd2407ff2a99ebc
parent     bedf427650aa25ac43551bad76d88792626ddd49 (diff)
-rw-r--r--  sys/alpha/alpha/machdep.c   |   4
-rw-r--r--  sys/amd64/amd64/machdep.c   |   4
-rw-r--r--  sys/amd64/amd64/pmap.c      |   7
-rw-r--r--  sys/i386/i386/machdep.c     |   4
-rw-r--r--  sys/i386/i386/pmap.c        |   7
-rw-r--r--  sys/kern/kern_subr.c        |  11
-rw-r--r--  sys/kern/vfs_bio.c          | 381
-rw-r--r--  sys/kern/vfs_cluster.c      |  17
-rw-r--r--  sys/kern/vfs_export.c       |   4
-rw-r--r--  sys/kern/vfs_subr.c         |   4
-rw-r--r--  sys/kern/vfs_vnops.c        |   8
-rw-r--r--  sys/sys/bio.h               |  18
-rw-r--r--  sys/sys/buf.h               |  18
-rw-r--r--  sys/ufs/ufs/ufs_readwrite.c |  12

14 files changed, 267 insertions(+), 232 deletions(-)
diff --git a/sys/alpha/alpha/machdep.c b/sys/alpha/alpha/machdep.c index 399e60f604de..5ab0d3f2fe6b 100644 --- a/sys/alpha/alpha/machdep.c +++ b/sys/alpha/alpha/machdep.c @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: machdep.c,v 1.46 1999/07/05 08:52:40 msmith Exp $ + * $Id: machdep.c,v 1.47 1999/07/06 17:48:16 peter Exp $ */ /*- * Copyright (c) 1998 The NetBSD Foundation, Inc. @@ -354,7 +354,7 @@ again: valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); - + v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 616e7e5a7449..ec11c9827bd1 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.352 1999/07/05 08:52:49 msmith Exp $ + * $Id: machdep.c,v 1.353 1999/07/06 07:13:33 cracauer Exp $ */ #include "apm.h" @@ -355,7 +355,7 @@ again: valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); - + v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 82ee99efc326..2b5c4fb81478 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -39,7 +39,7 @@ * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.239 1999/06/08 17:14:22 dt Exp $ + * $Id: pmap.c,v 1.240 1999/06/23 21:47:21 luoqi Exp $ */ /* @@ -3105,9 +3105,10 @@ pmap_changebit(pa, bit, setem) /* * pmap_clearbit: * - * Clear a bit/bits in every pte mapping a given physical page. + * Clear a bit/bits in every pte mapping a given physical page. Making + * this inline allows the pmap_changebit inline to be well optimized. */ -static void +static __inline void pmap_clearbit( vm_offset_t pa, int bit) diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 616e7e5a7449..ec11c9827bd1 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.352 1999/07/05 08:52:49 msmith Exp $ + * $Id: machdep.c,v 1.353 1999/07/06 07:13:33 cracauer Exp $ */ #include "apm.h" @@ -355,7 +355,7 @@ again: valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); - + v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 82ee99efc326..2b5c4fb81478 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -39,7 +39,7 @@ * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.239 1999/06/08 17:14:22 dt Exp $ + * $Id: pmap.c,v 1.240 1999/06/23 21:47:21 luoqi Exp $ */ /* @@ -3105,9 +3105,10 @@ pmap_changebit(pa, bit, setem) /* * pmap_clearbit: * - * Clear a bit/bits in every pte mapping a given physical page. + * Clear a bit/bits in every pte mapping a given physical page. Making + * this inline allows the pmap_changebit inline to be well optimized. */ -static void +static __inline void pmap_clearbit( vm_offset_t pa, int bit) diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index 13faec6a18d3..73d7d2c8a092 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. 
* * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 - * $Id: kern_subr.c,v 1.27 1999/02/22 18:39:49 bde Exp $ + * $Id: kern_subr.c,v 1.28 1999/03/12 03:09:29 julian Exp $ */ #include <sys/param.h> @@ -156,6 +156,7 @@ uiomoveco(cp, n, uio, obj) if (ticks - switchticks >= hogticks) uio_yield(); if (uio->uio_rw == UIO_READ) { +#ifdef ENABLE_VFS_IOOPT if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) && ((uio->uio_offset & PAGE_MASK) == 0) && @@ -163,7 +164,9 @@ uiomoveco(cp, n, uio, obj) error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, uio->uio_offset, cnt, (vm_offset_t) iov->iov_base, NULL); - } else { + } else +#endif + { error = copyout(cp, iov->iov_base, cnt); } } else { @@ -192,6 +195,8 @@ uiomoveco(cp, n, uio, obj) return (0); } +#ifdef ENABLE_VFS_IOOPT + int uioread(n, uio, obj, nread) int n; @@ -258,6 +263,8 @@ uioread(n, uio, obj, nread) return error; } +#endif + /* * Give next character to user as result of read. */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 5c478c6ccb4a..47e82761db8c 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -11,7 +11,7 @@ * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.219 1999/06/29 05:59:41 peter Exp $ + * $Id: vfs_bio.c,v 1.220 1999/07/04 00:25:27 mckusick Exp $ */ /* @@ -90,14 +90,11 @@ static int bufspace, maxbufspace, vmiospace, #if 0 static int maxvmiobufspace; #endif +static int maxbdrun; static int needsbuffer; static int numdirtybuffers, lodirtybuffers, hidirtybuffers; static int numfreebuffers, lofreebuffers, hifreebuffers; static int getnewbufcalls; -static int getnewbufloops; -static int getnewbufloops1; -static int getnewbufloops2; -static int getnewbufloops3; static int getnewbufrestarts; static int kvafreespace; @@ -121,6 +118,8 @@ SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW, + &maxbdrun, 0, ""); #if 0 SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW, &maxvmiobufspace, 0, ""); @@ -135,18 +134,12 @@ SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD, &kvafreespace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, ""); -SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops, CTLFLAG_RW, - &getnewbufloops, 0, ""); -SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops1, CTLFLAG_RW, - &getnewbufloops1, 0, ""); -SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops2, CTLFLAG_RW, - &getnewbufloops2, 0, ""); -SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops3, CTLFLAG_RW, - &getnewbufloops3, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, ""); -static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; + +static int bufhashmask; +static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } }; char *buf_wmesg = BUF_WMESG; @@ -155,12 +148,24 @@ extern int vm_swap_size; #define BUF_MAXUSE 24 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ -#define VFS_BIO_NEED_RESERVED02 0x02 /* unused */ +#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */ /* + * Buffer hash table code. 
Note that the logical block scans linearly, which + * gives us some L1 cache locality. + */ + +static __inline +struct bufhashhdr * +bufhash(struct vnode *vnp, daddr_t bn) +{ + return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]); +} + +/* * kvaspacewakeup: * * Called when kva space is potential available for recovery or when @@ -185,6 +190,24 @@ kvaspacewakeup(void) } /* + * numdirtywakeup: + * + * If someone is blocked due to there being too many dirty buffers, + * and numdirtybuffers is now reasonable, wake them up. + */ + +static __inline void +numdirtywakeup(void) +{ + if (numdirtybuffers < hidirtybuffers) { + if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { + needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; + wakeup(&needsbuffer); + } + } +} + +/* * bufspacewakeup: * * Called when buffer space is potentially available for recovery or when @@ -260,10 +283,23 @@ bd_wakeup(int dirtybuflevel) /* - * Initialize buffer headers and related structures. + * Initialize buffer headers and related structures. */ + +vm_offset_t +bufhashinit(vm_offset_t vaddr) +{ + /* first, make a null hash table */ + for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1) + ; + bufhashtbl = (void *)vaddr; + vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask; + --bufhashmask; + return(vaddr); +} + void -bufinit() +bufinit(void) { struct buf *bp; int i; @@ -272,8 +308,7 @@ bufinit() LIST_INIT(&invalhash); simple_lock_init(&buftimelock); - /* first, make a null hash table */ - for (i = 0; i < BUFHSZ; i++) + for (i = 0; i <= bufhashmask; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ @@ -329,8 +364,8 @@ bufinit() * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ - lodirtybuffers = nbuf / 6 + 10; - hidirtybuffers = nbuf / 3 + 20; + lodirtybuffers = nbuf / 7 + 10; + hidirtybuffers = nbuf / 4 + 20; numdirtybuffers = 0; /* @@ -341,6 +376,15 @@ bufinit() hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; +/* + * Maximum number of async ops initiated per buf_daemon loop. This is + * somewhat of a hack at the moment, we really need to limit ourselves + * based on the number of bytes of I/O in-transit that were initiated + * from buf_daemon. + */ + if ((maxbdrun = nswbuf / 4) < 4) + maxbdrun = 4; + kvafreespace = 0; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); @@ -383,19 +427,14 @@ bremfree(struct buf * bp) if (bp->b_qindex == QUEUE_EMPTYKVA) { kvafreespace -= bp->b_kvasize; } - if (BUF_REFCNT(bp) == 1) - TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); - else if (BUF_REFCNT(bp) == 0) - panic("bremfree: not locked"); - else - /* Temporary panic to verify exclusive locking */ - /* This panic goes away when we allow shared refs */ - panic("bremfree: multiple refs"); + KASSERT(BUF_REFCNT(bp) == 0, ("bremfree: bp %p not locked",bp)); + TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; runningbufspace += bp->b_bufsize; } else { #if !defined(MAX_PERF) - panic("bremfree: removing a buffer when not on a queue"); + if (BUF_REFCNT(bp) <= 1) + panic("bremfree: removing a buffer not on a queue"); #endif } @@ -599,7 +638,9 @@ bwrite(struct buf * bp) void bdwrite(struct buf * bp) { +#if 0 struct vnode *vp; +#endif #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) @@ -654,6 +695,11 @@ bdwrite(struct buf * bp) bd_wakeup(hidirtybuffers); /* + * note: we cannot initiate I/O from a bdwrite even if we wanted to, + * due to the softdep code. 
+ */ +#if 0 + /* * XXX The soft dependency code is not prepared to * have I/O done when a bdwrite is requested. For * now we just let the write be delayed if it is @@ -664,6 +710,7 @@ bdwrite(struct buf * bp) (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))) return; +#endif } /* @@ -722,6 +769,7 @@ bundirty(bp) bp->b_flags &= ~B_DELWRI; reassignbuf(bp, bp->b_vp); --numdirtybuffers; + numdirtywakeup(); } } @@ -757,6 +805,34 @@ bowrite(struct buf * bp) } /* + * bwillwrite: + * + * Called prior to the locking of any vnodes when we are expecting to + * write. We do not want to starve the buffer cache with too many + * dirty buffers so we block here. By blocking prior to the locking + * of any vnodes we attempt to avoid the situation where a locked vnode + * prevents the various system daemons from flushing related buffers. + */ + +void +bwillwrite(void) +{ + int twenty = (hidirtybuffers - lodirtybuffers) / 5; + + if (numdirtybuffers > hidirtybuffers + twenty) { + int s; + + s = splbio(); + while (numdirtybuffers > hidirtybuffers) { + bd_wakeup(hidirtybuffers); + needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; + tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0); + } + splx(s); + } +} + +/* * brelse: * * Release a busy buffer and, if requested, free its resources. The @@ -799,8 +875,10 @@ brelse(struct buf * bp) bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); - if (bp->b_flags & B_DELWRI) + if (bp->b_flags & B_DELWRI) { --numdirtybuffers; + numdirtywakeup(); + } bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) @@ -991,6 +1069,7 @@ brelse(struct buf * bp) if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) { bp->b_flags &= ~B_DELWRI; --numdirtybuffers; + numdirtywakeup(); } runningbufspace -= bp->b_bufsize; @@ -1070,7 +1149,7 @@ bqrelse(struct buf * bp) /* * Something we can maybe wakeup */ - if (bp->b_bufsize) + if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); /* unlock */ @@ -1139,7 +1218,7 @@ gbincore(struct vnode * vp, daddr_t blkno) struct buf *bp; struct bufhashhdr *bh; - bh = BUFHASH(vp, blkno); + bh = bufhash(vp, blkno); bp = bh->lh_first; /* Search hash chain */ @@ -1155,14 +1234,18 @@ gbincore(struct vnode * vp, daddr_t blkno) } /* - * this routine implements clustered async writes for - * clearing out B_DELWRI buffers... This is much better - * than the old way of writing only one buffer at a time. + * vfs_bio_awrite: + * + * Implement clustered async writes for clearing out B_DELWRI buffers. + * This is much better then the old way of writing only one buffer at + * a time. Note that we may not be presented with the buffers in the + * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf * bp) { int i; + int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; @@ -1174,8 +1257,9 @@ vfs_bio_awrite(struct buf * bp) s = splbio(); /* - * right now we support clustered writing only to regular files, and - * then only if our I/O system is not saturated. + * right now we support clustered writing only to regular files. If + * we find a clusterable block we could be in the middle of a cluster + * rather then at the beginning. 
*/ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ @@ -1191,18 +1275,34 @@ vfs_bio_awrite(struct buf * bp) (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || - (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) + (bpa->b_blkno != + bp->b_blkno + ((i * size) >> DEV_BSHIFT))) break; } else { break; } } - ncl = i; + for (j = 1; i + j <= maxcl && j <= lblkno; j++) { + if ((bpa = gbincore(vp, lblkno - j)) && + BUF_REFCNT(bpa) == 0 && + ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != + bp->b_blkno - ((j * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + --j; + ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { - nwritten = cluster_wbuild(vp, size, lblkno, ncl); + nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); splx(s); return nwritten; } @@ -1240,21 +1340,12 @@ vfs_bio_awrite(struct buf * bp) * If we have to flush dirty buffers ( but we try to avoid this ) * * To avoid VFS layer recursion we do not flush dirty buffers ourselves. - * Instead we ask the pageout daemon to do it for us. We attempt to + * Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. */ - /* - * We fully expect to be able to handle any fragmentation and buffer - * space issues by freeing QUEUE_CLEAN buffers. If this fails, we - * have to wakeup the pageout daemon and ask it to flush some of our - * QUEUE_DIRTY buffers. We have to be careful to prevent a deadlock. - * XXX - */ - static struct buf * -getnewbuf(struct vnode *vp, daddr_t blkno, - int slpflag, int slptimeo, int size, int maxsize) +getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; @@ -1262,8 +1353,6 @@ getnewbuf(struct vnode *vp, daddr_t blkno, int outofspace; int nqindex; int defrag = 0; - static int newbufcnt = 0; - int lastnewbuf = newbufcnt; ++getnewbufcalls; --getnewbufrestarts; @@ -1338,13 +1427,9 @@ restart: * depending. */ - if (nbp) - --getnewbufloops; - while ((bp = nbp) != NULL) { int qindex = nqindex; - ++getnewbufloops; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). @@ -1372,7 +1457,6 @@ restart: /* * Sanity Checks */ - KASSERT(BUF_REFCNT(bp) == 0, ("getnewbuf: busy buffer %p on free list", bp)); KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* @@ -1388,14 +1472,10 @@ restart: * buffer isn't useful for fixing that problem we continue. */ - if (defrag > 0 && bp->b_kvasize == 0) { - ++getnewbufloops1; + if (defrag > 0 && bp->b_kvasize == 0) continue; - } - if (outofspace > 0 && bp->b_bufsize == 0) { - ++getnewbufloops2; + if (outofspace > 0 && bp->b_bufsize == 0) continue; - } /* * Start freeing the bp. This is somewhat involved. nbp @@ -1433,7 +1513,6 @@ restart: } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); - LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); @@ -1451,7 +1530,6 @@ restart: bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_usecount = 5; LIST_INIT(&bp->b_dep); @@ -1489,19 +1567,26 @@ restart: /* * If we exhausted our list, sleep as appropriate. We may have to - * wakeup the pageout daemon to write out some dirty buffers. + * wakeup various daemons and write out some dirty buffers. 
+ * + * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { int flags; + char *waitmsg; dosleep: - if (defrag > 0) + if (defrag > 0) { flags = VFS_BIO_NEED_KVASPACE; - else if (outofspace > 0) + waitmsg = "nbufkv"; + } else if (outofspace > 0) { + waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; - else + } else { + waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; + } /* XXX */ @@ -1509,7 +1594,7 @@ dosleep: needsbuffer |= flags; while (needsbuffer & flags) { if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, - "newbuf", slptimeo)) + waitmsg, slptimeo)) return (NULL); } } else { @@ -1553,42 +1638,7 @@ dosleep: } bp->b_data = bp->b_kvabase; } - - /* - * If we have slept at some point in this process and another - * process has managed to allocate a new buffer while we slept, - * we have to return NULL so that our caller can recheck to - * ensure that the other process did not create an identically - * identified buffer to the one we were requesting. We make this - * check by incrementing the static int newbufcnt each time we - * successfully allocate a new buffer. By saving the value of - * newbufcnt in our local lastnewbuf, we can compare newbufcnt - * with lastnewbuf to see if any other process managed to - * allocate a buffer while we were doing so ourselves. - * - * Note that bp, if valid, is locked. - */ - if (lastnewbuf == newbufcnt) { - /* - * No buffers allocated, so we can return one if we were - * successful, or continue trying if we were not successful. - */ - if (bp != NULL) { - newbufcnt += 1; - return (bp); - } - goto restart; - } - /* - * Another process allocated a buffer since we were called, so - * we have to free the one we allocated and return NULL to let - * our caller recheck to see if a new buffer is still needed. - */ - if (bp != NULL) { - bp->b_flags |= B_INVAL; - brelse(bp); - } - return (NULL); + return(bp); } /* @@ -1601,7 +1651,6 @@ static void waitfreebuffers(int slpflag, int slptimeo) { while (numfreebuffers < hifreebuffers) { - bd_wakeup(0); if (numfreebuffers >= hifreebuffers) break; needsbuffer |= VFS_BIO_NEED_FREE; @@ -1646,60 +1695,72 @@ buf_daemon() bd_request = 0; /* - * Do the flush. + * Do the flush. Limit the number of buffers we flush in one + * go. The failure condition occurs when processes are writing + * buffers faster then we can dispose of them. In this case + * we may be flushing so often that the previous set of flushes + * have not had time to complete, causing us to run out of + * physical buffers and block. */ { - while (numdirtybuffers > bd_flushto) { + int runcount = maxbdrun; + + while (numdirtybuffers > bd_flushto && runcount) { + --runcount; if (flushbufqueues() == 0) break; } } /* - * Whew. If nobody is requesting anything we sleep until the - * next event. If we sleep and the sleep times out and - * nobody is waiting for interesting things we back-off. - * Otherwise we get more aggressive. + * If nobody is requesting anything we sleep */ + if (bd_request == 0) + tsleep(&bd_request, PVM, "psleep", bd_interval); - if (bd_request == 0 && - tsleep(&bd_request, PVM, "psleep", bd_interval) && - needsbuffer == 0) { - /* - * timed out and nothing serious going on, - * increase the flushto high water mark to reduce - * the flush rate. - */ - bd_flushto += 10; - } else { - /* - * We were woken up or hit a serious wall that needs - * to be addressed. 
- */ - bd_flushto -= 10; - if (needsbuffer) { - int middb = (lodirtybuffers+hidirtybuffers)/2; - bd_interval >>= 1; - if (bd_flushto > middb) - bd_flushto = middb; - } + /* + * We calculate how much to add or subtract from bd_flushto + * and bd_interval based on how far off we are from the + * optimal number of dirty buffers, which is 20% below the + * hidirtybuffers mark. We cannot use hidirtybuffers straight + * because being right on the mark will cause getnewbuf() + * to oscillate our wakeup. + * + * The larger the error in either direction, the more we adjust + * bd_flushto and bd_interval. The time interval is adjusted + * by 2 seconds per whole-buffer-range of error. This is an + * exponential convergence algorithm, with large errors + * producing large changes and small errors producing small + * changes. + */ + + { + int brange = hidirtybuffers - lodirtybuffers; + int middb = hidirtybuffers - brange / 5; + int deltabuf = middb - numdirtybuffers; + + bd_flushto += deltabuf / 20; + bd_interval += deltabuf * (2 * hz) / (brange * 1); } - if (bd_flushto < lodirtybuffers) { + if (bd_flushto < lodirtybuffers) bd_flushto = lodirtybuffers; - bd_interval -= hz / 10; - } - if (bd_flushto > hidirtybuffers) { + if (bd_flushto > hidirtybuffers) bd_flushto = hidirtybuffers; - bd_interval += hz / 10; - } if (bd_interval < hz / 10) bd_interval = hz / 10; - if (bd_interval > 5 * hz) bd_interval = 5 * hz; } } +/* + * flushbufqueues: + * + * Try to flush a buffer in the dirty queue. We must be careful to + * free up B_INVAL buffers instead of write them, which NFS is + * particularly sensitive to. + */ + static int flushbufqueues(void) { @@ -1709,15 +1770,6 @@ flushbufqueues(void) bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); while (bp) { - /* - * Try to free up B_INVAL delayed-write buffers rather then - * writing them out. Note also that NFS is somewhat sensitive - * to B_INVAL buffers so it is doubly important that we do - * this. - * - * We do not try to sync buffers whos vnodes are locked, we - * cannot afford to block in this process. - */ KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); if ((bp->b_flags & B_DELWRI) != 0) { if (bp->b_flags & B_INVAL) { @@ -1728,11 +1780,9 @@ flushbufqueues(void) ++r; break; } - if (!VOP_ISLOCKED(bp->b_vp)) { - vfs_bio_awrite(bp); - ++r; - break; - } + vfs_bio_awrite(bp); + ++r; + break; } bp = TAILQ_NEXT(bp, b_freelist); } @@ -1957,8 +2007,6 @@ loop: */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { - if (bp->b_usecount < BUF_MAXUSE) - ++bp->b_usecount; if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "getblk", slpflag, slptimeo) == ENOLCK) goto loop; @@ -2036,8 +2084,6 @@ loop: goto loop; } - if (bp->b_usecount < BUF_MAXUSE) - ++bp->b_usecount; splx(s); bp->b_flags &= ~B_DONE; } else { @@ -2063,8 +2109,7 @@ loop: maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); - if ((bp = getnewbuf(vp, blkno, - slpflag, slptimeo, size, maxsize)) == NULL) { + if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; @@ -2079,7 +2124,8 @@ loop: * If the buffer is created out from under us, we have to * throw away the one we just created. There is now window * race because we are safely running at splbio() from the - * point of the duplicate buffer creation through to here. + * point of the duplicate buffer creation through to here, + * and we've locked the buffer. 
*/ if (gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; @@ -2096,7 +2142,7 @@ loop: bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); - bh = BUFHASH(vp, blkno); + bh = bufhash(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); /* @@ -2135,7 +2181,7 @@ geteblk(int size) int s; s = splbio(); - while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0); + while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0); splx(s); allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ @@ -2218,7 +2264,8 @@ allocbuf(struct buf *bp, int size) #if !defined(NO_B_MALLOC) /* * We only use malloced memory on the first allocation. - * and revert to page-allocated memory when the buffer grows. + * and revert to page-allocated memory when the buffer + * grows. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 9a9eb60ebad0..f6fc890b3dca 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 - * $Id: vfs_cluster.c,v 1.85 1999/06/29 05:59:43 peter Exp $ + * $Id: vfs_cluster.c,v 1.86 1999/07/04 00:31:17 mckusick Exp $ */ #include "opt_debug_cluster.h" @@ -150,21 +150,12 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) } /* - * Set another read-ahead mark so we know to check - * again. + * Set another read-ahead mark so we know + * to check again. */ if (((i % racluster) == (racluster - 1)) || (i == (maxra - 1))) tbp->b_flags |= B_RAM; - -#if 0 - if ((tbp->b_usecount < 1) && - BUF_REFCNT(tbp) == 0 && - (tbp->b_qindex == QUEUE_LRU)) { - TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist); - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist); - } -#endif } splx(s); if (i >= maxra) { @@ -586,7 +577,7 @@ cluster_write(bp, filesize) if (((u_quad_t) bp->b_offset + lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async) - cluster_wbuild(vp, lblocksize, + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize); } else { struct buf **bpp, **endbp; diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index efca6c8a1578..4ef741c8e367 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $ + * $Id: vfs_subr.c,v 1.206 1999/07/04 00:25:29 mckusick Exp $ */ /* @@ -115,8 +115,8 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, static int reassignbufmethod = 1; SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); -int vfs_ioopt = 0; #ifdef ENABLE_VFS_IOOPT +int vfs_ioopt = 0; SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index efca6c8a1578..4ef741c8e367 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $ + * $Id: vfs_subr.c,v 1.206 1999/07/04 00:25:29 mckusick Exp $ */ /* @@ -115,8 +115,8 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, static int reassignbufmethod = 1; SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); -int vfs_ioopt = 0; #ifdef ENABLE_VFS_IOOPT +int vfs_ioopt = 0; SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 8a520d335594..87cdac2f4858 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 - * $Id: vfs_vnops.c,v 1.68 1999/04/28 11:37:12 phk Exp $ + * $Id: vfs_vnops.c,v 1.69 1999/07/02 16:29:15 phk Exp $ */ #include <sys/param.h> @@ -334,10 +334,14 @@ vn_write(fp, uio, cred, flags) struct ucred *cred; int flags; { - struct vnode *vp = (struct vnode *)fp->f_data; + struct vnode *vp; struct proc *p = uio->uio_procp; int error, ioflag; + vp = (struct vnode *)fp->f_data; + if (vp->v_type == VREG) + bwillwrite(); + vp = (struct vnode *)fp->f_data; /* XXX needed? */ ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; diff --git a/sys/sys/bio.h b/sys/sys/bio.h index e6d23d86d9cc..87043e30b95b 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $ + * $Id: buf.h,v 1.75 1999/07/04 00:25:32 mckusick Exp $ */ #ifndef _SYS_BUF_H_ @@ -100,7 +100,7 @@ struct buf { TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ - unsigned char b_usecount; /* buffer use count */ + unsigned char b_unused1; /* unused field */ unsigned char b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ int b_error; /* Errno value. */ @@ -410,18 +410,6 @@ bufq_first(struct buf_queue_head *head) #endif /* KERNEL */ - -/* - * number of buffer hash entries - */ -#define BUFHSZ 512 - -/* - * buffer hash table calculation, originally by David Greenman - */ -#define BUFHASH(vnp, bn) \ - (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) - /* * Definitions for the buffer free lists. */ @@ -458,7 +446,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; struct uio; +vm_offset_t bufhashinit __P((vm_offset_t)); void bufinit __P((void)); +void bwillwrite __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index e6d23d86d9cc..87043e30b95b 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $ + * $Id: buf.h,v 1.75 1999/07/04 00:25:32 mckusick Exp $ */ #ifndef _SYS_BUF_H_ @@ -100,7 +100,7 @@ struct buf { TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ - unsigned char b_usecount; /* buffer use count */ + unsigned char b_unused1; /* unused field */ unsigned char b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ int b_error; /* Errno value. 
*/ @@ -410,18 +410,6 @@ bufq_first(struct buf_queue_head *head) #endif /* KERNEL */ - -/* - * number of buffer hash entries - */ -#define BUFHSZ 512 - -/* - * buffer hash table calculation, originally by David Greenman - */ -#define BUFHASH(vnp, bn) \ - (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) - /* * Definitions for the buffer free lists. */ @@ -458,7 +446,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; struct uio; +vm_offset_t bufhashinit __P((vm_offset_t)); void bufinit __P((void)); +void bwillwrite __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index d12a8020e2f9..53f980af8060 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 - * $Id: ufs_readwrite.c,v 1.57 1999/01/28 00:57:56 dillon Exp $ + * $Id: ufs_readwrite.c,v 1.58 1999/04/05 19:38:30 julian Exp $ */ #define BLKSIZE(a, b, c) blksize(a, b, c) @@ -106,7 +106,8 @@ READ(ap) if (object) vm_object_reference(object); -#if 1 + +#ifdef ENABLE_VFS_IOOPT /* * If IO optimisation is turned on, * and we are NOT a VM based IO request, @@ -150,7 +151,7 @@ READ(ap) for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; -#if 1 +#ifdef ENABLE_VFS_IOOPT if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) { /* * Obviously we didn't finish above, but we @@ -276,6 +277,7 @@ READ(ap) xfersize = size; } +#ifdef ENABLE_VFS_IOOPT if (vfs_ioopt && object && (bp->b_flags & B_VMIO) && ((blkoffset & PAGE_MASK) == 0) && @@ -289,7 +291,9 @@ READ(ap) error = uiomoveco((char *)bp->b_data + blkoffset, (int)xfersize, uio, object); - } else { + } else +#endif + { /* * otherwise use the general form */ |
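
The bufhashinit()/bufhash() hunks in vfs_bio.c above replace the fixed 512-entry BUFHASH table with a table sized at boot to roughly nbuf/4 buckets, rounded up to a power of two so bucket selection is a mask rather than a modulo. A minimal userland sketch of that sizing and masking logic, with the kernel types (vm_offset_t, struct vnode, daddr_t) replaced by plain integers and the numbers purely illustrative:

        /*
         * Sketch of the hash-sizing logic added by this commit: pick the
         * smallest power of two >= nbuf / 4 (but at least 8), then use
         * size - 1 as a mask so bucket selection is a cheap AND instead
         * of a modulo.  Kernel types replaced with plain integers.
         */
        #include <stdio.h>
        #include <stdint.h>

        static unsigned bufhashmask;

        /* Mirror of bufhashinit(): returns the number of buckets. */
        static unsigned
        sketch_bufhashinit(int nbuf)
        {
                unsigned size;

                for (size = 8; size < (unsigned)nbuf / 4; size <<= 1)
                        ;
                bufhashmask = size - 1;
                return (size);
        }

        /* Mirror of bufhash(): mix the vnode address and block number. */
        static unsigned
        sketch_bufhash(uintptr_t vnode_addr, long blkno)
        {
                return ((unsigned)(((vnode_addr >> 7) +
                    (unsigned long)blkno) & bufhashmask));
        }

        int
        main(void)
        {
                unsigned nbuckets = sketch_bufhashinit(4096); /* nbuf = 4096 */

                printf("buckets: %u, mask: 0x%x\n", nbuckets, bufhashmask);
                /* Adjacent logical blocks land in adjacent buckets. */
                printf("bucket(blk 10): %u\n", sketch_bufhash(0xc0011000, 10));
                printf("bucket(blk 11): %u\n", sketch_bufhash(0xc0011000, 11));
                return (0);
        }

Because the block number is added in after the vnode address is mixed, buffers for consecutive blocks of the same vnode land in consecutive buckets, which is the L1-cache-locality point made in the comment in the diff.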
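
The new bwillwrite()/numdirtywakeup() pair throttles writers before they lock any vnodes: a writer blocks once the dirty-buffer count climbs 20% of the lo/hi range past hidirtybuffers, and is released only once the flusher has pushed the count back below hidirtybuffers. A small sketch of just those thresholds (the tsleep()/wakeup() machinery is omitted and the buffer counts are invented):

        /*
         * Userland sketch of the bwillwrite()/numdirtywakeup() hysteresis
         * added in vfs_bio.c.  Only the threshold arithmetic is modeled.
         */
        #include <stdio.h>

        static int lodirtybuffers = 100;
        static int hidirtybuffers = 200;

        /* Would bwillwrite() put this writer to sleep? */
        static int
        writer_must_block(int numdirtybuffers)
        {
                int twenty = (hidirtybuffers - lodirtybuffers) / 5;

                return (numdirtybuffers > hidirtybuffers + twenty);
        }

        /* Would numdirtywakeup() release sleeping writers? */
        static int
        flusher_releases_writers(int numdirtybuffers)
        {
                return (numdirtybuffers < hidirtybuffers);
        }

        int
        main(void)
        {
                int n;

                for (n = 180; n <= 260; n += 20)
                        printf("dirty=%d block=%d release=%d\n", n,
                            writer_must_block(n), flusher_releases_writers(n));
                return (0);
        }

The gap between the block threshold (hidirtybuffers plus 20% of the range) and the release threshold (hidirtybuffers) is what provides the hysteresis; without it, writers and the flush daemon would wake each other in lockstep.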
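
buf_daemon's old back-off heuristic is replaced by a proportional adjustment: bd_flushto and bd_interval are nudged toward a target of numdirtybuffers sitting 20% below hidirtybuffers, with larger errors producing larger corrections. A standalone sketch of that arithmetic, using an assumed hz of 100 and invented buffer counts:

        /*
         * Sketch of the buf_daemon feedback loop added in vfs_bio.c.
         * hz = 100 and the dirty-buffer numbers are illustrative only.
         */
        #include <stdio.h>

        int
        main(void)
        {
                int hz = 100;                   /* assumed tick rate */
                int lodirtybuffers = 100, hidirtybuffers = 200;
                int bd_flushto = hidirtybuffers, bd_interval = 5 * hz;
                int numdirtybuffers;

                for (numdirtybuffers = 240; numdirtybuffers >= 120;
                    numdirtybuffers -= 40) {
                        int brange = hidirtybuffers - lodirtybuffers;
                        int middb = hidirtybuffers - brange / 5; /* 20% below */
                        int deltabuf = middb - numdirtybuffers;

                        bd_flushto += deltabuf / 20;
                        bd_interval += deltabuf * (2 * hz) / brange;

                        /* Clamp exactly as the kernel does. */
                        if (bd_flushto < lodirtybuffers)
                                bd_flushto = lodirtybuffers;
                        if (bd_flushto > hidirtybuffers)
                                bd_flushto = hidirtybuffers;
                        if (bd_interval < hz / 10)
                                bd_interval = hz / 10;
                        if (bd_interval > 5 * hz)
                                bd_interval = 5 * hz;

                        printf("dirty=%d -> flushto=%d interval=%d ticks\n",
                            numdirtybuffers, bd_flushto, bd_interval);
                }
                return (0);
        }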
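
vfs_bio_awrite() now scans for a write cluster in both directions from the starting buffer, since the buffer it is handed may sit in the middle of a dirty run rather than at its head. The sketch below keeps only the index arithmetic: a boolean array stands in for the gbincore() lookups, and the B_DELWRI/B_CLUSTEROK and physical-block contiguity checks are omitted:

        /*
         * Sketch of the bidirectional cluster scan used by vfs_bio_awrite():
         * count contiguous clusterable blocks after (i) and before (j) the
         * starting block, then issue one write of i + j blocks starting at
         * lblkno - j.
         */
        #include <stdio.h>

        #define NBLOCKS 32

        static int clusterable[NBLOCKS];        /* 1 = dirty, clusterable */

        static void
        scan(int lblkno, int maxcl, int *startp, int *nclp)
        {
                int i, j;

                for (i = 1; i < maxcl && lblkno + i < NBLOCKS; i++)
                        if (!clusterable[lblkno + i])
                                break;
                for (j = 1; i + j <= maxcl && j <= lblkno; j++)
                        if (!clusterable[lblkno - j])
                                break;
                --j;
                *startp = lblkno - j;
                *nclp = i + j;
        }

        int
        main(void)
        {
                int start, ncl, b;

                for (b = 6; b <= 12; b++)       /* blocks 6..12 are dirty */
                        clusterable[b] = 1;
                scan(9, 16, &start, &ncl);      /* start in the middle */
                printf("cluster starts at %d, %d blocks\n", start, ncl);
                return (0);
        }

With blocks 6-12 dirty and the scan started at block 9, the result is a single 7-block cluster beginning at block 6, i.e. the same cluster is found no matter which member of the dirty run the caller happens to hold.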