author     Kirk McKusick <mckusick@FreeBSD.org>    1999-07-08 06:06:00 +0000
committer  Kirk McKusick <mckusick@FreeBSD.org>    1999-07-08 06:06:00 +0000
commit     ad8ac923fa48ba09c5aa52201f3e1e5a81e30eeb
tree       2284f4f780430039d2d5e44edfd2407ff2a99ebc
parent     bedf427650aa25ac43551bad76d88792626ddd49
-rw-r--r--   sys/alpha/alpha/machdep.c       4
-rw-r--r--   sys/amd64/amd64/machdep.c       4
-rw-r--r--   sys/amd64/amd64/pmap.c          7
-rw-r--r--   sys/i386/i386/machdep.c         4
-rw-r--r--   sys/i386/i386/pmap.c            7
-rw-r--r--   sys/kern/kern_subr.c           11
-rw-r--r--   sys/kern/vfs_bio.c            381
-rw-r--r--   sys/kern/vfs_cluster.c         17
-rw-r--r--   sys/kern/vfs_export.c           4
-rw-r--r--   sys/kern/vfs_subr.c             4
-rw-r--r--   sys/kern/vfs_vnops.c            8
-rw-r--r--   sys/sys/bio.h                  18
-rw-r--r--   sys/sys/buf.h                  18
-rw-r--r--   sys/ufs/ufs/ufs_readwrite.c    12
14 files changed, 267 insertions(+), 232 deletions(-)
diff --git a/sys/alpha/alpha/machdep.c b/sys/alpha/alpha/machdep.c
index 399e60f604de..5ab0d3f2fe6b 100644
--- a/sys/alpha/alpha/machdep.c
+++ b/sys/alpha/alpha/machdep.c
@@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: machdep.c,v 1.46 1999/07/05 08:52:40 msmith Exp $
+ * $Id: machdep.c,v 1.47 1999/07/06 17:48:16 peter Exp $
*/
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
@@ -354,7 +354,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
-
+ v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory
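
For context, here is a minimal userland sketch (not part of the patch) of what the new bufhashinit() call reserves during the kernel's first-pass sizing: the hash table is rounded up to a power of two of at least nbuf/4 entries, carved out of the bootstrap region, and the cursor is returned advanced past it. The logic mirrors the bufhashinit()/bufhash() definitions added to sys/kern/vfs_bio.c below; the nbuf value, the simplified list head, and main() are illustrative stand-ins only.

/*
 * Hedged sketch: mirrors the bufhashinit()/bufhash() logic added in
 * sys/kern/vfs_bio.c below, rebuilt as a standalone program.  The demo
 * value nbuf = 1024 and the simplified bufhashhdr are stand-ins.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct bufhashhdr { void *lh_first; };	/* stand-in for LIST_HEAD(, buf) */

static int nbuf = 1024;			/* assumed buffer count */
static int bufhashmask;
static struct bufhashhdr *bufhashtbl;

/* Round the table up to a power of two >= nbuf/4, carve it out of the
 * bootstrap region at vaddr, and return the advanced cursor. */
static uintptr_t
bufhashinit(uintptr_t vaddr)
{
	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
		;
	bufhashtbl = (struct bufhashhdr *)vaddr;
	vaddr += sizeof(*bufhashtbl) * bufhashmask;
	--bufhashmask;			/* size - 1 becomes the AND mask */
	return (vaddr);
}

/* Hash a (vnode pointer, logical block) pair; nearby blocks of the same
 * vnode land in nearby chains, which helps cache locality during scans. */
static struct bufhashhdr *
bufhash(void *vnp, long bn)
{
	return (&bufhashtbl[(((uintptr_t)vnp >> 7) + (int)bn) & bufhashmask]);
}

int
main(void)
{
	/* Simulate the bootstrap region with an ordinary (oversized) allocation. */
	uintptr_t base = (uintptr_t)malloc(sizeof(struct bufhashhdr) * nbuf);
	uintptr_t end = bufhashinit(base);

	printf("table entries: %d, bytes used: %zu\n",
	    bufhashmask + 1, (size_t)(end - base));
	printf("chain index for (vp=%p, bn=42): %td\n",
	    (void *)0x1000, bufhash((void *)0x1000, 42) - bufhashtbl);
	free((void *)base);
	return (0);
}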
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 616e7e5a7449..ec11c9827bd1 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
- * $Id: machdep.c,v 1.352 1999/07/05 08:52:49 msmith Exp $
+ * $Id: machdep.c,v 1.353 1999/07/06 07:13:33 cracauer Exp $
*/
#include "apm.h"
@@ -355,7 +355,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
-
+ v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 82ee99efc326..2b5c4fb81478 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -39,7 +39,7 @@
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
- * $Id: pmap.c,v 1.239 1999/06/08 17:14:22 dt Exp $
+ * $Id: pmap.c,v 1.240 1999/06/23 21:47:21 luoqi Exp $
*/
/*
@@ -3105,9 +3105,10 @@ pmap_changebit(pa, bit, setem)
/*
* pmap_clearbit:
*
- * Clear a bit/bits in every pte mapping a given physical page.
+ * Clear a bit/bits in every pte mapping a given physical page. Making
+ * this inline allows the pmap_changebit inline to be well optimized.
*/
-static void
+static __inline void
pmap_clearbit(
vm_offset_t pa,
int bit)
diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c
index 616e7e5a7449..ec11c9827bd1 100644
--- a/sys/i386/i386/machdep.c
+++ b/sys/i386/i386/machdep.c
@@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
- * $Id: machdep.c,v 1.352 1999/07/05 08:52:49 msmith Exp $
+ * $Id: machdep.c,v 1.353 1999/07/06 07:13:33 cracauer Exp $
*/
#include "apm.h"
@@ -355,7 +355,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
-
+ v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 82ee99efc326..2b5c4fb81478 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -39,7 +39,7 @@
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
- * $Id: pmap.c,v 1.239 1999/06/08 17:14:22 dt Exp $
+ * $Id: pmap.c,v 1.240 1999/06/23 21:47:21 luoqi Exp $
*/
/*
@@ -3105,9 +3105,10 @@ pmap_changebit(pa, bit, setem)
/*
* pmap_clearbit:
*
- * Clear a bit/bits in every pte mapping a given physical page.
+ * Clear a bit/bits in every pte mapping a given physical page. Making
+ * this inline allows the pmap_changebit inline to be well optimized.
*/
-static void
+static __inline void
pmap_clearbit(
vm_offset_t pa,
int bit)
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
index 13faec6a18d3..73d7d2c8a092 100644
--- a/sys/kern/kern_subr.c
+++ b/sys/kern/kern_subr.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
- * $Id: kern_subr.c,v 1.27 1999/02/22 18:39:49 bde Exp $
+ * $Id: kern_subr.c,v 1.28 1999/03/12 03:09:29 julian Exp $
*/
#include <sys/param.h>
@@ -156,6 +156,7 @@ uiomoveco(cp, n, uio, obj)
if (ticks - switchticks >= hogticks)
uio_yield();
if (uio->uio_rw == UIO_READ) {
+#ifdef ENABLE_VFS_IOOPT
if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) &&
((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
((uio->uio_offset & PAGE_MASK) == 0) &&
@@ -163,7 +164,9 @@ uiomoveco(cp, n, uio, obj)
error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
uio->uio_offset, cnt,
(vm_offset_t) iov->iov_base, NULL);
- } else {
+ } else
+#endif
+ {
error = copyout(cp, iov->iov_base, cnt);
}
} else {
@@ -192,6 +195,8 @@ uiomoveco(cp, n, uio, obj)
return (0);
}
+#ifdef ENABLE_VFS_IOOPT
+
int
uioread(n, uio, obj, nread)
int n;
@@ -258,6 +263,8 @@ uioread(n, uio, obj, nread)
return error;
}
+#endif
+
/*
* Give next character to user as result of read.
*/
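
The kern_subr.c change wraps the vm_uiomove() fast path in #ifdef ENABLE_VFS_IOOPT, so kernels built without the option always fall through to the plain copyout() path. A minimal sketch of that compile-time fallback shape follows; move_data() and fast_page_move() are hypothetical names, not kernel interfaces.

/*
 * Hedged sketch of the conditional-compilation pattern used in uiomoveco():
 * an optional page-aligned fast path compiled in only under
 * ENABLE_VFS_IOOPT, with an unconditional copy fallback.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_MASK	4095

#ifdef ENABLE_VFS_IOOPT
static int
fast_page_move(void *dst, const void *src, size_t len)
{
	/* Pretend this remaps whole pages instead of copying. */
	printf("page-flipping %zu bytes\n", len);
	memcpy(dst, src, len);
	return (0);
}
#endif

static int
move_data(void *dst, const void *src, size_t len, long offset)
{
#ifdef ENABLE_VFS_IOOPT
	/* The fast path only applies to fully page-aligned transfers. */
	if ((len & PAGE_MASK) == 0 &&
	    ((uintptr_t)dst & PAGE_MASK) == 0 &&
	    (offset & PAGE_MASK) == 0) {
		return (fast_page_move(dst, src, len));
	} else
#endif
	{
		memcpy(dst, src, len);	/* plain copy fallback */
		return (0);
	}
}

int
main(void)
{
	static char src[8192], dst[8192];

	return (move_data(dst, src, sizeof(dst), 0));
}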
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 5c478c6ccb4a..47e82761db8c 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
- * $Id: vfs_bio.c,v 1.219 1999/06/29 05:59:41 peter Exp $
+ * $Id: vfs_bio.c,v 1.220 1999/07/04 00:25:27 mckusick Exp $
*/
/*
@@ -90,14 +90,11 @@ static int bufspace, maxbufspace, vmiospace,
#if 0
static int maxvmiobufspace;
#endif
+static int maxbdrun;
static int needsbuffer;
static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
-static int getnewbufloops;
-static int getnewbufloops1;
-static int getnewbufloops2;
-static int getnewbufloops3;
static int getnewbufrestarts;
static int kvafreespace;
@@ -121,6 +118,8 @@ SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
&hibufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
+ &maxbdrun, 0, "");
#if 0
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
&maxvmiobufspace, 0, "");
@@ -135,18 +134,12 @@ SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
&kvafreespace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
&getnewbufcalls, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops, CTLFLAG_RW,
- &getnewbufloops, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops1, CTLFLAG_RW,
- &getnewbufloops1, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops2, CTLFLAG_RW,
- &getnewbufloops2, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops3, CTLFLAG_RW,
- &getnewbufloops3, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
&getnewbufrestarts, 0, "");
-static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
+
+static int bufhashmask;
+static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
char *buf_wmesg = BUF_WMESG;
@@ -155,12 +148,24 @@ extern int vm_swap_size;
#define BUF_MAXUSE 24
#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
-#define VFS_BIO_NEED_RESERVED02 0x02 /* unused */
+#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
#define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */
/*
+ * Buffer hash table code. Note that the logical block scans linearly, which
+ * gives us some L1 cache locality.
+ */
+
+static __inline
+struct bufhashhdr *
+bufhash(struct vnode *vnp, daddr_t bn)
+{
+ return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
+}
+
+/*
* kvaspacewakeup:
*
* Called when kva space is potential available for recovery or when
@@ -185,6 +190,24 @@ kvaspacewakeup(void)
}
/*
+ * numdirtywakeup:
+ *
+ * If someone is blocked due to there being too many dirty buffers,
+ * and numdirtybuffers is now reasonable, wake them up.
+ */
+
+static __inline void
+numdirtywakeup(void)
+{
+ if (numdirtybuffers < hidirtybuffers) {
+ if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
+ needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
+ wakeup(&needsbuffer);
+ }
+ }
+}
+
+/*
* bufspacewakeup:
*
* Called when buffer space is potentially available for recovery or when
@@ -260,10 +283,23 @@ bd_wakeup(int dirtybuflevel)
/*
- * Initialize buffer headers and related structures.
+ * Initialize buffer headers and related structures.
*/
+
+vm_offset_t
+bufhashinit(vm_offset_t vaddr)
+{
+ /* first, make a null hash table */
+ for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
+ ;
+ bufhashtbl = (void *)vaddr;
+ vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
+ --bufhashmask;
+ return(vaddr);
+}
+
void
-bufinit()
+bufinit(void)
{
struct buf *bp;
int i;
@@ -272,8 +308,7 @@ bufinit()
LIST_INIT(&invalhash);
simple_lock_init(&buftimelock);
- /* first, make a null hash table */
- for (i = 0; i < BUFHSZ; i++)
+ for (i = 0; i <= bufhashmask; i++)
LIST_INIT(&bufhashtbl[i]);
/* next, make a null set of free lists */
@@ -329,8 +364,8 @@ bufinit()
* Reduce the chance of a deadlock occuring by limiting the number
* of delayed-write dirty buffers we allow to stack up.
*/
- lodirtybuffers = nbuf / 6 + 10;
- hidirtybuffers = nbuf / 3 + 20;
+ lodirtybuffers = nbuf / 7 + 10;
+ hidirtybuffers = nbuf / 4 + 20;
numdirtybuffers = 0;
/*
@@ -341,6 +376,15 @@ bufinit()
hifreebuffers = 2 * lofreebuffers;
numfreebuffers = nbuf;
+/*
+ * Maximum number of async ops initiated per buf_daemon loop. This is
+ * somewhat of a hack at the moment, we really need to limit ourselves
+ * based on the number of bytes of I/O in-transit that were initiated
+ * from buf_daemon.
+ */
+ if ((maxbdrun = nswbuf / 4) < 4)
+ maxbdrun = 4;
+
kvafreespace = 0;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
@@ -383,19 +427,14 @@ bremfree(struct buf * bp)
if (bp->b_qindex == QUEUE_EMPTYKVA) {
kvafreespace -= bp->b_kvasize;
}
- if (BUF_REFCNT(bp) == 1)
- TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
- else if (BUF_REFCNT(bp) == 0)
- panic("bremfree: not locked");
- else
- /* Temporary panic to verify exclusive locking */
- /* This panic goes away when we allow shared refs */
- panic("bremfree: multiple refs");
+ KASSERT(BUF_REFCNT(bp) == 0, ("bremfree: bp %p not locked",bp));
+ TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
runningbufspace += bp->b_bufsize;
} else {
#if !defined(MAX_PERF)
- panic("bremfree: removing a buffer when not on a queue");
+ if (BUF_REFCNT(bp) <= 1)
+ panic("bremfree: removing a buffer not on a queue");
#endif
}
@@ -599,7 +638,9 @@ bwrite(struct buf * bp)
void
bdwrite(struct buf * bp)
{
+#if 0
struct vnode *vp;
+#endif
#if !defined(MAX_PERF)
if (BUF_REFCNT(bp) == 0)
@@ -654,6 +695,11 @@ bdwrite(struct buf * bp)
bd_wakeup(hidirtybuffers);
/*
+ * note: we cannot initiate I/O from a bdwrite even if we wanted to,
+ * due to the softdep code.
+ */
+#if 0
+ /*
* XXX The soft dependency code is not prepared to
* have I/O done when a bdwrite is requested. For
* now we just let the write be delayed if it is
@@ -664,6 +710,7 @@ bdwrite(struct buf * bp)
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
return;
+#endif
}
/*
@@ -722,6 +769,7 @@ bundirty(bp)
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp, bp->b_vp);
--numdirtybuffers;
+ numdirtywakeup();
}
}
@@ -757,6 +805,34 @@ bowrite(struct buf * bp)
}
/*
+ * bwillwrite:
+ *
+ * Called prior to the locking of any vnodes when we are expecting to
+ * write. We do not want to starve the buffer cache with too many
+ * dirty buffers so we block here. By blocking prior to the locking
+ * of any vnodes we attempt to avoid the situation where a locked vnode
+ * prevents the various system daemons from flushing related buffers.
+ */
+
+void
+bwillwrite(void)
+{
+ int twenty = (hidirtybuffers - lodirtybuffers) / 5;
+
+ if (numdirtybuffers > hidirtybuffers + twenty) {
+ int s;
+
+ s = splbio();
+ while (numdirtybuffers > hidirtybuffers) {
+ bd_wakeup(hidirtybuffers);
+ needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
+ tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
+ }
+ splx(s);
+ }
+}
+
+/*
* brelse:
*
* Release a busy buffer and, if requested, free its resources. The
@@ -799,8 +875,10 @@ brelse(struct buf * bp)
bp->b_flags |= B_INVAL;
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
- if (bp->b_flags & B_DELWRI)
+ if (bp->b_flags & B_DELWRI) {
--numdirtybuffers;
+ numdirtywakeup();
+ }
bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
if ((bp->b_flags & B_VMIO) == 0) {
if (bp->b_bufsize)
@@ -991,6 +1069,7 @@ brelse(struct buf * bp)
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
bp->b_flags &= ~B_DELWRI;
--numdirtybuffers;
+ numdirtywakeup();
}
runningbufspace -= bp->b_bufsize;
@@ -1070,7 +1149,7 @@ bqrelse(struct buf * bp)
/*
* Something we can maybe wakeup
*/
- if (bp->b_bufsize)
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
bufspacewakeup();
/* unlock */
@@ -1139,7 +1218,7 @@ gbincore(struct vnode * vp, daddr_t blkno)
struct buf *bp;
struct bufhashhdr *bh;
- bh = BUFHASH(vp, blkno);
+ bh = bufhash(vp, blkno);
bp = bh->lh_first;
/* Search hash chain */
@@ -1155,14 +1234,18 @@ gbincore(struct vnode * vp, daddr_t blkno)
}
/*
- * this routine implements clustered async writes for
- * clearing out B_DELWRI buffers... This is much better
- * than the old way of writing only one buffer at a time.
+ * vfs_bio_awrite:
+ *
+ * Implement clustered async writes for clearing out B_DELWRI buffers.
+ * This is much better then the old way of writing only one buffer at
+ * a time. Note that we may not be presented with the buffers in the
+ * correct order, so we search for the cluster in both directions.
*/
int
vfs_bio_awrite(struct buf * bp)
{
int i;
+ int j;
daddr_t lblkno = bp->b_lblkno;
struct vnode *vp = bp->b_vp;
int s;
@@ -1174,8 +1257,9 @@ vfs_bio_awrite(struct buf * bp)
s = splbio();
/*
- * right now we support clustered writing only to regular files, and
- * then only if our I/O system is not saturated.
+ * right now we support clustered writing only to regular files. If
+ * we find a clusterable block we could be in the middle of a cluster
+ * rather then at the beginning.
*/
if ((vp->v_type == VREG) &&
(vp->v_mount != 0) && /* Only on nodes that have the size info */
@@ -1191,18 +1275,34 @@ vfs_bio_awrite(struct buf * bp)
(B_DELWRI | B_CLUSTEROK)) &&
(bpa->b_bufsize == size)) {
if ((bpa->b_blkno == bpa->b_lblkno) ||
- (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
+ (bpa->b_blkno !=
+ bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
break;
} else {
break;
}
}
- ncl = i;
+ for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
+ if ((bpa = gbincore(vp, lblkno - j)) &&
+ BUF_REFCNT(bpa) == 0 &&
+ ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
+ (B_DELWRI | B_CLUSTEROK)) &&
+ (bpa->b_bufsize == size)) {
+ if ((bpa->b_blkno == bpa->b_lblkno) ||
+ (bpa->b_blkno !=
+ bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
+ break;
+ } else {
+ break;
+ }
+ }
+ --j;
+ ncl = i + j;
/*
* this is a possible cluster write
*/
if (ncl != 1) {
- nwritten = cluster_wbuild(vp, size, lblkno, ncl);
+ nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
splx(s);
return nwritten;
}
@@ -1240,21 +1340,12 @@ vfs_bio_awrite(struct buf * bp)
* If we have to flush dirty buffers ( but we try to avoid this )
*
* To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- * Instead we ask the pageout daemon to do it for us. We attempt to
+ * Instead we ask the buf daemon to do it for us. We attempt to
* avoid piecemeal wakeups of the pageout daemon.
*/
- /*
- * We fully expect to be able to handle any fragmentation and buffer
- * space issues by freeing QUEUE_CLEAN buffers. If this fails, we
- * have to wakeup the pageout daemon and ask it to flush some of our
- * QUEUE_DIRTY buffers. We have to be careful to prevent a deadlock.
- * XXX
- */
-
static struct buf *
-getnewbuf(struct vnode *vp, daddr_t blkno,
- int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
struct buf *bp;
struct buf *nbp;
@@ -1262,8 +1353,6 @@ getnewbuf(struct vnode *vp, daddr_t blkno,
int outofspace;
int nqindex;
int defrag = 0;
- static int newbufcnt = 0;
- int lastnewbuf = newbufcnt;
++getnewbufcalls;
--getnewbufrestarts;
@@ -1338,13 +1427,9 @@ restart:
* depending.
*/
- if (nbp)
- --getnewbufloops;
-
while ((bp = nbp) != NULL) {
int qindex = nqindex;
- ++getnewbufloops;
/*
* Calculate next bp ( we can only use it if we do not block
* or do other fancy things ).
@@ -1372,7 +1457,6 @@ restart:
/*
* Sanity Checks
*/
- KASSERT(BUF_REFCNT(bp) == 0, ("getnewbuf: busy buffer %p on free list", bp));
KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
/*
@@ -1388,14 +1472,10 @@ restart:
* buffer isn't useful for fixing that problem we continue.
*/
- if (defrag > 0 && bp->b_kvasize == 0) {
- ++getnewbufloops1;
+ if (defrag > 0 && bp->b_kvasize == 0)
continue;
- }
- if (outofspace > 0 && bp->b_bufsize == 0) {
- ++getnewbufloops2;
+ if (outofspace > 0 && bp->b_bufsize == 0)
continue;
- }
/*
* Start freeing the bp. This is somewhat involved. nbp
@@ -1433,7 +1513,6 @@ restart:
}
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
-
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
@@ -1451,7 +1530,6 @@ restart:
bp->b_bcount = 0;
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_usecount = 5;
LIST_INIT(&bp->b_dep);
@@ -1489,19 +1567,26 @@ restart:
/*
* If we exhausted our list, sleep as appropriate. We may have to
- * wakeup the pageout daemon to write out some dirty buffers.
+ * wakeup various daemons and write out some dirty buffers.
+ *
+ * Generally we are sleeping due to insufficient buffer space.
*/
if (bp == NULL) {
int flags;
+ char *waitmsg;
dosleep:
- if (defrag > 0)
+ if (defrag > 0) {
flags = VFS_BIO_NEED_KVASPACE;
- else if (outofspace > 0)
+ waitmsg = "nbufkv";
+ } else if (outofspace > 0) {
+ waitmsg = "nbufbs";
flags = VFS_BIO_NEED_BUFSPACE;
- else
+ } else {
+ waitmsg = "newbuf";
flags = VFS_BIO_NEED_ANY;
+ }
/* XXX */
@@ -1509,7 +1594,7 @@ dosleep:
needsbuffer |= flags;
while (needsbuffer & flags) {
if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
- "newbuf", slptimeo))
+ waitmsg, slptimeo))
return (NULL);
}
} else {
@@ -1553,42 +1638,7 @@ dosleep:
}
bp->b_data = bp->b_kvabase;
}
-
- /*
- * If we have slept at some point in this process and another
- * process has managed to allocate a new buffer while we slept,
- * we have to return NULL so that our caller can recheck to
- * ensure that the other process did not create an identically
- * identified buffer to the one we were requesting. We make this
- * check by incrementing the static int newbufcnt each time we
- * successfully allocate a new buffer. By saving the value of
- * newbufcnt in our local lastnewbuf, we can compare newbufcnt
- * with lastnewbuf to see if any other process managed to
- * allocate a buffer while we were doing so ourselves.
- *
- * Note that bp, if valid, is locked.
- */
- if (lastnewbuf == newbufcnt) {
- /*
- * No buffers allocated, so we can return one if we were
- * successful, or continue trying if we were not successful.
- */
- if (bp != NULL) {
- newbufcnt += 1;
- return (bp);
- }
- goto restart;
- }
- /*
- * Another process allocated a buffer since we were called, so
- * we have to free the one we allocated and return NULL to let
- * our caller recheck to see if a new buffer is still needed.
- */
- if (bp != NULL) {
- bp->b_flags |= B_INVAL;
- brelse(bp);
- }
- return (NULL);
+ return(bp);
}
/*
@@ -1601,7 +1651,6 @@ static void
waitfreebuffers(int slpflag, int slptimeo)
{
while (numfreebuffers < hifreebuffers) {
- bd_wakeup(0);
if (numfreebuffers >= hifreebuffers)
break;
needsbuffer |= VFS_BIO_NEED_FREE;
@@ -1646,60 +1695,72 @@ buf_daemon()
bd_request = 0;
/*
- * Do the flush.
+ * Do the flush. Limit the number of buffers we flush in one
+ * go. The failure condition occurs when processes are writing
+ * buffers faster then we can dispose of them. In this case
+ * we may be flushing so often that the previous set of flushes
+ * have not had time to complete, causing us to run out of
+ * physical buffers and block.
*/
{
- while (numdirtybuffers > bd_flushto) {
+ int runcount = maxbdrun;
+
+ while (numdirtybuffers > bd_flushto && runcount) {
+ --runcount;
if (flushbufqueues() == 0)
break;
}
}
/*
- * Whew. If nobody is requesting anything we sleep until the
- * next event. If we sleep and the sleep times out and
- * nobody is waiting for interesting things we back-off.
- * Otherwise we get more aggressive.
+ * If nobody is requesting anything we sleep
*/
+ if (bd_request == 0)
+ tsleep(&bd_request, PVM, "psleep", bd_interval);
- if (bd_request == 0 &&
- tsleep(&bd_request, PVM, "psleep", bd_interval) &&
- needsbuffer == 0) {
- /*
- * timed out and nothing serious going on,
- * increase the flushto high water mark to reduce
- * the flush rate.
- */
- bd_flushto += 10;
- } else {
- /*
- * We were woken up or hit a serious wall that needs
- * to be addressed.
- */
- bd_flushto -= 10;
- if (needsbuffer) {
- int middb = (lodirtybuffers+hidirtybuffers)/2;
- bd_interval >>= 1;
- if (bd_flushto > middb)
- bd_flushto = middb;
- }
+ /*
+ * We calculate how much to add or subtract from bd_flushto
+ * and bd_interval based on how far off we are from the
+ * optimal number of dirty buffers, which is 20% below the
+ * hidirtybuffers mark. We cannot use hidirtybuffers straight
+ * because being right on the mark will cause getnewbuf()
+ * to oscillate our wakeup.
+ *
+ * The larger the error in either direction, the more we adjust
+ * bd_flushto and bd_interval. The time interval is adjusted
+ * by 2 seconds per whole-buffer-range of error. This is an
+ * exponential convergence algorithm, with large errors
+ * producing large changes and small errors producing small
+ * changes.
+ */
+
+ {
+ int brange = hidirtybuffers - lodirtybuffers;
+ int middb = hidirtybuffers - brange / 5;
+ int deltabuf = middb - numdirtybuffers;
+
+ bd_flushto += deltabuf / 20;
+ bd_interval += deltabuf * (2 * hz) / (brange * 1);
}
- if (bd_flushto < lodirtybuffers) {
+ if (bd_flushto < lodirtybuffers)
bd_flushto = lodirtybuffers;
- bd_interval -= hz / 10;
- }
- if (bd_flushto > hidirtybuffers) {
+ if (bd_flushto > hidirtybuffers)
bd_flushto = hidirtybuffers;
- bd_interval += hz / 10;
- }
if (bd_interval < hz / 10)
bd_interval = hz / 10;
-
if (bd_interval > 5 * hz)
bd_interval = 5 * hz;
}
}
+/*
+ * flushbufqueues:
+ *
+ * Try to flush a buffer in the dirty queue. We must be careful to
+ * free up B_INVAL buffers instead of write them, which NFS is
+ * particularly sensitive to.
+ */
+
static int
flushbufqueues(void)
{
@@ -1709,15 +1770,6 @@ flushbufqueues(void)
bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
while (bp) {
- /*
- * Try to free up B_INVAL delayed-write buffers rather then
- * writing them out. Note also that NFS is somewhat sensitive
- * to B_INVAL buffers so it is doubly important that we do
- * this.
- *
- * We do not try to sync buffers whos vnodes are locked, we
- * cannot afford to block in this process.
- */
KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
if ((bp->b_flags & B_DELWRI) != 0) {
if (bp->b_flags & B_INVAL) {
@@ -1728,11 +1780,9 @@ flushbufqueues(void)
++r;
break;
}
- if (!VOP_ISLOCKED(bp->b_vp)) {
- vfs_bio_awrite(bp);
- ++r;
- break;
- }
+ vfs_bio_awrite(bp);
+ ++r;
+ break;
}
bp = TAILQ_NEXT(bp, b_freelist);
}
@@ -1957,8 +2007,6 @@ loop:
*/
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
- if (bp->b_usecount < BUF_MAXUSE)
- ++bp->b_usecount;
if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
"getblk", slpflag, slptimeo) == ENOLCK)
goto loop;
@@ -2036,8 +2084,6 @@ loop:
goto loop;
}
- if (bp->b_usecount < BUF_MAXUSE)
- ++bp->b_usecount;
splx(s);
bp->b_flags &= ~B_DONE;
} else {
@@ -2063,8 +2109,7 @@ loop:
maxsize = vmio ? size + (offset & PAGE_MASK) : size;
maxsize = imax(maxsize, bsize);
- if ((bp = getnewbuf(vp, blkno,
- slpflag, slptimeo, size, maxsize)) == NULL) {
+ if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
if (slpflag || slptimeo) {
splx(s);
return NULL;
@@ -2079,7 +2124,8 @@ loop:
* If the buffer is created out from under us, we have to
* throw away the one we just created. There is now window
* race because we are safely running at splbio() from the
- * point of the duplicate buffer creation through to here.
+ * point of the duplicate buffer creation through to here,
+ * and we've locked the buffer.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@@ -2096,7 +2142,7 @@ loop:
bgetvp(vp, bp);
LIST_REMOVE(bp, b_hash);
- bh = BUFHASH(vp, blkno);
+ bh = bufhash(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
/*
@@ -2135,7 +2181,7 @@ geteblk(int size)
int s;
s = splbio();
- while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
+ while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
splx(s);
allocbuf(bp, size);
bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
@@ -2218,7 +2264,8 @@ allocbuf(struct buf *bp, int size)
#if !defined(NO_B_MALLOC)
/*
* We only use malloced memory on the first allocation.
- * and revert to page-allocated memory when the buffer grows.
+ * and revert to page-allocated memory when the buffer
+ * grows.
*/
if ( (bufmallocspace < maxbufmallocspace) &&
(bp->b_bufsize == 0) &&
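
The reworked buf_daemon() loop above replaces the old fixed ±10 adjustments with a proportional controller: the target is 20% below hidirtybuffers, and both bd_flushto and bd_interval move in proportion to the current error. A standalone simulation of just that adjustment (hz, the watermarks, and the dirty-buffer series are local stand-ins for the kernel globals) shows the convergence:

/*
 * Hedged sketch: the proportional adjustment from the new buf_daemon()
 * loop, pulled out into a userland simulation with synthetic input.
 */
#include <stdio.h>

int
main(void)
{
	int hz = 100;
	int lodirtybuffers = 160, hidirtybuffers = 276;
	int bd_flushto = lodirtybuffers;
	int bd_interval = 5 * hz;
	int numdirtybuffers = 400;		/* synthetic starting load */
	int iter;

	for (iter = 0; iter < 10; iter++) {
		int brange = hidirtybuffers - lodirtybuffers;
		int middb = hidirtybuffers - brange / 5;  /* 20% below hi */
		int deltabuf = middb - numdirtybuffers;   /* signed error */

		/* Large error => large correction, small error => small. */
		bd_flushto += deltabuf / 20;
		bd_interval += deltabuf * (2 * hz) / brange;

		/* Clamp exactly as the kernel loop does. */
		if (bd_flushto < lodirtybuffers)
			bd_flushto = lodirtybuffers;
		if (bd_flushto > hidirtybuffers)
			bd_flushto = hidirtybuffers;
		if (bd_interval < hz / 10)
			bd_interval = hz / 10;
		if (bd_interval > 5 * hz)
			bd_interval = 5 * hz;

		printf("dirty=%3d flushto=%3d interval=%3d ticks\n",
		    numdirtybuffers, bd_flushto, bd_interval);

		/* Pretend each pass flushes half of the excess. */
		numdirtybuffers -= (numdirtybuffers - middb) / 2;
	}
	return (0);
}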
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 9a9eb60ebad0..f6fc890b3dca 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.85 1999/06/29 05:59:43 peter Exp $
+ * $Id: vfs_cluster.c,v 1.86 1999/07/04 00:31:17 mckusick Exp $
*/
#include "opt_debug_cluster.h"
@@ -150,21 +150,12 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
}
/*
- * Set another read-ahead mark so we know to check
- * again.
+ * Set another read-ahead mark so we know
+ * to check again.
*/
if (((i % racluster) == (racluster - 1)) ||
(i == (maxra - 1)))
tbp->b_flags |= B_RAM;
-
-#if 0
- if ((tbp->b_usecount < 1) &&
- BUF_REFCNT(tbp) == 0 &&
- (tbp->b_qindex == QUEUE_LRU)) {
- TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist);
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist);
- }
-#endif
}
splx(s);
if (i >= maxra) {
@@ -586,7 +577,7 @@ cluster_write(bp, filesize)
if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
if (!async)
- cluster_wbuild(vp, lblocksize,
+ cluster_wbuild_wb(vp, lblocksize,
vp->v_cstart, cursize);
} else {
struct buf **bpp, **endbp;
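
The vfs_bio_awrite() change in vfs_bio.c above now scans in both directions from the starting block, since the buffers are not necessarily presented in order, and the resulting cluster write begins at lblkno - j. A toy version of that search over a synthetic map of dirty, clusterable logical blocks:

/*
 * Hedged sketch of the bidirectional cluster search now done by
 * vfs_bio_awrite(): scan forward, then backward, from the starting block
 * for contiguous clusterable buffers, then write one cluster covering
 * lblkno - j .. lblkno + i - 1.  The dirty[] map below is made up.
 */
#include <stdio.h>

#define NBLOCKS	32

static int dirty[NBLOCKS] = {
	[4] = 1, [5] = 1, [6] = 1, [7] = 1, [8] = 1, [9] = 1,
};

int
main(void)
{
	int lblkno = 7;			/* block handed to the "awrite" */
	int maxcl = 16;			/* stand-in for MAXPHYS / size */
	int i, j, ncl;

	/* Forward scan: contiguous dirty blocks from lblkno onward. */
	for (i = 1; i < maxcl; i++) {
		if (lblkno + i >= NBLOCKS || !dirty[lblkno + i])
			break;
	}

	/* Backward scan: contiguous dirty blocks preceding lblkno. */
	for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
		if (!dirty[lblkno - j])
			break;
	}
	--j;

	ncl = i + j;			/* total cluster length in blocks */
	printf("cluster: start=%d length=%d\n", lblkno - j, ncl);
	return (0);
}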
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index efca6c8a1578..4ef741c8e367 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $
+ * $Id: vfs_subr.c,v 1.206 1999/07/04 00:25:29 mckusick Exp $
*/
/*
@@ -115,8 +115,8 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad,
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
-int vfs_ioopt = 0;
#ifdef ENABLE_VFS_IOOPT
+int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index efca6c8a1578..4ef741c8e367 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $
+ * $Id: vfs_subr.c,v 1.206 1999/07/04 00:25:29 mckusick Exp $
*/
/*
@@ -115,8 +115,8 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad,
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
-int vfs_ioopt = 0;
#ifdef ENABLE_VFS_IOOPT
+int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 8a520d335594..87cdac2f4858 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
- * $Id: vfs_vnops.c,v 1.68 1999/04/28 11:37:12 phk Exp $
+ * $Id: vfs_vnops.c,v 1.69 1999/07/02 16:29:15 phk Exp $
*/
#include <sys/param.h>
@@ -334,10 +334,14 @@ vn_write(fp, uio, cred, flags)
struct ucred *cred;
int flags;
{
- struct vnode *vp = (struct vnode *)fp->f_data;
+ struct vnode *vp;
struct proc *p = uio->uio_procp;
int error, ioflag;
+ vp = (struct vnode *)fp->f_data;
+ if (vp->v_type == VREG)
+ bwillwrite();
+ vp = (struct vnode *)fp->f_data; /* XXX needed? */
ioflag = IO_UNIT;
if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
ioflag |= IO_APPEND;
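
vn_write() now calls the new bwillwrite() before locking the vnode, so a writer holding no locks stalls when the dirty-buffer count is well above hidirtybuffers, giving the buf daemon a chance to catch up. A single-threaded sketch of that high-water throttle (the flush and the tsleep are simulated inline rather than performed by a second thread):

/*
 * Hedged sketch of the bwillwrite() throttle added above: writers entering
 * vn_write() block, before taking any vnode lock, once the dirty count
 * exceeds hidirtybuffers plus a 20%-of-range margin, and are released when
 * it drops back under hidirtybuffers.  Values and the flush are synthetic.
 */
#include <stdio.h>

static int lodirtybuffers = 160, hidirtybuffers = 276;
static int numdirtybuffers = 320;

/* Stand-in for the buf daemon making progress while we "sleep". */
static void
simulated_flush(void)
{
	numdirtybuffers -= 8;
}

static void
bwillwrite_sketch(void)
{
	int twenty = (hidirtybuffers - lodirtybuffers) / 5;

	if (numdirtybuffers > hidirtybuffers + twenty) {
		while (numdirtybuffers > hidirtybuffers) {
			/* kernel: bd_wakeup() + tsleep(&needsbuffer, ...) */
			simulated_flush();
			printf("blocked: %d dirty buffers\n", numdirtybuffers);
		}
	}
}

int
main(void)
{
	bwillwrite_sketch();
	printf("writer proceeds at %d dirty buffers\n", numdirtybuffers);
	return (0);
}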
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index e6d23d86d9cc..87043e30b95b 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $
+ * $Id: buf.h,v 1.75 1999/07/04 00:25:32 mckusick Exp $
*/
#ifndef _SYS_BUF_H_
@@ -100,7 +100,7 @@ struct buf {
TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */
long b_flags; /* B_* flags. */
unsigned short b_qindex; /* buffer queue index */
- unsigned char b_usecount; /* buffer use count */
+ unsigned char b_unused1; /* unused field */
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
int b_error; /* Errno value. */
@@ -410,18 +410,6 @@ bufq_first(struct buf_queue_head *head)
#endif /* KERNEL */
-
-/*
- * number of buffer hash entries
- */
-#define BUFHSZ 512
-
-/*
- * buffer hash table calculation, originally by David Greenman
- */
-#define BUFHASH(vnp, bn) \
- (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ])
-
/*
* Definitions for the buffer free lists.
*/
@@ -458,7 +446,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
struct uio;
+vm_offset_t bufhashinit __P((vm_offset_t));
void bufinit __P((void));
+void bwillwrite __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index e6d23d86d9cc..87043e30b95b 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $
+ * $Id: buf.h,v 1.75 1999/07/04 00:25:32 mckusick Exp $
*/
#ifndef _SYS_BUF_H_
@@ -100,7 +100,7 @@ struct buf {
TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */
long b_flags; /* B_* flags. */
unsigned short b_qindex; /* buffer queue index */
- unsigned char b_usecount; /* buffer use count */
+ unsigned char b_unused1; /* unused field */
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
int b_error; /* Errno value. */
@@ -410,18 +410,6 @@ bufq_first(struct buf_queue_head *head)
#endif /* KERNEL */
-
-/*
- * number of buffer hash entries
- */
-#define BUFHSZ 512
-
-/*
- * buffer hash table calculation, originally by David Greenman
- */
-#define BUFHASH(vnp, bn) \
- (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ])
-
/*
* Definitions for the buffer free lists.
*/
@@ -458,7 +446,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
struct uio;
+vm_offset_t bufhashinit __P((vm_offset_t));
void bufinit __P((void));
+void bwillwrite __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index d12a8020e2f9..53f980af8060 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
- * $Id: ufs_readwrite.c,v 1.57 1999/01/28 00:57:56 dillon Exp $
+ * $Id: ufs_readwrite.c,v 1.58 1999/04/05 19:38:30 julian Exp $
*/
#define BLKSIZE(a, b, c) blksize(a, b, c)
@@ -106,7 +106,8 @@ READ(ap)
if (object)
vm_object_reference(object);
-#if 1
+
+#ifdef ENABLE_VFS_IOOPT
/*
* If IO optimisation is turned on,
* and we are NOT a VM based IO request,
@@ -150,7 +151,7 @@ READ(ap)
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
break;
-#if 1
+#ifdef ENABLE_VFS_IOOPT
if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
/*
* Obviously we didn't finish above, but we
@@ -276,6 +277,7 @@ READ(ap)
xfersize = size;
}
+#ifdef ENABLE_VFS_IOOPT
if (vfs_ioopt && object &&
(bp->b_flags & B_VMIO) &&
((blkoffset & PAGE_MASK) == 0) &&
@@ -289,7 +291,9 @@ READ(ap)
error =
uiomoveco((char *)bp->b_data + blkoffset,
(int)xfersize, uio, object);
- } else {
+ } else
+#endif
+ {
/*
* otherwise use the general form
*/