author     Kirk McKusick <mckusick@FreeBSD.org>    1999-07-08 06:06:00 +0000
committer  Kirk McKusick <mckusick@FreeBSD.org>    1999-07-08 06:06:00 +0000
commit     ad8ac923fa48ba09c5aa52201f3e1e5a81e30eeb
tree       2284f4f780430039d2d5e44edfd2407ff2a99ebc
parent     bedf427650aa25ac43551bad76d88792626ddd49
-rw-r--r--   sys/alpha/alpha/machdep.c       4
-rw-r--r--   sys/amd64/amd64/machdep.c       4
-rw-r--r--   sys/amd64/amd64/pmap.c          7
-rw-r--r--   sys/i386/i386/machdep.c         4
-rw-r--r--   sys/i386/i386/pmap.c            7
-rw-r--r--   sys/kern/kern_subr.c           11
-rw-r--r--   sys/kern/vfs_bio.c            381
-rw-r--r--   sys/kern/vfs_cluster.c         17
-rw-r--r--   sys/kern/vfs_export.c           4
-rw-r--r--   sys/kern/vfs_subr.c             4
-rw-r--r--   sys/kern/vfs_vnops.c            8
-rw-r--r--   sys/sys/bio.h                  18
-rw-r--r--   sys/sys/buf.h                  18
-rw-r--r--   sys/ufs/ufs/ufs_readwrite.c    12
14 files changed, 267 insertions(+), 232 deletions(-)
diff --git a/sys/alpha/alpha/machdep.c b/sys/alpha/alpha/machdep.c
index 399e60f604de..5ab0d3f2fe6b 100644
--- a/sys/alpha/alpha/machdep.c
+++ b/sys/alpha/alpha/machdep.c
@@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: machdep.c,v 1.46 1999/07/05 08:52:40 msmith Exp $
+ * $Id: machdep.c,v 1.47 1999/07/06 17:48:16 peter Exp $
*/
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
@@ -354,7 +354,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
-
+ v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory
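
For context, here is a minimal userland sketch (not part of the patch) of what the new bufhashinit() call reserves during the kernel's first-pass sizing: the hash table is rounded up to a power of two of at least nbuf/4 entries, carved out of the bootstrap region, and the cursor is returned advanced past it. The logic mirrors the bufhashinit()/bufhash() definitions added to sys/kern/vfs_bio.c below; the nbuf value, the simplified list head, and main() are illustrative stand-ins only.

/*
 * Hedged sketch: mirrors the bufhashinit()/bufhash() logic added in
 * sys/kern/vfs_bio.c below, rebuilt as a standalone program.  The demo
 * value nbuf = 1024 and the simplified bufhashhdr are stand-ins.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct bufhashhdr { void *lh_first; };	/* stand-in for LIST_HEAD(, buf) */

static int nbuf = 1024;			/* assumed buffer count */
static int bufhashmask;
static struct bufhashhdr *bufhashtbl;

/* Round the table up to a power of two >= nbuf/4, carve it out of the
 * bootstrap region at vaddr, and return the advanced cursor. */
static uintptr_t
bufhashinit(uintptr_t vaddr)
{
	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
		;
	bufhashtbl = (struct bufhashhdr *)vaddr;
	vaddr += sizeof(*bufhashtbl) * bufhashmask;
	--bufhashmask;			/* size - 1 becomes the AND mask */
	return (vaddr);
}

/* Hash a (vnode pointer, logical block) pair; nearby blocks of the same
 * vnode land in nearby chains, which helps cache locality during scans. */
static struct bufhashhdr *
bufhash(void *vnp, long bn)
{
	return (&bufhashtbl[(((uintptr_t)vnp >> 7) + (int)bn) & bufhashmask]);
}

int
main(void)
{
	/* Simulate the bootstrap region with an ordinary (oversized) allocation. */
	uintptr_t base = (uintptr_t)malloc(sizeof(struct bufhashhdr) * nbuf);
	uintptr_t end = bufhashinit(base);

	printf("table entries: %d, bytes used: %zu\n",
	    bufhashmask + 1, (size_t)(end - base));
	printf("chain index for (vp=%p, bn=42): %td\n",
	    (void *)0x1000, bufhash((void *)0x1000, 42) - bufhashtbl);
	free((void *)base);
	return (0);
}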
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 616e7e5a7449..ec11c9827bd1 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
- * $Id: machdep.c,v 1.352 1999/07/05 08:52:49 msmith Exp $
+ * $Id: machdep.c,v 1.353 1999/07/06 07:13:33 cracauer Exp $
*/
#include "apm.h"
@@ -355,7 +355,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
-
+ v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 82ee99efc326..2b5c4fb81478 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -39,7 +39,7 @@
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
- * $Id: pmap.c,v 1.239 1999/06/08 17:14:22 dt Exp $
+ * $Id: pmap.c,v 1.240 1999/06/23 21:47:21 luoqi Exp $
*/
/*
@@ -3105,9 +3105,10 @@ pmap_changebit(pa, bit, setem)
/*
* pmap_clearbit:
*
- * Clear a bit/bits in every pte mapping a given physical page.
+ * Clear a bit/bits in every pte mapping a given physical page. Making
+ * this inline allows the pmap_changebit inline to be well optimized.
*/
-static void
+static __inline void
pmap_clearbit(
vm_offset_t pa,
int bit)
diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c
index 616e7e5a7449..ec11c9827bd1 100644
--- a/sys/i386/i386/machdep.c
+++ b/sys/i386/i386/machdep.c
@@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
- * $Id: machdep.c,v 1.352 1999/07/05 08:52:49 msmith Exp $
+ * $Id: machdep.c,v 1.353 1999/07/06 07:13:33 cracauer Exp $
*/
#include "apm.h"
@@ -355,7 +355,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
-
+ v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 82ee99efc326..2b5c4fb81478 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -39,7 +39,7 @@
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
- * $Id: pmap.c,v 1.239 1999/06/08 17:14:22 dt Exp $
+ * $Id: pmap.c,v 1.240 1999/06/23 21:47:21 luoqi Exp $
*/
/*
@@ -3105,9 +3105,10 @@ pmap_changebit(pa, bit, setem)
/*
* pmap_clearbit:
*
- * Clear a bit/bits in every pte mapping a given physical page.
+ * Clear a bit/bits in every pte mapping a given physical page. Making
+ * this inline allows the pmap_changebit inline to be well optimized.
*/
-static void
+static __inline void
pmap_clearbit(
vm_offset_t pa,
int bit)
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
index 13faec6a18d3..73d7d2c8a092 100644
--- a/sys/kern/kern_subr.c
+++ b/sys/kern/kern_subr.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
- * $Id: kern_subr.c,v 1.27 1999/02/22 18:39:49 bde Exp $
+ * $Id: kern_subr.c,v 1.28 1999/03/12 03:09:29 julian Exp $
*/
#include <sys/param.h>
@@ -156,6 +156,7 @@ uiomoveco(cp, n, uio, obj)
if (ticks - switchticks >= hogticks)
uio_yield();
if (uio->uio_rw == UIO_READ) {
+#ifdef ENABLE_VFS_IOOPT
if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) &&
((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
((uio->uio_offset & PAGE_MASK) == 0) &&
@@ -163,7 +164,9 @@ uiomoveco(cp, n, uio, obj)
error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
uio->uio_offset, cnt,
(vm_offset_t) iov->iov_base, NULL);
- } else {
+ } else
+#endif
+ {
error = copyout(cp, iov->iov_base, cnt);
}
} else {
@@ -192,6 +195,8 @@ uiomoveco(cp, n, uio, obj)
return (0);
}
+#ifdef ENABLE_VFS_IOOPT
+
int
uioread(n, uio, obj, nread)
int n;
@@ -258,6 +263,8 @@ uioread(n, uio, obj, nread)
return error;
}
+#endif
+
/*
* Give next character to user as result of read.
*/
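
The kern_subr.c change wraps the vm_uiomove() fast path in #ifdef ENABLE_VFS_IOOPT, so kernels built without the option always fall through to the plain copyout() path. A minimal sketch of that compile-time fallback shape follows; move_data() and fast_page_move() are hypothetical names, not kernel interfaces.

/*
 * Hedged sketch of the conditional-compilation pattern used in uiomoveco():
 * an optional page-aligned fast path compiled in only under
 * ENABLE_VFS_IOOPT, with an unconditional copy fallback.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_MASK	4095

#ifdef ENABLE_VFS_IOOPT
static int
fast_page_move(void *dst, const void *src, size_t len)
{
	/* Pretend this remaps whole pages instead of copying. */
	printf("page-flipping %zu bytes\n", len);
	memcpy(dst, src, len);
	return (0);
}
#endif

static int
move_data(void *dst, const void *src, size_t len, long offset)
{
#ifdef ENABLE_VFS_IOOPT
	/* The fast path only applies to fully page-aligned transfers. */
	if ((len & PAGE_MASK) == 0 &&
	    ((uintptr_t)dst & PAGE_MASK) == 0 &&
	    (offset & PAGE_MASK) == 0) {
		return (fast_page_move(dst, src, len));
	} else
#endif
	{
		memcpy(dst, src, len);	/* plain copy fallback */
		return (0);
	}
}

int
main(void)
{
	static char src[8192], dst[8192];

	return (move_data(dst, src, sizeof(dst), 0));
}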
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 5c478c6ccb4a..47e82761db8c 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
- * $Id: vfs_bio.c,v 1.219 1999/06/29 05:59:41 peter Exp $
+ * $Id: vfs_bio.c,v 1.220 1999/07/04 00:25:27 mckusick Exp $
*/
/*
@@ -90,14 +90,11 @@ static int bufspace, maxbufspace, vmiospace,
#if 0
static int maxvmiobufspace;
#endif
+static int maxbdrun;
static int needsbuffer;
static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
-static int getnewbufloops;
-static int getnewbufloops1;
-static int getnewbufloops2;
-static int getnewbufloops3;
static int getnewbufrestarts;
static int kvafreespace;
@@ -121,6 +118,8 @@ SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
&hibufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
+ &maxbdrun, 0, "");
#if 0
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
&maxvmiobufspace, 0, "");
@@ -135,18 +134,12 @@ SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
&kvafreespace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
&getnewbufcalls, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops, CTLFLAG_RW,
- &getnewbufloops, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops1, CTLFLAG_RW,
- &getnewbufloops1, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops2, CTLFLAG_RW,
- &getnewbufloops2, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops3, CTLFLAG_RW,
- &getnewbufloops3, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
&getnewbufrestarts, 0, "");
-static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
+
+static int bufhashmask;
+static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
char *buf_wmesg = BUF_WMESG;
@@ -155,12 +148,24 @@ extern int vm_swap_size;
#define BUF_MAXUSE 24
#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
-#define VFS_BIO_NEED_RESERVED02 0x02 /* unused */
+#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
#define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */
/*
+ * Buffer hash table code. Note that the logical block scans linearly, which
+ * gives us some L1 cache locality.
+ */
+
+static __inline
+struct bufhashhdr *
+bufhash(struct vnode *vnp, daddr_t bn)
+{
+ return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
+}
+
+/*
* kvaspacewakeup:
*
* Called when kva space is potential available for recovery or when
@@ -185,6 +190,24 @@ kvaspacewakeup(void)
}
/*
+ * numdirtywakeup:
+ *
+ * If someone is blocked due to there being too many dirty buffers,
+ * and numdirtybuffers is now reasonable, wake them up.
+ */
+
+static __inline void
+numdirtywakeup(void)
+{
+ if (numdirtybuffers < hidirtybuffers) {
+ if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
+ needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
+ wakeup(&needsbuffer);
+ }
+ }
+}
+
+/*
* bufspacewakeup:
*
* Called when buffer space is potentially available for recovery or when
@@ -260,10 +283,23 @@ bd_wakeup(int dirtybuflevel)
/*
- * Initialize buffer headers and related structures.
+ * Initialize buffer headers and related structures.
*/
+
+vm_offset_t
+bufhashinit(vm_offset_t vaddr)
+{
+ /* first, make a null hash table */
+ for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
+ ;
+ bufhashtbl = (void *)vaddr;
+ vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
+ --bufhashmask;
+ return(vaddr);
+}
+
void
-bufinit()
+bufinit(void)
{
struct buf *bp;
int i;
@@ -272,8 +308,7 @@ bufinit()
LIST_INIT(&invalhash);
simple_lock_init(&buftimelock);
- /* first, make a null hash table */
- for (i = 0; i < BUFHSZ; i++)
+ for (i = 0; i <= bufhashmask; i++)
LIST_INIT(&bufhashtbl[i]);
/* next, make a null set of free lists */
@@ -329,8 +364,8 @@ bufinit()
* Reduce the chance of a deadlock occuring by limiting the number
* of delayed-write dirty buffers we allow to stack up.
*/
- lodirtybuffers = nbuf / 6 + 10;
- hidirtybuffers = nbuf / 3 + 20;
+ lodirtybuffers = nbuf / 7 + 10;
+ hidirtybuffers = nbuf / 4 + 20;
numdirtybuffers = 0;
/*
@@ -341,6 +376,15 @@ bufinit()
hifreebuffers = 2 * lofreebuffers;
numfreebuffers = nbuf;
+/*
+ * Maximum number of async ops initiated per buf_daemon loop. This is
+ * somewhat of a hack at the moment, we really need to limit ourselves
+ * based on the number of bytes of I/O in-transit that were initiated
+ * from buf_daemon.
+ */
+ if ((maxbdrun = nswbuf / 4) < 4)
+ maxbdrun = 4;
+
kvafreespace = 0;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
@@ -383,19 +427,14 @@ bremfree(struct buf * bp)
if (bp->b_qindex == QUEUE_EMPTYKVA) {
kvafreespace -= bp->b_kvasize;
}
- if (BUF_REFCNT(bp) == 1)
- TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
- else if (BUF_REFCNT(bp) == 0)
- panic("bremfree: not locked");
- else
- /* Temporary panic to verify exclusive locking */
- /* This panic goes away when we allow shared refs */
- panic("bremfree: multiple refs");
+ KASSERT(BUF_REFCNT(bp) == 0, ("bremfree: bp %p not locked",bp));
+ TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
runningbufspace += bp->b_bufsize;
} else {
#if !defined(MAX_PERF)
- panic("bremfree: removing a buffer when not on a queue");
+ if (BUF_REFCNT(bp) <= 1)
+ panic("bremfree: removing a buffer not on a queue");
#endif
}
@@ -599,7 +638,9 @@ bwrite(struct buf * bp)
void
bdwrite(struct buf * bp)
{
+#if 0
struct vnode *vp;
+#endif
#if !defined(MAX_PERF)
if (BUF_REFCNT(bp) == 0)
@@ -654,6 +695,11 @@ bdwrite(struct buf * bp)
bd_wakeup(hidirtybuffers);
/*
+ * note: we cannot initiate I/O from a bdwrite even if we wanted to,
+ * due to the softdep code.
+ */
+#if 0
+ /*
* XXX The soft dependency code is not prepared to
* have I/O done when a bdwrite is requested. For
* now we just let the write be delayed if it is
@@ -664,6 +710,7 @@ bdwrite(struct buf * bp)
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
return;
+#endif
}
/*
@@ -722,6 +769,7 @@ bundirty(bp)
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp, bp->b_vp);
--numdirtybuffers;
+ numdirtywakeup();
}
}
@@ -757,6 +805,34 @@ bowrite(struct buf * bp)
}
/*
+ * bwillwrite:
+ *
+ * Called prior to the locking of any vnodes when we are expecting to
+ * write. We do not want to starve the buffer cache with too many
+ * dirty buffers so we block here. By blocking prior to the locking
+ * of any vnodes we attempt to avoid the situation where a locked vnode
+ * prevents the various system daemons from flushing related buffers.
+ */
+
+void
+bwillwrite(void)
+{
+ int twenty = (hidirtybuffers - lodirtybuffers) / 5;
+
+ if (numdirtybuffers > hidirtybuffers + twenty) {
+ int s;
+
+ s = splbio();
+ while (numdirtybuffers > hidirtybuffers) {
+ bd_wakeup(hidirtybuffers);
+ needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
+ tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
+ }
+ splx(s);
+ }
+}
+
+/*
* brelse:
*
* Release a busy buffer and, if requested, free its resources. The
@@ -799,8 +875,10 @@ brelse(struct buf * bp)
bp->b_flags |= B_INVAL;
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
- if (bp->b_flags & B_DELWRI)
+ if (bp->b_flags & B_DELWRI) {
--numdirtybuffers;
+ numdirtywakeup();
+ }
bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
if ((bp->b_flags & B_VMIO) == 0) {
if (bp->b_bufsize)
@@ -991,6 +1069,7 @@ brelse(struct buf * bp)
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
bp->b_flags &= ~B_DELWRI;
--numdirtybuffers;
+ numdirtywakeup();
}
runningbufspace -= bp->b_bufsize;
@@ -1070,7 +1149,7 @@ bqrelse(struct buf * bp)
/*
* Something we can maybe wakeup
*/
- if (bp->b_bufsize)
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
bufspacewakeup();
/* unlock */
@@ -1139,7 +1218,7 @@ gbincore(struct vnode * vp, daddr_t blkno)
struct buf *bp;
struct bufhashhdr *bh;
- bh = BUFHASH(vp, blkno);
+ bh = bufhash(vp, blkno);
bp = bh->lh_first;
/* Search hash chain */
@@ -1155,14 +1234,18 @@ gbincore(struct vnode * vp, daddr_t blkno)
}
/*
- * this routine implements clustered async writes for
- * clearing out B_DELWRI buffers... This is much better
- * than the old way of writing only one buffer at a time.
+ * vfs_bio_awrite:
+ *
+ * Implement clustered async writes for clearing out B_DELWRI buffers.
+ * This is much better then the old way of writing only one buffer at
+ * a time. Note that we may not be presented with the buffers in the
+ * correct order, so we search for the cluster in both directions.
*/
int
vfs_bio_awrite(struct buf * bp)
{
int i;
+ int j;
daddr_t lblkno = bp->b_lblkno;
struct vnode *vp = bp->b_vp;
int s;
@@ -1174,8 +1257,9 @@ vfs_bio_awrite(struct buf * bp)
s = splbio();
/*
- * right now we support clustered writing only to regular files, and
- * then only if our I/O system is not saturated.
+ * right now we support clustered writing only to regular files. If
+ * we find a clusterable block we could be in the middle of a cluster
+ * rather then at the beginning.
*/
if ((vp->v_type == VREG) &&
(vp->v_mount != 0) && /* Only on nodes that have the size info */
@@ -1191,18 +1275,34 @@ vfs_bio_awrite(struct buf * bp)
(B_DELWRI | B_CLUSTEROK)) &&
(bpa->b_bufsize == size)) {
if ((bpa->b_blkno == bpa->b_lblkno) ||
- (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
+ (bpa->b_blkno !=
+ bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
break;
} else {
break;
}
}
- ncl = i;
+ for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
+ if ((bpa = gbincore(vp, lblkno - j)) &&
+ BUF_REFCNT(bpa) == 0 &&
+ ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
+ (B_DELWRI | B_CLUSTEROK)) &&
+ (bpa->b_bufsize == size)) {
+ if ((bpa->b_blkno == bpa->b_lblkno) ||
+ (bpa->b_blkno !=
+ bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
+ break;
+ } else {
+ break;
+ }
+ }
+ --j;
+ ncl = i + j;
/*
* this is a possible cluster write
*/
if (ncl != 1) {
- nwritten = cluster_wbuild(vp, size, lblkno, ncl);
+ nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
splx(s);
return nwritten;
}
@@ -1240,21 +1340,12 @@ vfs_bio_awrite(struct buf * bp)
* If we have to flush dirty buffers ( but we try to avoid this )
*
* To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- * Instead we ask the pageout daemon to do it for us. We attempt to
+ * Instead we ask the buf daemon to do it for us. We attempt to
* avoid piecemeal wakeups of the pageout daemon.
*/
- /*
- * We fully expect to be able to handle any fragmentation and buffer
- * space issues by freeing QUEUE_CLEAN buffers. If this fails, we
- * have to wakeup the pageout daemon and ask it to flush some of our
- * QUEUE_DIRTY buffers. We have to be careful to prevent a deadlock.
- * XXX
- */
-
static struct buf *
-getnewbuf(struct vnode *vp, daddr_t blkno,
- int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
struct buf *bp;
struct buf *nbp;
@@ -1262,8 +1353,6 @@ getnewbuf(struct vnode *vp, daddr_t blkno,
int outofspace;
int nqindex;
int defrag = 0;
- static int newbufcnt = 0;
- int lastnewbuf = newbufcnt;
++getnewbufcalls;
--getnewbufrestarts;
@@ -1338,13 +1427,9 @@ restart:
* depending.
*/
- if (nbp)
- --getnewbufloops;
-
while ((bp = nbp) != NULL) {
int qindex = nqindex;
- ++getnewbufloops;
/*
* Calculate next bp ( we can only use it if we do not block
* or do other fancy things ).
@@ -1372,7 +1457,6 @@ restart:
/*
* Sanity Checks
*/
- KASSERT(BUF_REFCNT(bp) == 0, ("getnewbuf: busy buffer %p on free list", bp));
KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
/*
@@ -1388,14 +1472,10 @@ restart:
* buffer isn't useful for fixing that problem we continue.
*/
- if (defrag > 0 && bp->b_kvasize == 0) {
- ++getnewbufloops1;
+ if (defrag > 0 && bp->b_kvasize == 0)
continue;
- }
- if (outofspace > 0 && bp->b_bufsize == 0) {
- ++getnewbufloops2;
+ if (outofspace > 0 && bp->b_bufsize == 0)
continue;
- }
/*
* Start freeing the bp. This is somewhat involved. nbp
@@ -1433,7 +1513,6 @@ restart:
}
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
-
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
@@ -1451,7 +1530,6 @@ restart:
bp->b_bcount = 0;
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_usecount = 5;
LIST_INIT(&bp->b_dep);
@@ -1489,19 +1567,26 @@ restart:
/*
* If we exhausted our list, sleep as appropriate. We may have to
- * wakeup the pageout daemon to write out some dirty buffers.
+ * wakeup various daemons and write out some dirty buffers.
+ *
+ * Generally we are sleeping due to insufficient buffer space.
*/
if (bp == NULL) {
int flags;
+ char *waitmsg;
dosleep:
- if (defrag > 0)
+ if (defrag > 0) {
flags = VFS_BIO_NEED_KVASPACE;
- else if (outofspace > 0)
+ waitmsg = "nbufkv";
+ } else if (outofspace > 0) {
+ waitmsg = "nbufbs";
flags = VFS_BIO_NEED_BUFSPACE;
- else
+ } else {
+ waitmsg = "newbuf";
flags = VFS_BIO_NEED_ANY;
+ }
/* XXX */
@@ -1509,7 +1594,7 @@ dosleep:
needsbuffer |= flags;
while (needsbuffer & flags) {
if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
- "newbuf", slptimeo))
+ waitmsg, slptimeo))
return (NULL);
}
} else {
@@ -1553,42 +1638,7 @@ dosleep:
}
bp->b_data = bp->b_kvabase;
}
-
- /*
- * If we have slept at some point in this process and another
- * process has managed to allocate a new buffer while we slept,
- * we have to return NULL so that our caller can recheck to
- * ensure that the other process did not create an identically
- * identified buffer to the one we were requesting. We make this
- * check by incrementing the static int newbufcnt each time we
- * successfully allocate a new buffer. By saving the value of
- * newbufcnt in our local lastnewbuf, we can compare newbufcnt
- * with lastnewbuf to see if any other process managed to
- * allocate a buffer while we were doing so ourselves.
- *
- * Note that bp, if valid, is locked.
- */
- if (lastnewbuf == newbufcnt) {
- /*
- * No buffers allocated, so we can return one if we were
- * successful, or continue trying if we were not successful.
- */
- if (bp != NULL) {
- newbufcnt += 1;
- return (bp);
- }
- goto restart;
- }
- /*
- * Another process allocated a buffer since we were called, so
- * we have to free the one we allocated and return NULL to let
- * our caller recheck to see if a new buffer is still needed.
- */
- if (bp != NULL) {
- bp->b_flags |= B_INVAL;
- brelse(bp);
- }
- return (NULL);
+ return(bp);
}
/*
@@ -1601,7 +1651,6 @@ static void
waitfreebuffers(int slpflag, int slptimeo)
{
while (numfreebuffers < hifreebuffers) {
- bd_wakeup(0);
if (numfreebuffers >= hifreebuffers)
break;
needsbuffer |= VFS_BIO_NEED_FREE;
@@ -1646,60 +1695,72 @@ buf_daemon()
bd_request = 0;
/*
- * Do the flush.
+ * Do the flush. Limit the number of buffers we flush in one
+ * go. The failure condition occurs when processes are writing
+ * buffers faster then we can dispose of them. In this case
+ * we may be flushing so often that the previous set of flushes
+ * have not had time to complete, causing us to run out of
+ * physical buffers and block.
*/
{
- while (numdirtybuffers > bd_flushto) {
+ int runcount = maxbdrun;
+
+ while (numdirtybuffers > bd_flushto && runcount) {
+ --runcount;
if (flushbufqueues() == 0)
break;
}
}
/*
- * Whew. If nobody is requesting anything we sleep until the
- * next event. If we sleep and the sleep times out and
- * nobody is waiting for interesting things we back-off.
- * Otherwise we get more aggressive.
+ * If nobody is requesting anything we sleep
*/
+ if (bd_request == 0)
+ tsleep(&bd_request, PVM, "psleep", bd_interval);
- if (bd_request == 0 &&
- tsleep(&bd_request, PVM, "psleep", bd_interval) &&
- needsbuffer == 0) {
- /*
- * timed out and nothing serious going on,
- * increase the flushto high water mark to reduce
- * the flush rate.
- */
- bd_flushto += 10;
- } else {
- /*
- * We were woken up or hit a serious wall that needs
- * to be addressed.
- */
- bd_flushto -= 10;
- if (needsbuffer) {
- int middb = (lodirtybuffers+hidirtybuffers)/2;
- bd_interval >>= 1;
- if (bd_flushto > middb)
- bd_flushto = middb;
- }
+ /*
+ * We calculate how much to add or subtract from bd_flushto
+ * and bd_interval based on how far off we are from the
+ * optimal number of dirty buffers, which is 20% below the
+ * hidirtybuffers mark. We cannot use hidirtybuffers straight
+ * because being right on the mark will cause getnewbuf()
+ * to oscillate our wakeup.
+ *
+ * The larger the error in either direction, the more we adjust
+ * bd_flushto and bd_interval. The time interval is adjusted
+ * by 2 seconds per whole-buffer-range of error. This is an
+ * exponential convergence algorithm, with large errors
+ * producing large changes and small errors producing small
+ * changes.
+ */
+
+ {
+ int brange = hidirtybuffers - lodirtybuffers;
+ int middb = hidirtybuffers - brange / 5;
+ int deltabuf = middb - numdirtybuffers;
+
+ bd_flushto += deltabuf / 20;
+ bd_interval += deltabuf * (2 * hz) / (brange * 1);
}
- if (bd_flushto < lodirtybuffers) {
+ if (bd_flushto < lodirtybuffers)
bd_flushto = lodirtybuffers;
- bd_interval -= hz / 10;
- }
- if (bd_flushto > hidirtybuffers) {
+ if (bd_flushto > hidirtybuffers)
bd_flushto = hidirtybuffers;
- bd_interval += hz / 10;
- }
if (bd_interval < hz / 10)
bd_interval = hz / 10;
-
if (bd_interval > 5 * hz)
bd_interval = 5 * hz;
}
}
+/*
+ * flushbufqueues:
+ *
+ * Try to flush a buffer in the dirty queue. We must be careful to
+ * free up B_INVAL buffers instead of write them, which NFS is
+ * particularly sensitive to.
+ */
+
static int
flushbufqueues(void)
{
@@ -1709,15 +1770,6 @@ flushbufqueues(void)
bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
while (bp) {
- /*
- * Try to free up B_INVAL delayed-write buffers rather then
- * writing them out. Note also that NFS is somewhat sensitive
- * to B_INVAL buffers so it is doubly important that we do
- * this.
- *
- * We do not try to sync buffers whos vnodes are locked, we
- * cannot afford to block in this process.
- */
KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
if ((bp->b_flags & B_DELWRI) != 0) {
if (bp->b_flags & B_INVAL) {
@@ -1728,11 +1780,9 @@ flushbufqueues(void)
++r;
break;
}
- if (!VOP_ISLOCKED(bp->b_vp)) {
- vfs_bio_awrite(bp);
- ++r;
- break;
- }
+ vfs_bio_awrite(bp);
+ ++r;
+ break;
}
bp = TAILQ_NEXT(bp, b_freelist);
}
@@ -1957,8 +2007,6 @@ loop:
*/
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
- if (bp->b_usecount < BUF_MAXUSE)
- ++bp->b_usecount;
if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
"getblk", slpflag, slptimeo) == ENOLCK)
goto loop;
@@ -2036,8 +2084,6 @@ loop:
goto loop;
}
- if (bp->b_usecount < BUF_MAXUSE)
- ++bp->b_usecount;
splx(s);
bp->b_flags &= ~B_DONE;
} else {
@@ -2063,8 +2109,7 @@ loop:
maxsize = vmio ? size + (offset & PAGE_MASK) : size;
maxsize = imax(maxsize, bsize);
- if ((bp = getnewbuf(vp, blkno,
- slpflag, slptimeo, size, maxsize)) == NULL) {
+ if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
if (slpflag || slptimeo) {
splx(s);
return NULL;
@@ -2079,7 +2124,8 @@ loop:
* If the buffer is created out from under us, we have to
* throw away the one we just created. There is now window
* race because we are safely running at splbio() from the
- * point of the duplicate buffer creation through to here.
+ * point of the duplicate buffer creation through to here,
+ * and we've locked the buffer.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@@ -2096,7 +2142,7 @@ loop:
bgetvp(vp, bp);
LIST_REMOVE(bp, b_hash);
- bh = BUFHASH(vp, blkno);
+ bh = bufhash(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
/*
@@ -2135,7 +2181,7 @@ geteblk(int size)
int s;
s = splbio();
- while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
+ while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
splx(s);
allocbuf(bp, size);
bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
@@ -2218,7 +2264,8 @@ allocbuf(struct buf *bp, int size)
#if !defined(NO_B_MALLOC)
/*
* We only use malloced memory on the first allocation.
- * and revert to page-allocated memory when the buffer grows.
+ * and revert to page-allocated memory when the buffer
+ * grows.
*/
if ( (bufmallocspace < maxbufmallocspace) &&
(bp->b_bufsize == 0) &&
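
The reworked buf_daemon() loop above replaces the old fixed ±10 adjustments with a proportional controller: the target is 20% below hidirtybuffers, and both bd_flushto and bd_interval move in proportion to the current error. A standalone simulation of just that adjustment (hz, the watermarks, and the dirty-buffer series are local stand-ins for the kernel globals) shows the convergence:

/*
 * Hedged sketch: the proportional adjustment from the new buf_daemon()
 * loop, pulled out into a userland simulation with synthetic input.
 */
#include <stdio.h>

int
main(void)
{
	int hz = 100;
	int lodirtybuffers = 160, hidirtybuffers = 276;
	int bd_flushto = lodirtybuffers;
	int bd_interval = 5 * hz;
	int numdirtybuffers = 400;		/* synthetic starting load */
	int iter;

	for (iter = 0; iter < 10; iter++) {
		int brange = hidirtybuffers - lodirtybuffers;
		int middb = hidirtybuffers - brange / 5;  /* 20% below hi */
		int deltabuf = middb - numdirtybuffers;   /* signed error */

		/* Large error => large correction, small error => small. */
		bd_flushto += deltabuf / 20;
		bd_interval += deltabuf * (2 * hz) / brange;

		/* Clamp exactly as the kernel loop does. */
		if (bd_flushto < lodirtybuffers)
			bd_flushto = lodirtybuffers;
		if (bd_flushto > hidirtybuffers)
			bd_flushto = hidirtybuffers;
		if (bd_interval < hz / 10)
			bd_interval = hz / 10;
		if (bd_interval > 5 * hz)
			bd_interval = 5 * hz;

		printf("dirty=%3d flushto=%3d interval=%3d ticks\n",
		    numdirtybuffers, bd_flushto, bd_interval);

		/* Pretend each pass flushes half of the excess. */
		numdirtybuffers -= (numdirtybuffers - middb) / 2;
	}
	return (0);
}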
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 9a9eb60ebad0..f6fc890b3dca 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.85 1999/06/29 05:59:43 peter Exp $
+ * $Id: vfs_cluster.c,v 1.86 1999/07/04 00:31:17 mckusick Exp $
*/
#include "opt_debug_cluster.h"
@@ -150,21 +150,12 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
}
/*
- * Set another read-ahead mark so we know to check
- * again.
+ * Set another read-ahead mark so we know
+ * to check again.
*/
if (((i % racluster) == (racluster - 1)) ||
(i == (maxra - 1)))
tbp->b_flags |= B_RAM;
-
-#if 0
- if ((tbp->b_usecount < 1) &&
- BUF_REFCNT(tbp) == 0 &&
- (tbp->b_qindex == QUEUE_LRU)) {
- TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist);
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist);
- }
-#endif
}
splx(s);
if (i >= maxra) {
@@ -586,7 +577,7 @@ cluster_write(bp, filesize)
if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
if (!async)
- cluster_wbuild(vp, lblocksize,
+ cluster_wbuild_wb(vp, lblocksize,
vp->v_cstart, cursize);
} else {
struct buf **bpp, **endbp;
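
The vfs_bio_awrite() change in vfs_bio.c above now scans in both directions from the starting block, since the buffers are not necessarily presented in order, and the resulting cluster write begins at lblkno - j. A toy version of that search over a synthetic map of dirty, clusterable logical blocks:

/*
 * Hedged sketch of the bidirectional cluster search now done by
 * vfs_bio_awrite(): scan forward, then backward, from the starting block
 * for contiguous clusterable buffers, then write one cluster covering
 * lblkno - j .. lblkno + i - 1.  The dirty[] map below is made up.
 */
#include <stdio.h>

#define NBLOCKS	32

static int dirty[NBLOCKS] = {
	[4] = 1, [5] = 1, [6] = 1, [7] = 1, [8] = 1, [9] = 1,
};

int
main(void)
{
	int lblkno = 7;			/* block handed to the "awrite" */
	int maxcl = 16;			/* stand-in for MAXPHYS / size */
	int i, j, ncl;

	/* Forward scan: contiguous dirty blocks from lblkno onward. */
	for (i = 1; i < maxcl; i++) {
		if (lblkno + i >= NBLOCKS || !dirty[lblkno + i])
			break;
	}

	/* Backward scan: contiguous dirty blocks preceding lblkno. */
	for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
		if (!dirty[lblkno - j])
			break;
	}
	--j;

	ncl = i + j;			/* total cluster length in blocks */
	printf("cluster: start=%d length=%d\n", lblkno - j, ncl);
	return (0);
}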
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index efca6c8a1578..4ef741c8e367 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $
+ * $Id: vfs_subr.c,v 1.206 1999/07/04 00:25:29 mckusick Exp $
*/
/*
@@ -115,8 +115,8 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad,
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
-int vfs_ioopt = 0;
#ifdef ENABLE_VFS_IOOPT
+int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index efca6c8a1578..4ef741c8e367 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $
+ * $Id: vfs_subr.c,v 1.206 1999/07/04 00:25:29 mckusick Exp $
*/
/*
@@ -115,8 +115,8 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad,
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
-int vfs_ioopt = 0;
#ifdef ENABLE_VFS_IOOPT
+int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 8a520d335594..87cdac2f4858 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
- * $Id: vfs_vnops.c,v 1.68 1999/04/28 11:37:12 phk Exp $
+ * $Id: vfs_vnops.c,v 1.69 1999/07/02 16:29:15 phk Exp $
*/
#include <sys/param.h>
@@ -334,10 +334,14 @@ vn_write(fp, uio, cred, flags)
struct ucred *cred;
int flags;
{
- struct vnode *vp = (struct vnode *)fp->f_data;
+ struct vnode *vp;
struct proc *p = uio->uio_procp;
int error, ioflag;
+ vp = (struct vnode *)fp->f_data;
+ if (vp->v_type == VREG)
+ bwillwrite();
+ vp = (struct vnode *)fp->f_data; /* XXX needed? */
ioflag = IO_UNIT;
if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
ioflag |= IO_APPEND;
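
vn_write() now calls the new bwillwrite() before locking the vnode, so a writer holding no locks stalls when the dirty-buffer count is well above hidirtybuffers, giving the buf daemon a chance to catch up. A single-threaded sketch of that high-water throttle (the flush and the tsleep are simulated inline rather than performed by a second thread):

/*
 * Hedged sketch of the bwillwrite() throttle added above: writers entering
 * vn_write() block, before taking any vnode lock, once the dirty count
 * exceeds hidirtybuffers plus a 20%-of-range margin, and are released when
 * it drops back under hidirtybuffers.  Values and the flush are synthetic.
 */
#include <stdio.h>

static int lodirtybuffers = 160, hidirtybuffers = 276;
static int numdirtybuffers = 320;

/* Stand-in for the buf daemon making progress while we "sleep". */
static void
simulated_flush(void)
{
	numdirtybuffers -= 8;
}

static void
bwillwrite_sketch(void)
{
	int twenty = (hidirtybuffers - lodirtybuffers) / 5;

	if (numdirtybuffers > hidirtybuffers + twenty) {
		while (numdirtybuffers > hidirtybuffers) {
			/* kernel: bd_wakeup() + tsleep(&needsbuffer, ...) */
			simulated_flush();
			printf("blocked: %d dirty buffers\n", numdirtybuffers);
		}
	}
}

int
main(void)
{
	bwillwrite_sketch();
	printf("writer proceeds at %d dirty buffers\n", numdirtybuffers);
	return (0);
}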
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index e6d23d86d9cc..87043e30b95b 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $
+ * $Id: buf.h,v 1.75 1999/07/04 00:25:32 mckusick Exp $
*/
#ifndef _SYS_BUF_H_
@@ -100,7 +100,7 @@ struct buf {
TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */
long b_flags; /* B_* flags. */
unsigned short b_qindex; /* buffer queue index */
- unsigned char b_usecount; /* buffer use count */
+ unsigned char b_unused1; /* unused field */
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
int b_error; /* Errno value. */
@@ -410,18 +410,6 @@ bufq_first(struct buf_queue_head *head)
#endif /* KERNEL */
-
-/*
- * number of buffer hash entries
- */
-#define BUFHSZ 512
-
-/*
- * buffer hash table calculation, originally by David Greenman
- */
-#define BUFHASH(vnp, bn) \
- (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ])
-
/*
* Definitions for the buffer free lists.
*/
@@ -458,7 +446,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
struct uio;
+vm_offset_t bufhashinit __P((vm_offset_t));
void bufinit __P((void));
+void bwillwrite __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index e6d23d86d9cc..87043e30b95b 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $
+ * $Id: buf.h,v 1.75 1999/07/04 00:25:32 mckusick Exp $
*/
#ifndef _SYS_BUF_H_
@@ -100,7 +100,7 @@ struct buf {
TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */
long b_flags; /* B_* flags. */
unsigned short b_qindex; /* buffer queue index */
- unsigned char b_usecount; /* buffer use count */
+ unsigned char b_unused1; /* unused field */
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
int b_error; /* Errno value. */
@@ -410,18 +410,6 @@ bufq_first(struct buf_queue_head *head)
#endif /* KERNEL */
-
-/*
- * number of buffer hash entries
- */
-#define BUFHSZ 512
-
-/*
- * buffer hash table calculation, originally by David Greenman
- */
-#define BUFHASH(vnp, bn) \
- (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ])
-
/*
* Definitions for the buffer free lists.
*/
@@ -458,7 +446,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
struct uio;
+vm_offset_t bufhashinit __P((vm_offset_t));
void bufinit __P((void));
+void bwillwrite __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index d12a8020e2f9..53f980af8060 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
- * $Id: ufs_readwrite.c,v 1.57 1999/01/28 00:57:56 dillon Exp $
+ * $Id: ufs_readwrite.c,v 1.58 1999/04/05 19:38:30 julian Exp $
*/
#define BLKSIZE(a, b, c) blksize(a, b, c)
@@ -106,7 +106,8 @@ READ(ap)
if (object)
vm_object_reference(object);
-#if 1
+
+#ifdef ENABLE_VFS_IOOPT
/*
* If IO optimisation is turned on,
* and we are NOT a VM based IO request,
@@ -150,7 +151,7 @@ READ(ap)
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
break;
-#if 1
+#ifdef ENABLE_VFS_IOOPT
if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
/*
* Obviously we didn't finish above, but we
@@ -276,6 +277,7 @@ READ(ap)
xfersize = size;
}
+#ifdef ENABLE_VFS_IOOPT
if (vfs_ioopt && object &&
(bp->b_flags & B_VMIO) &&
((blkoffset & PAGE_MASK) == 0) &&
@@ -289,7 +291,9 @@ READ(ap)
error =
uiomoveco((char *)bp->b_data + blkoffset,
(int)xfersize, uio, object);
- } else {
+ } else
+#endif
+ {
/*
* otherwise use the general form
*/