author     Julian Elischer <julian@FreeBSD.org>   1999-03-12 02:24:58 +0000
committer  Julian Elischer <julian@FreeBSD.org>   1999-03-12 02:24:58 +0000
commit     4ef2094e457b4e11e04bb4e2c70ea3bf57cf9ac3 (patch)
tree       45b7ce4c6bfa5f186dd372ee2194509ea0186cca
parent     ed1ff184f320feeed3d731f03ab20cadb9a3d9ef (diff)
-rw-r--r--  sys/kern/vfs_bio.c        1036
-rw-r--r--  sys/kern/vfs_cluster.c       6
-rw-r--r--  sys/kern/vfs_export.c       35
-rw-r--r--  sys/kern/vfs_subr.c         35
-rw-r--r--  sys/nfs/nfs_bio.c           57
-rw-r--r--  sys/nfs/nfs_vnops.c         81
-rw-r--r--  sys/nfsclient/nfs_bio.c     57
-rw-r--r--  sys/nfsclient/nfs_vnops.c   81
-rw-r--r--  sys/sys/bio.h               15
-rw-r--r--  sys/sys/buf.h               15
-rw-r--r--  sys/sys/proc.h               7
11 files changed, 905 insertions, 520 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index a01230bfd1cd..76f14b2e61ed 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
- * $Id: vfs_bio.c,v 1.200 1999/03/02 20:26:39 julian Exp $
+ * $Id: vfs_bio.c,v 1.201 1999/03/02 21:23:38 julian Exp $
*/
/*
@@ -83,8 +83,7 @@ static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static void flushdirtybuffers(int slpflag, int slptimeo);
-
-int needsbuffer;
+static int flushbufqueues(void);
/*
* Internal update daemon, process 3
@@ -92,11 +91,6 @@ int needsbuffer;
*/
int vfs_update_wakeup;
-
-/*
- * buffers base kva
- */
-
/*
* bogus page -- for I/O to/from partially complete buffers
* this is a temporary solution to the problem, but it is not
@@ -105,12 +99,13 @@ int vfs_update_wakeup;
* but the code is intricate enough already.
*/
vm_page_t bogus_page;
+int runningbufspace;
static vm_offset_t bogus_offset;
static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
- bufmallocspace, maxbufmallocspace;
-int numdirtybuffers;
-static int lodirtybuffers, hidirtybuffers;
+ bufmallocspace, maxbufmallocspace, hibufspace;
+static int needsbuffer;
+static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int kvafreespace;
@@ -126,8 +121,12 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
&lofreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
&hifreebuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
+ &runningbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
&maxbufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
+ &hibufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
@@ -146,11 +145,81 @@ struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
extern int vm_swap_size;
-#define BUF_MAXUSE 24
+#define BUF_MAXUSE 24
+
+#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
+#define VFS_BIO_NEED_RESERVED02 0x02 /* unused */
+#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
+#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
+#define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */
-#define VFS_BIO_NEED_ANY 1
-#define VFS_BIO_NEED_LOWLIMIT 2
-#define VFS_BIO_NEED_FREE 4
+/*
+ * kvaspacewakeup:
+ *
+ * Called when kva space is potentially available for recovery or when
+ * kva space is recovered in the buffer_map. This function wakes up
+ * anyone waiting for buffer_map kva space. Even though the buffer_map
+ * is larger than maxbufspace, this situation will typically occur
+ * when the buffer_map gets fragmented.
+ */
+
+static __inline void
+kvaspacewakeup(void)
+{
+ /*
+ * If someone is waiting for KVA space, wake them up. Even
+ * though we haven't freed the kva space yet, the waiting
+ * process will be able to now.
+ */
+ if (needsbuffer & VFS_BIO_NEED_KVASPACE) {
+ needsbuffer &= ~VFS_BIO_NEED_KVASPACE;
+ wakeup(&needsbuffer);
+ }
+}
+
+/*
+ * bufspacewakeup:
+ *
+ * Called when buffer space is potentially available for recovery or when
+ * buffer space is recovered. getnewbuf() will block on this flag when
+ * it is unable to free sufficient buffer space. Buffer space becomes
+ * recoverable when bp's get placed back in the queues.
+ */
+
+static __inline void
+bufspacewakeup(void)
+{
+ /*
+ * If someone is waiting for BUF space, wake them up. Even
+ * though we haven't freed the kva space yet, the waiting
+ * process will be able to now.
+ */
+ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
+ needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
+ wakeup(&needsbuffer);
+ }
+}
+
+/*
+ * bufcountwakeup:
+ *
+ * Called when a buffer has been added to one of the free queues to
+ * account for the buffer and to wakeup anyone waiting for free buffers.
+ * This typically occurs when large amounts of metadata are being handled
+ * by the buffer cache ( else buffer space runs out first, usually ).
+ */
+
+static __inline void
+bufcountwakeup(void)
+{
+ ++numfreebuffers;
+ if (needsbuffer) {
+ needsbuffer &= ~VFS_BIO_NEED_ANY;
+ if (numfreebuffers >= hifreebuffers)
+ needsbuffer &= ~VFS_BIO_NEED_FREE;
+ wakeup(&needsbuffer);
+ }
+}
/*
* Initialize buffer headers and related structures.
@@ -186,17 +255,25 @@ bufinit()
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
}
-/*
- * maxbufspace is currently calculated to support all filesystem blocks
- * to be 8K. If you happen to use a 16K filesystem, the size of the buffer
- * cache is still the same as it would be for 8K filesystems. This
- * keeps the size of the buffer cache "in check" for big block filesystems.
- */
+
+ /*
+ * maxbufspace is currently calculated to support all filesystem
+ * blocks to be 8K. If you happen to use a 16K filesystem, the size
+ * of the buffer cache is still the same as it would be for 8K
+ * filesystems. This keeps the size of the buffer cache "in check"
+ * for big block filesystems.
+ *
+ * maxbufspace is calculated as around 50% of the KVA available in
+ * the buffer_map ( DFLTSIZE vs BKVASIZE ), I presume to reduce the
+ * effect of fragmentation.
+ */
maxbufspace = (nbuf + 8) * DFLTBSIZE;
+ if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE)
+ hibufspace = 3 * maxbufspace / 4;
/*
* reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
*/
- maxvmiobufspace = 2 * maxbufspace / 3;
+ maxvmiobufspace = 2 * hibufspace / 3;
/*
* Limit the amount of malloc memory since it is wired permanently into
* the kernel space. Even though this is accounted for in the buffer
@@ -204,18 +281,24 @@ bufinit()
* The malloc scheme improves memory utilization significantly on average
* (small) directories.
*/
- maxbufmallocspace = maxbufspace / 20;
+ maxbufmallocspace = hibufspace / 20;
/*
- * Remove the probability of deadlock conditions by limiting the
- * number of dirty buffers.
+ * Reduce the chance of a deadlock occurring by limiting the number
+ * of delayed-write dirty buffers we allow to stack up.
*/
- hidirtybuffers = nbuf / 8 + 20;
lodirtybuffers = nbuf / 16 + 10;
+ hidirtybuffers = nbuf / 8 + 20;
numdirtybuffers = 0;
+
+/*
+ * Try to keep the number of free buffers in the specified range,
+ * and give the syncer access to an emergency reserve.
+ */
lofreebuffers = nbuf / 18 + 5;
hifreebuffers = 2 * lofreebuffers;
numfreebuffers = nbuf;
+
kvafreespace = 0;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
@@ -233,24 +316,26 @@ bufinit()
static void
bfreekva(struct buf * bp)
{
- if (bp->b_kvasize == 0)
- return;
-
- vm_map_delete(buffer_map,
- (vm_offset_t) bp->b_kvabase,
- (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
-
- bp->b_kvasize = 0;
-
+ if (bp->b_kvasize) {
+ vm_map_delete(buffer_map,
+ (vm_offset_t) bp->b_kvabase,
+ (vm_offset_t) bp->b_kvabase + bp->b_kvasize
+ );
+ bp->b_kvasize = 0;
+ kvaspacewakeup();
+ }
}
/*
- * remove the buffer from the appropriate free list
+ * bremfree:
+ *
+ * Remove the buffer from the appropriate free list.
*/
void
bremfree(struct buf * bp)
{
int s = splbio();
+ int old_qindex = bp->b_qindex;
if (bp->b_qindex != QUEUE_NONE) {
if (bp->b_qindex == QUEUE_EMPTY) {
@@ -258,14 +343,29 @@ bremfree(struct buf * bp)
}
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
+ runningbufspace += bp->b_bufsize;
} else {
#if !defined(MAX_PERF)
panic("bremfree: removing a buffer when not on a queue");
#endif
}
- if ((bp->b_flags & B_INVAL) ||
- (bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
- --numfreebuffers;
+
+ /*
+ * Fixup numfreebuffers count. If the buffer is invalid or not
+ * delayed-write, and it was on the EMPTY, LRU, or AGE queues,
+ * the buffer was free and we must decrement numfreebuffers.
+ */
+ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+ switch(old_qindex) {
+ case QUEUE_EMPTY:
+ case QUEUE_LRU:
+ case QUEUE_AGE:
+ --numfreebuffers;
+ break;
+ default:
+ break;
+ }
+ }
splx(s);
}
@@ -286,6 +386,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
if ((bp->b_flags & B_CACHE) == 0) {
if (curproc != NULL)
curproc->p_stats->p_ru.ru_inblock++;
+ KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
bp->b_flags |= B_READ;
bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
if (bp->b_rcred == NOCRED) {
@@ -330,6 +431,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
VOP_STRATEGY(vp, bp);
++readwait;
}
+
for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
if (inmem(vp, *rablkno))
continue;
@@ -369,7 +471,6 @@ bwrite(struct buf * bp)
struct vnode *vp;
struct mount *mp;
-
if (bp->b_flags & B_INVAL) {
brelse(bp);
return (0);
@@ -381,16 +482,12 @@ bwrite(struct buf * bp)
if ((bp->b_flags & B_BUSY) == 0)
panic("bwrite: buffer is not busy???");
#endif
+ s = splbio();
+ bundirty(bp);
- bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
+ bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
bp->b_flags |= B_WRITEINPROG;
- s = splbio();
- if ((oldflags & B_DELWRI) == B_DELWRI) {
- --numdirtybuffers;
- reassignbuf(bp, bp->b_vp);
- }
-
bp->b_vp->v_numoutput++;
vfs_busy_pages(bp, 1);
if (curproc != NULL)
@@ -420,23 +517,8 @@ bwrite(struct buf * bp)
brelse(bp);
return (rtval);
}
- return (0);
-}
-void
-vfs_bio_need_satisfy(void) {
- ++numfreebuffers;
- if (!needsbuffer)
- return;
- if (numdirtybuffers < lodirtybuffers) {
- needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
- } else {
- needsbuffer &= ~VFS_BIO_NEED_ANY;
- }
- if (numfreebuffers >= hifreebuffers) {
- needsbuffer &= ~VFS_BIO_NEED_FREE;
- }
- wakeup(&needsbuffer);
+ return (0);
}
/*
@@ -457,12 +539,7 @@ bdwrite(struct buf * bp)
brelse(bp);
return;
}
- bp->b_flags &= ~(B_READ|B_RELBUF);
- if ((bp->b_flags & B_DELWRI) == 0) {
- bp->b_flags |= B_DONE | B_DELWRI;
- reassignbuf(bp, bp->b_vp);
- ++numdirtybuffers;
- }
+ bdirty(bp);
/*
* This bmap keeps the system from needing to do the bmap later,
@@ -506,32 +583,68 @@ bdwrite(struct buf * bp)
if (numdirtybuffers >= hidirtybuffers)
flushdirtybuffers(0, 0);
-
- return;
}
-
/*
- * Same as first half of bdwrite, mark buffer dirty, but do not release it.
- * Check how this compares with vfs_setdirty(); XXX [JRE]
+ * bdirty:
+ *
+ * Turn buffer into delayed write request. We must clear B_READ and
+ * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
+ * itself to properly update it in the dirty/clean lists. We mark it
+ * B_DONE to ensure that any asynchronization of the buffer properly
+ * clears B_DONE ( else a panic will occur later ). Note that B_INVALID
+ * buffers are not considered dirty even if B_DELWRI is set.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * Must be called at splbio().
+ * The buffer must be on QUEUE_NONE.
*/
void
bdirty(bp)
- struct buf *bp;
+ struct buf *bp;
{
-
- bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
+ KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
+ bp->b_flags &= ~(B_READ|B_RELBUF);
+
if ((bp->b_flags & B_DELWRI) == 0) {
- bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
+ bp->b_flags |= B_DONE | B_DELWRI;
reassignbuf(bp, bp->b_vp);
++numdirtybuffers;
}
}
/*
- * Asynchronous write.
- * Start output on a buffer, but do not wait for it to complete.
- * The buffer is released when the output completes.
+ * bundirty:
+ *
+ * Clear B_DELWRI for buffer.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * Must be called at splbio().
+ * The buffer must be on QUEUE_NONE.
+ */
+
+void
+bundirty(bp)
+ struct buf *bp;
+{
+ KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
+
+ if (bp->b_flags & B_DELWRI) {
+ bp->b_flags &= ~B_DELWRI;
+ reassignbuf(bp, bp->b_vp);
+ --numdirtybuffers;
+ }
+}
+
+/*
+ * bawrite:
+ *
+ * Asynchronous write. Start output on a buffer, but do not wait for
+ * it to complete. The buffer is released when the output completes.
*/
void
bawrite(struct buf * bp)
@@ -541,39 +654,42 @@ bawrite(struct buf * bp)
}
/*
- * Ordered write.
- * Start output on a buffer, and flag it so that the device will write
- * it in the order it was queued. The buffer is released when the output
- * completes.
+ * bowrite:
+ *
+ * Ordered write. Start output on a buffer, and flag it so that the
+ * device will write it in the order it was queued. The buffer is
+ * released when the output completes.
*/
int
bowrite(struct buf * bp)
{
- bp->b_flags |= B_ORDERED|B_ASYNC;
+ bp->b_flags |= B_ORDERED | B_ASYNC;
return (VOP_BWRITE(bp));
}
/*
- * Release a buffer.
+ * brelse:
+ *
+ * Release a busy buffer and, if requested, free its resources. The
+ * buffer will be stashed in the appropriate bufqueue[] allowing it
+ * to be accessed later as a cache entity or reused for other purposes.
*/
void
brelse(struct buf * bp)
{
int s;
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+
+#if 0
if (bp->b_flags & B_CLUSTER) {
relpbuf(bp, NULL);
return;
}
+#endif
s = splbio();
- /* anyone need this block? */
- if (bp->b_flags & B_WANTED) {
- bp->b_flags &= ~(B_WANTED | B_AGE);
- wakeup(bp);
- }
-
if (bp->b_flags & B_LOCKED)
bp->b_flags &= ~B_ERROR;
@@ -717,8 +833,8 @@ brelse(struct buf * bp)
if (bp->b_qindex != QUEUE_NONE)
panic("brelse: free buffer onto another queue???");
#endif
-
/* enqueue */
+
/* buffers with no memory */
if (bp->b_bufsize == 0) {
bp->b_flags |= B_INVAL;
@@ -728,7 +844,8 @@ brelse(struct buf * bp)
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
bp->b_dev = NODEV;
kvafreespace += bp->b_kvasize;
-
+ if (bp->b_kvasize)
+ kvaspacewakeup();
/* buffers with junk contents */
} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
bp->b_flags |= B_INVAL;
@@ -754,15 +871,38 @@ brelse(struct buf * bp)
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
}
- if ((bp->b_flags & B_INVAL) ||
- (bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
- if (bp->b_flags & B_DELWRI) {
- --numdirtybuffers;
- bp->b_flags &= ~B_DELWRI;
- }
- vfs_bio_need_satisfy();
+ /*
+ * If B_INVAL, clear B_DELWRI.
+ */
+ if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
+ bp->b_flags &= ~B_DELWRI;
+ --numdirtybuffers;
}
+ runningbufspace -= bp->b_bufsize;
+
+ /*
+ * Fixup numfreebuffers count. The bp is on an appropriate queue
+ * unless locked. We then bump numfreebuffers if it is not B_DELWRI.
+ * We've already handled the B_INVAL case ( B_DELWRI will be clear
+ * if B_INVAL is set ).
+ */
+
+ if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
+ bufcountwakeup();
+
+ /*
+ * Something we can maybe free.
+ */
+
+ if (bp->b_bufsize)
+ bufspacewakeup();
+
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~(B_WANTED | B_AGE);
+ wakeup(bp);
+ }
+
/* unlock */
bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
@@ -770,7 +910,8 @@ brelse(struct buf * bp)
}
/*
- * Release a buffer.
+ * Release a buffer back to the appropriate queue but do not try to free
+ * it.
*/
void
bqrelse(struct buf * bp)
@@ -779,17 +920,12 @@ bqrelse(struct buf * bp)
s = splbio();
- /* anyone need this block? */
- if (bp->b_flags & B_WANTED) {
- bp->b_flags &= ~(B_WANTED | B_AGE);
- wakeup(bp);
- }
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
#if !defined(MAX_PERF)
if (bp->b_qindex != QUEUE_NONE)
panic("bqrelse: free buffer onto another queue???");
#endif
-
if (bp->b_flags & B_LOCKED) {
bp->b_flags &= ~B_ERROR;
bp->b_qindex = QUEUE_LOCKED;
@@ -800,10 +936,26 @@ bqrelse(struct buf * bp)
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
}
- if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
- vfs_bio_need_satisfy();
+ runningbufspace -= bp->b_bufsize;
+
+ if ((bp->b_flags & B_LOCKED) == 0 &&
+ ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
+ ) {
+ bufcountwakeup();
}
+ /*
+ * Something we can maybe wakeup
+ */
+ if (bp->b_bufsize)
+ bufspacewakeup();
+
+ /* anyone need this block? */
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~(B_WANTED | B_AGE);
+ wakeup(bp);
+ }
+
/* unlock */
bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
@@ -847,10 +999,13 @@ vfs_vmio_release(bp)
}
}
}
- splx(s);
bufspace -= bp->b_bufsize;
vmiospace -= bp->b_bufsize;
+ runningbufspace -= bp->b_bufsize;
+ splx(s);
pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+ if (bp->b_bufsize)
+ bufspacewakeup();
bp->b_npages = 0;
bp->b_bufsize = 0;
bp->b_flags &= ~B_VMIO;
@@ -902,7 +1057,8 @@ vfs_bio_awrite(struct buf * bp)
s = splbio();
/*
- * right now we support clustered writing only to regular files
+ * right now we support clustered writing only to regular files, and
+ * then only if our I/O system is not saturated.
*/
if ((vp->v_type == VREG) &&
(vp->v_mount != 0) && /* Only on nodes that have the size info */
@@ -943,279 +1099,358 @@ vfs_bio_awrite(struct buf * bp)
*/
nwritten = bp->b_bufsize;
(void) VOP_BWRITE(bp);
+
return nwritten;
}
-
/*
- * Find a buffer header which is available for use.
+ * getnewbuf:
+ *
+ * Find and initialize a new buffer header, freeing up existing buffers
+ * in the bufqueues as necessary.
+ *
+ * We block if:
+ * We have insufficient buffer headers
+ * We have insufficient buffer space
+ * buffer_map is too fragmented ( space reservation fails )
+ *
+ * We do *not* attempt to flush dirty buffers more than one level deep.
+ * I.e., if P_FLSINPROG is set we do not flush dirty buffers at all.
+ *
+ * If P_FLSINPROG is set, we are allowed to dip into our emergency
+ * reserve.
*/
static struct buf *
getnewbuf(struct vnode *vp, daddr_t blkno,
int slpflag, int slptimeo, int size, int maxsize)
{
- struct buf *bp, *bp1;
- int nbyteswritten = 0;
- vm_offset_t addr;
- static int writerecursion = 0;
-
-start:
- if (bufspace >= maxbufspace)
- goto trytofreespace;
+ struct buf *bp;
+ struct buf *nbp;
+ int outofspace;
+ int nqindex;
+ int defrag = 0;
+ int countawrites = 0;
+
+restart:
- /* can we constitute a new buffer? */
- if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
-#if !defined(MAX_PERF)
- if (bp->b_qindex != QUEUE_EMPTY)
- panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
- bp->b_qindex);
-#endif
- bp->b_flags |= B_BUSY;
- bremfree(bp);
- goto fillbuf;
- }
-trytofreespace:
/*
- * We keep the file I/O from hogging metadata I/O
- * This is desirable because file data is cached in the
- * VM/Buffer cache even if a buffer is freed.
+ * Setup for scan. If we do not have enough free buffers,
+ * we setup a degenerate case that falls through the while.
+ *
+ * If we are in the middle of a flush, we can dip into the
+ * emergency reserve.
*/
- if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
-#if !defined(MAX_PERF)
- if (bp->b_qindex != QUEUE_AGE)
- panic("getnewbuf: inconsistent AGE queue, qindex=%d",
- bp->b_qindex);
-#endif
- } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
-#if !defined(MAX_PERF)
- if (bp->b_qindex != QUEUE_LRU)
- panic("getnewbuf: inconsistent LRU queue, qindex=%d",
- bp->b_qindex);
-#endif
- }
- if (!bp) {
- /* wait for a free buffer of any kind */
- needsbuffer |= VFS_BIO_NEED_ANY;
- do
- tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
- slptimeo);
- while (needsbuffer & VFS_BIO_NEED_ANY);
- return (0);
+
+ if ((curproc->p_flag & P_FLSINPROG) == 0 &&
+ numfreebuffers < lofreebuffers
+ ) {
+ nqindex = QUEUE_LRU;
+ nbp = NULL;
+ } else {
+ nqindex = QUEUE_EMPTY;
+ if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY])) == NULL) {
+ nqindex = QUEUE_AGE;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
+ if (nbp == NULL) {
+ nqindex = QUEUE_LRU;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
+ }
+ }
}
- KASSERT(!(bp->b_flags & B_BUSY),
- ("getnewbuf: busy buffer on free list\n"));
+
/*
- * We are fairly aggressive about freeing VMIO buffers, but since
- * the buffering is intact without buffer headers, there is not
- * much loss. We gain by maintaining non-VMIOed metadata in buffers.
+ * Calculate whether we are out of buffer space. This state is
+ * recalculated on every restart. If we are out of space, we
+ * have to turn off defragmentation. The outofspace code will
+ * defragment too, but the looping conditionals will be messed up
+ * if both outofspace and defrag are on.
*/
- if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
- if ((bp->b_flags & B_VMIO) == 0 ||
- (vmiospace < maxvmiobufspace)) {
- --bp->b_usecount;
- TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
- if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
- goto start;
- }
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+
+ outofspace = 0;
+ if (bufspace >= hibufspace) {
+ if ((curproc->p_flag & P_FLSINPROG) == 0 ||
+ bufspace >= maxbufspace
+ ) {
+ outofspace = 1;
+ defrag = 0;
}
}
+ /*
+ * defrag state is semi-persistent. 1 means we are flagged for
+ * defragging. -1 means we actually defragged something.
+ */
+ /* nop */
- /* if we are a delayed write, convert to an async write */
- if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
+ /*
+ * Run scan, possibly freeing data and/or kva mappings on the fly
+ * depending.
+ */
+ while ((bp = nbp) != NULL) {
+ int qindex = nqindex;
/*
- * If our delayed write is likely to be used soon, then
- * recycle back onto the LRU queue.
+ * Calculate next bp ( we can only use it if we do not block
+ * or do other fancy things ).
*/
- if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
- (bp->b_lblkno >= blkno) && (maxsize > 0)) {
+ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
+ switch(qindex) {
+ case QUEUE_EMPTY:
+ nqindex = QUEUE_AGE;
+ if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_AGE])))
+ break;
+ /* fall through */
+ case QUEUE_AGE:
+ nqindex = QUEUE_LRU;
+ if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_LRU])))
+ break;
+ /* fall through */
+ case QUEUE_LRU:
+ /*
+ * nbp is NULL.
+ */
+ break;
+ }
+ }
- if (bp->b_usecount > 0) {
- if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
+ /*
+ * Sanity Checks
+ */
+ KASSERT(!(bp->b_flags & B_BUSY), ("getnewbuf: busy buffer %p on free list", bp));
+ KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
- TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ /*
+ * Here we try to move NON VMIO buffers to the end of the
+ * LRU queue in order to make VMIO buffers more readily
+ * freeable. We also try to move buffers with a positive
+ * usecount to the end.
+ *
+ * Note that by moving the bp to the end, we setup a following
+ * loop. Since we continue to decrement b_usecount this
+ * is ok and, in fact, desirable.
+ *
+ * If we are at the end of the list, we move ourself to the
+ * same place and need to fixup nbp and nqindex to handle
+ * the following case.
+ */
- if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
- bp->b_usecount--;
- goto start;
- }
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ if ((qindex == QUEUE_LRU) && bp->b_usecount > 0) {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (vmiospace < maxvmiobufspace)
+ ) {
+ --bp->b_usecount;
+ TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ if (nbp == NULL) {
+ nqindex = qindex;
+ nbp = bp;
}
+ continue;
}
}
/*
- * Certain layered filesystems can recursively re-enter the vfs_bio
- * code, due to delayed writes. This helps keep the system from
- * deadlocking.
+ * If we come across a delayed write and numdirtybuffers should
+ * be flushed, try to write it out. Only if P_FLSINPROG is
+ * not set. We can't afford to recursively stack more than
+ * one deep due to the possibility of having deep VFS call
+ * stacks.
+ *
+ * Limit the number of dirty buffers we are willing to try
+ * to recover since it really isn't our job here.
*/
- if (writerecursion > 0) {
- if (writerecursion > 5) {
- bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
- while (bp) {
- if ((bp->b_flags & B_DELWRI) == 0)
- break;
- bp = TAILQ_NEXT(bp, b_freelist);
- }
- if (bp == NULL) {
- bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
- while (bp) {
- if ((bp->b_flags & B_DELWRI) == 0)
- break;
- bp = TAILQ_NEXT(bp, b_freelist);
- }
- }
- if (bp == NULL)
- panic("getnewbuf: cannot get buffer, infinite recursion failure");
- } else {
- bremfree(bp);
- bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
- nbyteswritten += bp->b_bufsize;
- ++writerecursion;
- VOP_BWRITE(bp);
- --writerecursion;
- if (!slpflag && !slptimeo) {
- return (0);
- }
- goto start;
+ if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
+ if ((curproc->p_flag & P_FLSINPROG) ||
+ numdirtybuffers < hidirtybuffers ||
+ countawrites > 16
+ ) {
+ continue;
}
- } else {
- ++writerecursion;
- nbyteswritten += vfs_bio_awrite(bp);
- --writerecursion;
- if (!slpflag && !slptimeo) {
- return (0);
+ curproc->p_flag |= P_FLSINPROG;
+ vfs_bio_awrite(bp);
+ curproc->p_flag &= ~P_FLSINPROG;
+ ++countawrites;
+ goto restart;
+ }
+
+ if (defrag > 0 && bp->b_kvasize == 0)
+ continue;
+ if (outofspace > 0 && bp->b_bufsize == 0)
+ continue;
+
+ /*
+ * Start freeing the bp. This is somewhat involved. nbp
+ * remains valid only for QUEUE_EMPTY bp's.
+ */
+
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+
+ if (qindex == QUEUE_LRU || qindex == QUEUE_AGE) {
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
}
- goto start;
+ if (bp->b_vp)
+ brelvp(bp);
}
- }
- if (bp->b_flags & B_WANTED) {
- bp->b_flags &= ~B_WANTED;
- wakeup(bp);
- }
- bremfree(bp);
- bp->b_flags |= B_BUSY;
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~B_WANTED;
+ wakeup(bp);
+ }
- if (bp->b_flags & B_VMIO) {
- bp->b_flags &= ~B_ASYNC;
- vfs_vmio_release(bp);
- }
+ /*
+ * NOTE: nbp is now entirely invalid. We can only restart
+ * the scan from this point on.
+ *
+ * Get the rest of the buffer freed up. b_kva* is still
+ * valid after this operation.
+ */
- if (bp->b_vp)
- brelvp(bp);
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
+ (*bioops.io_deallocate)(bp);
-fillbuf:
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
- /* we are not free, nor do we contain interesting data */
- if (bp->b_rcred != NOCRED) {
- crfree(bp->b_rcred);
- bp->b_rcred = NOCRED;
- }
- if (bp->b_wcred != NOCRED) {
- crfree(bp->b_wcred);
- bp->b_wcred = NOCRED;
- }
- if (LIST_FIRST(&bp->b_dep) != NULL &&
- bioops.io_deallocate)
- (*bioops.io_deallocate)(bp);
-
- LIST_REMOVE(bp, b_hash);
- LIST_INSERT_HEAD(&invalhash, bp, b_hash);
- if (bp->b_bufsize) {
- allocbuf(bp, 0);
- }
- bp->b_flags = B_BUSY;
- bp->b_dev = NODEV;
- bp->b_vp = NULL;
- bp->b_blkno = bp->b_lblkno = 0;
- bp->b_offset = NOOFFSET;
- bp->b_iodone = 0;
- bp->b_error = 0;
- bp->b_resid = 0;
- bp->b_bcount = 0;
- bp->b_npages = 0;
- bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_validoff = bp->b_validend = 0;
- bp->b_usecount = 5;
- /* Here, not kern_physio.c, is where this should be done*/
- LIST_INIT(&bp->b_dep);
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
- maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
+ bp->b_flags = B_BUSY;
+ bp->b_dev = NODEV;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_validoff = bp->b_validend = 0;
+ bp->b_usecount = 5;
+
+ LIST_INIT(&bp->b_dep);
- /*
- * we assume that buffer_map is not at address 0
- */
- addr = 0;
- if (maxsize != bp->b_kvasize) {
- bfreekva(bp);
-
-findkvaspace:
/*
- * See if we have buffer kva space
+ * Ok, now that we have a free buffer, if we are defragging
+ * we have to recover the kvaspace.
*/
- if (vm_map_findspace(buffer_map,
- vm_map_min(buffer_map), maxsize, &addr)) {
- if (kvafreespace > 0) {
- int totfree = 0, freed;
- do {
- freed = 0;
- for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
- bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
- if (bp1->b_kvasize != 0) {
- totfree += bp1->b_kvasize;
- freed = bp1->b_kvasize;
- bremfree(bp1);
- bfreekva(bp1);
- brelse(bp1);
- break;
- }
- }
- } while (freed);
- /*
- * if we found free space, then retry with the same buffer.
- */
- if (totfree)
- goto findkvaspace;
- }
+
+ if (defrag > 0) {
+ defrag = -1;
bp->b_flags |= B_INVAL;
+ bfreekva(bp);
brelse(bp);
- goto trytofreespace;
+ goto restart;
}
- }
- /*
- * See if we are below are allocated minimum
- */
- if (bufspace >= (maxbufspace + nbyteswritten)) {
- bp->b_flags |= B_INVAL;
- brelse(bp);
- goto trytofreespace;
+ if (outofspace > 0) {
+ outofspace = -1;
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
+ }
+
+ /*
+ * We are done
+ */
+ break;
}
/*
- * create a map entry for the buffer -- in essence
- * reserving the kva space.
+ * If we exhausted our list, sleep as appropriate.
*/
- if (addr) {
- vm_map_insert(buffer_map, NULL, 0,
- addr, addr + maxsize,
- VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
- bp->b_kvabase = (caddr_t) addr;
- bp->b_kvasize = maxsize;
+ if (bp == NULL) {
+ int flags;
+
+dosleep:
+ if (defrag > 0)
+ flags = VFS_BIO_NEED_KVASPACE;
+ else if (outofspace > 0)
+ flags = VFS_BIO_NEED_BUFSPACE;
+ else
+ flags = VFS_BIO_NEED_ANY;
+
+ if (rushjob < syncdelay / 2)
+ ++rushjob;
+ needsbuffer |= flags;
+ while (needsbuffer & flags) {
+ tsleep(
+ &needsbuffer,
+ (PRIBIO + 4) | slpflag,
+ "newbuf",
+ slptimeo
+ );
+ }
+ } else {
+ /*
+ * We finally have a valid bp. We aren't quite out of the
+ * woods, we still have to reserve kva space.
+ */
+ vm_offset_t addr = 0;
+
+ maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
+
+ if (maxsize != bp->b_kvasize) {
+ bfreekva(bp);
+
+ if (vm_map_findspace(buffer_map,
+ vm_map_min(buffer_map), maxsize, &addr)
+ ) {
+ /*
+ * Uh oh. Buffer map is too fragmented. Try
+ * to defragment.
+ */
+ if (defrag <= 0) {
+ defrag = 1;
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto restart;
+ }
+ /*
+ * Uh oh. We couldn't seem to defragment
+ */
+ bp = NULL;
+ goto dosleep;
+ }
+ }
+ if (addr) {
+ vm_map_insert(buffer_map, NULL, 0,
+ addr, addr + maxsize,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
+
+ bp->b_kvabase = (caddr_t) addr;
+ bp->b_kvasize = maxsize;
+ bp->b_data = bp->b_kvabase;
+ }
}
- bp->b_data = bp->b_kvabase;
return (bp);
}
+/*
+ * waitfreebuffers:
+ *
+ * Wait for sufficient free buffers. This routine is not called if
+ * curproc is the update process so we do not have to do anything
+ * fancy.
+ */
+
static void
-waitfreebuffers(int slpflag, int slptimeo) {
+waitfreebuffers(int slpflag, int slptimeo)
+{
while (numfreebuffers < hifreebuffers) {
flushdirtybuffers(slpflag, slptimeo);
if (numfreebuffers < hifreebuffers)
@@ -1226,48 +1461,80 @@ waitfreebuffers(int slpflag, int slptimeo) {
}
}
+/*
+ * flushdirtybuffers:
+ *
+ * This routine is called when we get too many dirty buffers.
+ *
+ * We have to protect ourselves from recursion, but we also do not want
+ * other process's flushdirtybuffers() to interfere with the syncer if
+ * it decides to flushdirtybuffers().
+ *
+ * In order to maximize operations, we allow any process to flush
+ * dirty buffers and use P_FLSINPROG to prevent recursion.
+ */
+
static void
-flushdirtybuffers(int slpflag, int slptimeo) {
+flushdirtybuffers(int slpflag, int slptimeo)
+{
int s;
- static pid_t flushing = 0;
s = splbio();
- if (flushing) {
- if (flushing == curproc->p_pid) {
- splx(s);
- return;
- }
- while (flushing) {
- if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
- splx(s);
- return;
- }
- }
+ if (curproc->p_flag & P_FLSINPROG) {
+ splx(s);
+ return;
}
- flushing = curproc->p_pid;
+ curproc->p_flag |= P_FLSINPROG;
while (numdirtybuffers > lodirtybuffers) {
- struct buf *bp;
- needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
- bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
- if (bp == NULL)
- bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
-
- while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
- bp = TAILQ_NEXT(bp, b_freelist);
+ if (flushbufqueues() == 0)
+ break;
+ }
+
+ curproc->p_flag &= ~P_FLSINPROG;
+
+ splx(s);
+}
+
+static int
+flushbufqueues(void)
+{
+ struct buf *bp;
+ int qindex;
+ int r = 0;
+
+ qindex = QUEUE_AGE;
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
+
+ for (;;) {
+ if (bp == NULL) {
+ if (qindex == QUEUE_LRU)
+ break;
+ qindex = QUEUE_LRU;
+ if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU])) == NULL)
+ break;
}
- if (bp) {
- vfs_bio_awrite(bp);
- continue;
+ /*
+ * XXX NFS does weird things with B_INVAL bps if we bwrite
+ * them ( vfs_bio_awrite/bawrite/bdwrite/etc ) Why?
+ *
+ */
+ if ((bp->b_flags & B_DELWRI) != 0) {
+ if (bp->b_flags & B_INVAL) {
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ brelse(bp);
+ } else {
+ vfs_bio_awrite(bp);
+ }
+ ++r;
+ break;
}
- break;
+ bp = TAILQ_NEXT(bp, b_freelist);
}
-
- flushing = 0;
- wakeup(&flushing);
- splx(s);
+ return(r);
}
/*
@@ -1335,21 +1602,29 @@ inmem(struct vnode * vp, daddr_t blkno)
* code, and used by the nfs read code.
*/
static void
-vfs_setdirty(struct buf *bp) {
+vfs_setdirty(struct buf *bp)
+{
int i;
vm_object_t object;
vm_offset_t boffset;
-#if 0
- vm_offset_t offset;
-#endif
/*
* We qualify the scan for modified pages on whether the
* object has been flushed yet. The OBJ_WRITEABLE flag
* is not cleared simply by protecting pages off.
*/
- if ((bp->b_flags & B_VMIO) &&
- ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
+
+ if ((bp->b_flags & B_VMIO) == 0)
+ return;
+
+ object = bp->b_pages[0]->object;
+
+ if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
+ printf("Warning: object %p writeable but not mightbedirty\n", object);
+ if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
+ printf("Warning: object %p mightbedirty but not writeable\n", object);
+
+ if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
/*
* test the pages to see if they have been modified directly
* by users through the VM system.
@@ -1410,7 +1685,15 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
s = splbio();
loop:
- if (numfreebuffers < lofreebuffers) {
+ /*
+ * Block if we are low on buffers. The syncer is allowed more
+ * buffers in order to avoid a deadlock.
+ */
+ if (curproc == updateproc && numfreebuffers == 0) {
+ needsbuffer |= VFS_BIO_NEED_ANY;
+ tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
+ slptimeo);
+ } else if (curproc != updateproc && numfreebuffers < lofreebuffers) {
waitfreebuffers(slpflag, slptimeo);
}
@@ -1655,6 +1938,9 @@ allocbuf(struct buf *bp, int size)
free(bp->b_data, M_BIOBUF);
bufspace -= bp->b_bufsize;
bufmallocspace -= bp->b_bufsize;
+ runningbufspace -= bp->b_bufsize;
+ if (bp->b_bufsize)
+ bufspacewakeup();
bp->b_data = bp->b_kvabase;
bp->b_bufsize = 0;
bp->b_bcount = 0;
@@ -1683,6 +1969,7 @@ allocbuf(struct buf *bp, int size)
bp->b_flags |= B_MALLOC;
bufspace += mbsize;
bufmallocspace += mbsize;
+ runningbufspace += bp->b_bufsize;
return 1;
}
#endif
@@ -1699,6 +1986,9 @@ allocbuf(struct buf *bp, int size)
bp->b_data = bp->b_kvabase;
bufspace -= bp->b_bufsize;
bufmallocspace -= bp->b_bufsize;
+ runningbufspace -= bp->b_bufsize;
+ if (bp->b_bufsize)
+ bufspacewakeup();
bp->b_bufsize = 0;
bp->b_flags &= ~B_MALLOC;
newbsize = round_page(newbsize);
@@ -1862,6 +2152,9 @@ allocbuf(struct buf *bp, int size)
if (bp->b_flags & B_VMIO)
vmiospace += (newbsize - bp->b_bufsize);
bufspace += (newbsize - bp->b_bufsize);
+ runningbufspace += (newbsize - bp->b_bufsize);
+ if (newbsize < bp->b_bufsize)
+ bufspacewakeup();
bp->b_bufsize = newbsize;
bp->b_bcount = size;
return 1;
@@ -1909,18 +2202,9 @@ biodone(register struct buf * bp)
s = splbio();
-#if !defined(MAX_PERF)
- if (!(bp->b_flags & B_BUSY))
- panic("biodone: buffer not busy");
-#endif
+ KASSERT((bp->b_flags & B_BUSY), ("biodone: bp %p not busy", bp));
+ KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
- if (bp->b_flags & B_DONE) {
- splx(s);
-#if !defined(MAX_PERF)
- printf("biodone: buffer already done\n");
-#endif
- return;
- }
bp->b_flags |= B_DONE;
if (bp->b_flags & B_FREEBUF) {
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 27e9167d83cf..f7bd95e2947e 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.78 1999/01/21 08:29:05 dillon Exp $
+ * $Id: vfs_cluster.c,v 1.79 1999/01/27 21:49:58 dillon Exp $
*/
#include "opt_debug_cluster.h"
@@ -778,8 +778,8 @@ cluster_wbuild(vp, size, start_lbn, len)
bp->b_bufsize += size;
s = splbio();
- --numdirtybuffers;
- tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
+ bundirty(tbp);
+ tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
tbp->b_flags |= B_ASYNC;
reassignbuf(tbp, tbp->b_vp); /* put on clean list */
++tbp->b_vp->v_numoutput;
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index d718a3f95d18..97edbdb430cf 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.187 1999/02/19 17:36:58 dillon Exp $
+ * $Id: vfs_subr.c,v 1.188 1999/02/25 05:22:29 dillon Exp $
*/
/*
@@ -901,7 +901,7 @@ vn_syncer_add_to_worklist(struct vnode *vp, int delay)
splx(s);
}
-static struct proc *updateproc;
+struct proc *updateproc;
static void sched_sync __P((void));
static const struct kproc_desc up_kp = {
"syncer",
@@ -937,11 +937,19 @@ sched_sync(void)
splx(s);
while ((vp = LIST_FIRST(slp)) != NULL) {
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
- (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
- VOP_UNLOCK(vp, 0, p);
+ if (VOP_ISLOCKED(vp) == 0) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+ VOP_UNLOCK(vp, 0, p);
+ }
s = splbio();
if (LIST_FIRST(slp) == vp) {
+ /*
+ * Note: v_tag VT_VFS vps can remain on the
+ * worklist too with no dirty blocks, but
+ * since sync_fsync() moves it to a different
+ * slot we are safe.
+ */
if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
vp->v_type != VBLK)
panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
@@ -1063,7 +1071,6 @@ reassignbuf(bp, newvp)
register struct vnode *newvp;
{
struct buflists *listheadp;
- struct vnode *oldvp;
int delay;
int s;
@@ -1086,14 +1093,16 @@ reassignbuf(bp, newvp)
* Delete from old vnode list, if on one.
*/
if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
- oldvp = bp->b_vp;
if (bp->b_xflags & B_VNDIRTY)
- listheadp = &oldvp->v_dirtyblkhd;
+ listheadp = &bp->b_vp->v_dirtyblkhd;
else
- listheadp = &oldvp->v_cleanblkhd;
+ listheadp = &bp->b_vp->v_cleanblkhd;
TAILQ_REMOVE(listheadp, bp, b_vnbufs);
bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
- vdrop(oldvp);
+ if (bp->b_vp != newvp) {
+ vdrop(bp->b_vp);
+ bp->b_vp = NULL; /* for clarification */
+ }
}
/*
* If dirty, put on list of dirty buffers; otherwise insert onto list
@@ -1145,8 +1154,10 @@ reassignbuf(bp, newvp)
LIST_REMOVE(newvp, v_synclist);
}
}
- bp->b_vp = newvp;
- vhold(bp->b_vp);
+ if (bp->b_vp != newvp) {
+ bp->b_vp = newvp;
+ vhold(bp->b_vp);
+ }
splx(s);
}
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index d718a3f95d18..97edbdb430cf 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.187 1999/02/19 17:36:58 dillon Exp $
+ * $Id: vfs_subr.c,v 1.188 1999/02/25 05:22:29 dillon Exp $
*/
/*
@@ -901,7 +901,7 @@ vn_syncer_add_to_worklist(struct vnode *vp, int delay)
splx(s);
}
-static struct proc *updateproc;
+struct proc *updateproc;
static void sched_sync __P((void));
static const struct kproc_desc up_kp = {
"syncer",
@@ -937,11 +937,19 @@ sched_sync(void)
splx(s);
while ((vp = LIST_FIRST(slp)) != NULL) {
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
- (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
- VOP_UNLOCK(vp, 0, p);
+ if (VOP_ISLOCKED(vp) == 0) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+ VOP_UNLOCK(vp, 0, p);
+ }
s = splbio();
if (LIST_FIRST(slp) == vp) {
+ /*
+ * Note: v_tag VT_VFS vps can remain on the
+ * worklist too with no dirty blocks, but
+ * since sync_fsync() moves it to a different
+ * slot we are safe.
+ */
if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
vp->v_type != VBLK)
panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
@@ -1063,7 +1071,6 @@ reassignbuf(bp, newvp)
register struct vnode *newvp;
{
struct buflists *listheadp;
- struct vnode *oldvp;
int delay;
int s;
@@ -1086,14 +1093,16 @@ reassignbuf(bp, newvp)
* Delete from old vnode list, if on one.
*/
if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
- oldvp = bp->b_vp;
if (bp->b_xflags & B_VNDIRTY)
- listheadp = &oldvp->v_dirtyblkhd;
+ listheadp = &bp->b_vp->v_dirtyblkhd;
else
- listheadp = &oldvp->v_cleanblkhd;
+ listheadp = &bp->b_vp->v_cleanblkhd;
TAILQ_REMOVE(listheadp, bp, b_vnbufs);
bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
- vdrop(oldvp);
+ if (bp->b_vp != newvp) {
+ vdrop(bp->b_vp);
+ bp->b_vp = NULL; /* for clarification */
+ }
}
/*
* If dirty, put on list of dirty buffers; otherwise insert onto list
@@ -1145,8 +1154,10 @@ reassignbuf(bp, newvp)
LIST_REMOVE(newvp, v_synclist);
}
}
- bp->b_vp = newvp;
- vhold(bp->b_vp);
+ if (bp->b_vp != newvp) {
+ bp->b_vp = newvp;
+ vhold(bp->b_vp);
+ }
splx(s);
}
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index fb437a530430..2fb535399d3a 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $
+ * $Id: nfs_bio.c,v 1.66 1999/01/21 08:29:07 dillon Exp $
*/
@@ -418,6 +418,7 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
+ rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -513,6 +514,7 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
+ bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -537,6 +539,7 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
+ bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -560,6 +563,7 @@ again:
return (EINTR);
if ((bp->b_flags & B_DONE) == 0) {
bp->b_flags |= B_READ;
+ bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error == 0 && (bp->b_flags & B_INVAL))
@@ -591,6 +595,7 @@ again:
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
+ rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -840,6 +845,12 @@ again:
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
+ /*
+ * To avoid code complexity, we may have to throw away
+ * previously valid ranges when merging the new dirty range
+ * into the valid range. As long as we do not *ADD* an
+ * invalid valid range, we are ok.
+ */
if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
bp->b_validoff > bp->b_dirtyend) {
bp->b_validoff = bp->b_dirtyoff;
@@ -1004,7 +1015,7 @@ nfs_asyncio(bp, cred)
if (nfs_numasync == 0)
return (EIO);
-
+
nmp = VFSTONFS(bp->b_vp->v_mount);
again:
if (nmp->nm_flag & NFSMNT_INT)
@@ -1109,12 +1120,12 @@ again:
*/
int
nfs_doio(bp, cr, p)
- register struct buf *bp;
+ struct buf *bp;
struct ucred *cr;
struct proc *p;
{
- register struct uio *uiop;
- register struct vnode *vp;
+ struct uio *uiop;
+ struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
int error = 0, diff, len, iomode, must_commit = 0;
@@ -1130,6 +1141,8 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
+ KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
+
/*
* Historically, paging was done with physio, but no more.
*/
@@ -1236,10 +1249,12 @@ nfs_doio(bp, cr, p)
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;
nfsstats.write_bios++;
+
if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
iomode = NFSV3WRITE_UNSTABLE;
else
iomode = NFSV3WRITE_FILESYNC;
+
bp->b_flags |= B_WRITEINPROG;
error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
@@ -1247,8 +1262,9 @@ nfs_doio(bp, cr, p)
if (bp->b_dirtyoff == 0
&& bp->b_dirtyend == bp->b_bufsize)
bp->b_flags |= B_CLUSTEROK;
- } else
+ } else {
bp->b_flags &= ~B_NEEDCOMMIT;
+ }
bp->b_flags &= ~B_WRITEINPROG;
/*
@@ -1265,31 +1281,30 @@ nfs_doio(bp, cr, p)
* the B_DELWRI and B_NEEDCOMMIT flags.
*
* If the buffer is marked B_PAGING, it does not reside on
- * the vp's paging queues so we do not ( and cannot ) reassign
- * it. XXX numdirtybuffers should be integrated into
- * reassignbuf() call.
+ * the vp's paging queues so we cannot call bdirty(). The
+ * bp in this case is not an NFS cache block so we should
+ * be safe. XXX
*/
if (error == EINTR
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
int s;
+ s = splbio();
bp->b_flags &= ~(B_INVAL|B_NOCACHE);
if ((bp->b_flags & B_PAGING) == 0) {
- ++numdirtybuffers;
- bp->b_flags |= B_DELWRI;
- s = splbio();
- reassignbuf(bp, vp);
- splx(s);
+ bdirty(bp);
+ bp->b_flags &= ~B_DONE;
}
if ((bp->b_flags & B_ASYNC) == 0)
bp->b_flags |= B_EINTR;
+ splx(s);
} else {
- if (error) {
- bp->b_flags |= B_ERROR;
- bp->b_error = np->n_error = error;
- np->n_flag |= NWRITEERR;
- }
- bp->b_dirtyoff = bp->b_dirtyend = 0;
+ if (error) {
+ bp->b_flags |= B_ERROR;
+ bp->b_error = np->n_error = error;
+ np->n_flag |= NWRITEERR;
+ }
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
}
} else {
bp->b_resid = 0;
@@ -1299,7 +1314,7 @@ nfs_doio(bp, cr, p)
}
bp->b_resid = uiop->uio_resid;
if (must_commit)
- nfs_clearcommit(vp->v_mount);
+ nfs_clearcommit(vp->v_mount);
biodone(bp);
return (error);
}
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index 4afb4697c56b..a92bb2295811 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.122 1999/02/13 09:47:30 dillon Exp $
+ * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
*/
@@ -2648,6 +2648,9 @@ nfs_strategy(ap)
struct proc *p;
int error = 0;
+ KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
+ KASSERT((bp->b_flags & B_BUSY), ("nfs_strategy: buffer %p not B_BUSY", bp));
+
if (bp->b_flags & B_PHYS)
panic("nfs physio");
@@ -2797,6 +2800,10 @@ again:
/*
* Work out if all buffers are using the same cred
* so we can deal with them all with one commit.
+ *
+ * NOTE: we are not clearing B_DONE here, so we have
+ * to do it later on in this routine if we intend to
+ * initiate I/O on the bp.
*/
if (wcred == NULL)
wcred = bp->b_wcred;
@@ -2804,6 +2811,14 @@ again:
wcred = NOCRED;
bp->b_flags |= (B_BUSY | B_WRITEINPROG);
vfs_busy_pages(bp, 1);
+
+ /*
+ * bp is protected by being B_BUSY, but nbp is not
+ * and vfs_busy_pages() may sleep. We have to
+ * recalculate nbp.
+ */
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+
/*
* A list of these buffers is kept so that the
* second loop knows which buffers have actually
@@ -2849,6 +2864,7 @@ again:
if (retv == NFSERR_STALEWRITEVERF)
nfs_clearcommit(vp->v_mount);
+
/*
* Now, either mark the blocks I/O done or mark the
* blocks dirty, depending on whether the commit
@@ -2858,23 +2874,27 @@ again:
bp = bvec[i];
bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG);
if (retv) {
- vfs_unbusy_pages(bp);
- brelse(bp);
+ /*
+ * Error, leave B_DELWRI intact
+ */
+ vfs_unbusy_pages(bp);
+ brelse(bp);
} else {
- s = splbio(); /* XXX check this positionning */
- vp->v_numoutput++;
- bp->b_flags |= B_ASYNC;
- if (bp->b_flags & B_DELWRI) {
- --numdirtybuffers;
- if (needsbuffer) {
- vfs_bio_need_satisfy();
- }
- }
- bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
- bp->b_dirtyoff = bp->b_dirtyend = 0;
- reassignbuf(bp, vp);
- splx(s);
- biodone(bp);
+ /*
+ * Success, remove B_DELWRI ( bundirty() ).
+ *
+ * b_dirtyoff/b_dirtyend seem to be NFS
+ * specific. We should probably move that
+ * into bundirty(). XXX
+ */
+ s = splbio();
+ vp->v_numoutput++;
+ bp->b_flags |= B_ASYNC;
+ bundirty(bp);
+ bp->b_flags &= ~(B_READ|B_DONE|B_ERROR);
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ splx(s);
+ biodone(bp);
}
}
}
@@ -2999,6 +3019,8 @@ nfs_print(ap)
/*
* Just call nfs_writebp() with the force argument set to 1.
+ *
+ * NOTE: B_DONE may or may not be set in a_bp on call.
*/
static int
nfs_bwrite(ap)
@@ -3020,26 +3042,24 @@ nfs_writebp(bp, force)
int force;
{
int s;
- register int oldflags = bp->b_flags, retv = 1;
+ int oldflags = bp->b_flags;
+ int retv = 1;
off_t off;
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
if (bp->b_flags & B_INVAL)
- bp->b_flags |= B_INVAL | B_NOCACHE;
+ bp->b_flags |= B_NOCACHE;
- if (bp->b_flags & B_DELWRI) {
- --numdirtybuffers;
- if (needsbuffer)
- vfs_bio_need_satisfy();
- }
- s = splbio(); /* XXX check if needed */
- bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
+ /*
+ * XXX we bundirty() the bp here. Shouldn't we do it later after
+ * the I/O has completed??
+ */
- if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
- reassignbuf(bp, bp->b_vp);
- }
+ s = splbio();
+ bundirty(bp);
+ bp->b_flags &= ~(B_READ|B_DONE|B_ERROR);
bp->b_vp->v_numoutput++;
curproc->p_stats->p_ru.ru_oublock++;
@@ -3061,8 +3081,9 @@ nfs_writebp(bp, force)
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_flags &= ~B_NEEDCOMMIT;
biodone(bp);
- } else if (retv == NFSERR_STALEWRITEVERF)
+ } else if (retv == NFSERR_STALEWRITEVERF) {
nfs_clearcommit(bp->b_vp->v_mount);
+ }
}
if (retv) {
if (force)
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index fb437a530430..2fb535399d3a 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $
+ * $Id: nfs_bio.c,v 1.66 1999/01/21 08:29:07 dillon Exp $
*/
@@ -418,6 +418,7 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
+ rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -513,6 +514,7 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
+ bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -537,6 +539,7 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
+ bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -560,6 +563,7 @@ again:
return (EINTR);
if ((bp->b_flags & B_DONE) == 0) {
bp->b_flags |= B_READ;
+ bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error == 0 && (bp->b_flags & B_INVAL))
@@ -591,6 +595,7 @@ again:
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
+ rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -840,6 +845,12 @@ again:
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
+ /*
+ * To avoid code complexity, we may have to throw away
+ * previously valid ranges when merging the new dirty range
+ * into the valid range. As long as we do not *ADD* an
+ * invalid valid range, we are ok.
+ */
if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
bp->b_validoff > bp->b_dirtyend) {
bp->b_validoff = bp->b_dirtyoff;
@@ -1004,7 +1015,7 @@ nfs_asyncio(bp, cred)
if (nfs_numasync == 0)
return (EIO);
-
+
nmp = VFSTONFS(bp->b_vp->v_mount);
again:
if (nmp->nm_flag & NFSMNT_INT)
@@ -1109,12 +1120,12 @@ again:
*/
int
nfs_doio(bp, cr, p)
- register struct buf *bp;
+ struct buf *bp;
struct ucred *cr;
struct proc *p;
{
- register struct uio *uiop;
- register struct vnode *vp;
+ struct uio *uiop;
+ struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
int error = 0, diff, len, iomode, must_commit = 0;
@@ -1130,6 +1141,8 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
+ KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
+
/*
* Historically, paging was done with physio, but no more.
*/
@@ -1236,10 +1249,12 @@ nfs_doio(bp, cr, p)
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;
nfsstats.write_bios++;
+
if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
iomode = NFSV3WRITE_UNSTABLE;
else
iomode = NFSV3WRITE_FILESYNC;
+
bp->b_flags |= B_WRITEINPROG;
error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
@@ -1247,8 +1262,9 @@ nfs_doio(bp, cr, p)
if (bp->b_dirtyoff == 0
&& bp->b_dirtyend == bp->b_bufsize)
bp->b_flags |= B_CLUSTEROK;
- } else
+ } else {
bp->b_flags &= ~B_NEEDCOMMIT;
+ }
bp->b_flags &= ~B_WRITEINPROG;
/*
@@ -1265,31 +1281,30 @@ nfs_doio(bp, cr, p)
* the B_DELWRI and B_NEEDCOMMIT flags.
*
* If the buffer is marked B_PAGING, it does not reside on
- * the vp's paging queues so we do not ( and cannot ) reassign
- * it. XXX numdirtybuffers should be integrated into
- * reassignbuf() call.
+ * the vp's paging queues so we cannot call bdirty(). The
+ * bp in this case is not an NFS cache block so we should
+ * be safe. XXX
*/
if (error == EINTR
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
int s;
+ s = splbio();
bp->b_flags &= ~(B_INVAL|B_NOCACHE);
if ((bp->b_flags & B_PAGING) == 0) {
- ++numdirtybuffers;
- bp->b_flags |= B_DELWRI;
- s = splbio();
- reassignbuf(bp, vp);
- splx(s);
+ bdirty(bp);
+ bp->b_flags &= ~B_DONE;
}
if ((bp->b_flags & B_ASYNC) == 0)
bp->b_flags |= B_EINTR;
+ splx(s);
} else {
- if (error) {
- bp->b_flags |= B_ERROR;
- bp->b_error = np->n_error = error;
- np->n_flag |= NWRITEERR;
- }
- bp->b_dirtyoff = bp->b_dirtyend = 0;
+ if (error) {
+ bp->b_flags |= B_ERROR;
+ bp->b_error = np->n_error = error;
+ np->n_flag |= NWRITEERR;
+ }
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
}
} else {
bp->b_resid = 0;
@@ -1299,7 +1314,7 @@ nfs_doio(bp, cr, p)
}
bp->b_resid = uiop->uio_resid;
if (must_commit)
- nfs_clearcommit(vp->v_mount);
+ nfs_clearcommit(vp->v_mount);
biodone(bp);
return (error);
}
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index 4afb4697c56b..a92bb2295811 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.122 1999/02/13 09:47:30 dillon Exp $
+ * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
*/
@@ -2648,6 +2648,9 @@ nfs_strategy(ap)
struct proc *p;
int error = 0;
+ KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
+ KASSERT((bp->b_flags & B_BUSY), ("nfs_strategy: buffer %p not B_BUSY", bp));
+
if (bp->b_flags & B_PHYS)
panic("nfs physio");
@@ -2797,6 +2800,10 @@ again:
/*
* Work out if all buffers are using the same cred
* so we can deal with them all with one commit.
+ *
+ * NOTE: we are not clearing B_DONE here, so we have
+ * to do it later on in this routine if we intend to
+ * initiate I/O on the bp.
*/
if (wcred == NULL)
wcred = bp->b_wcred;
@@ -2804,6 +2811,14 @@ again:
wcred = NOCRED;
bp->b_flags |= (B_BUSY | B_WRITEINPROG);
vfs_busy_pages(bp, 1);
+
+ /*
+ * bp is protected by being B_BUSY, but nbp is not
+ * and vfs_busy_pages() may sleep. We have to
+ * recalculate nbp.
+ */
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+
/*
* A list of these buffers is kept so that the
* second loop knows which buffers have actually
@@ -2849,6 +2864,7 @@ again:
if (retv == NFSERR_STALEWRITEVERF)
nfs_clearcommit(vp->v_mount);
+
/*
* Now, either mark the blocks I/O done or mark the
* blocks dirty, depending on whether the commit
@@ -2858,23 +2874,27 @@ again:
bp = bvec[i];
bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG);
if (retv) {
- vfs_unbusy_pages(bp);
- brelse(bp);
+ /*
+ * Error, leave B_DELWRI intact
+ */
+ vfs_unbusy_pages(bp);
+ brelse(bp);
} else {
- s = splbio(); /* XXX check this positionning */
- vp->v_numoutput++;
- bp->b_flags |= B_ASYNC;
- if (bp->b_flags & B_DELWRI) {
- --numdirtybuffers;
- if (needsbuffer) {
- vfs_bio_need_satisfy();
- }
- }
- bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
- bp->b_dirtyoff = bp->b_dirtyend = 0;
- reassignbuf(bp, vp);
- splx(s);
- biodone(bp);
+ /*
+ * Success, remove B_DELWRI ( bundirty() ).
+ *
+ * b_dirtyoff/b_dirtyend seem to be NFS
+ * specific. We should probably move that
+ * into bundirty(). XXX
+ */
+ s = splbio();
+ vp->v_numoutput++;
+ bp->b_flags |= B_ASYNC;
+ bundirty(bp);
+ bp->b_flags &= ~(B_READ|B_DONE|B_ERROR);
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ splx(s);
+ biodone(bp);
}
}
}
@@ -2999,6 +3019,8 @@ nfs_print(ap)
/*
* Just call nfs_writebp() with the force argument set to 1.
+ *
+ * NOTE: B_DONE may or may not be set in a_bp on call.
*/
static int
nfs_bwrite(ap)
@@ -3020,26 +3042,24 @@ nfs_writebp(bp, force)
int force;
{
int s;
- register int oldflags = bp->b_flags, retv = 1;
+ int oldflags = bp->b_flags;
+ int retv = 1;
off_t off;
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
if (bp->b_flags & B_INVAL)
- bp->b_flags |= B_INVAL | B_NOCACHE;
+ bp->b_flags |= B_NOCACHE;
- if (bp->b_flags & B_DELWRI) {
- --numdirtybuffers;
- if (needsbuffer)
- vfs_bio_need_satisfy();
- }
- s = splbio(); /* XXX check if needed */
- bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
+ /*
+ * XXX we bundirty() the bp here. Shouldn't we do it later after
+ * the I/O has completed??
+ */
- if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
- reassignbuf(bp, bp->b_vp);
- }
+ s = splbio();
+ bundirty(bp);
+ bp->b_flags &= ~(B_READ|B_DONE|B_ERROR);
bp->b_vp->v_numoutput++;
curproc->p_stats->p_ru.ru_oublock++;
@@ -3061,8 +3081,9 @@ nfs_writebp(bp, force)
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_flags &= ~B_NEEDCOMMIT;
biodone(bp);
- } else if (retv == NFSERR_STALEWRITEVERF)
+ } else if (retv == NFSERR_STALEWRITEVERF) {
nfs_clearcommit(bp->b_vp->v_mount);
+ }
}
if (retv) {
if (force)
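Both the commit-completion loop and nfs_writebp() above replace an open-coded "clear B_DELWRI, fix the accounting, reassignbuf()" sequence with a single bundirty() call, declared in sys/buf.h below. The implementation is again in the vfs_bio.c part of the commit; the sketch below is inferred from the two removed sequences and is not the real function.

/*
 * Illustrative sketch only -- inferred from the inline code removed
 * above.  The real bundirty() is in sys/kern/vfs_bio.c; waking threads
 * blocked on dirty-buffer pressure (the old needsbuffer /
 * vfs_bio_need_satisfy() path) is handled there as well.
 */
void
bundirty(struct buf *bp)
{
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags &= ~B_DELWRI;	/* no longer a delayed write */
		reassignbuf(bp, bp->b_vp);	/* back to the vnode's clean list */
		--numdirtybuffers;		/* dirty-buffer accounting */
	}
}

Note also the nbp recalculation added after vfs_busy_pages() in the commit loop: because that call may sleep, the cached TAILQ_NEXT pointer into the vnode's buffer list can go stale and has to be refetched before the walk continues.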
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index 5ce4039ce904..d2ce212b4d12 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.63 1999/01/21 13:41:12 peter Exp $
+ * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
*/
#ifndef _SYS_BUF_H_
@@ -127,6 +127,10 @@ struct buf {
struct vm_page *b_pages[btoc(MAXPHYS)];
int b_npages;
struct workhead b_dep; /* List of filesystem dependencies. */
+ struct chain_info { /* buffer chaining */
+ struct buf *parent;
+ int count;
+ } b_chain;
};
#define b_spc b_pager.pg_spc
@@ -184,12 +188,12 @@ struct buf {
#define B_RAM 0x10000000 /* Read ahead mark (flag) */
#define B_VMIO 0x20000000 /* VMIO flag */
#define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */
-#define B_AVAIL1 0x80000000 /* Available flag */
+#define B_AUTOCHAINDONE 0x80000000 /* Available flag */
-#define PRINT_BUF_FLAGS "\20\40avail1\37cluster\36vmio\35ram\34ordered" \
+#define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \
"\33paging\32xxx\31writeinprog\30wanted\27relbuf\26dirty" \
"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
- "\17locked\16inval\15avail2\14error\13eintr\12done\11freebuf" \
+ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \
"\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age"
/*
@@ -315,7 +319,6 @@ extern char *buffers; /* The buffer contents. */
extern int bufpages; /* Number of memory pages in the buffer pool. */
extern struct buf *swbuf; /* Swap I/O buffer headers. */
extern int nswbuf; /* Number of swap I/O buffer headers. */
-extern int needsbuffer, numdirtybuffers;
extern TAILQ_HEAD(swqueue, buf) bswlist;
extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
@@ -331,6 +334,7 @@ int bwrite __P((struct buf *));
void bdwrite __P((struct buf *));
void bawrite __P((struct buf *));
void bdirty __P((struct buf *));
+void bundirty __P((struct buf *));
int bowrite __P((struct buf *));
void brelse __P((struct buf *));
void bqrelse __P((struct buf *));
@@ -367,7 +371,6 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
-void vfs_bio_need_satisfy __P((void));
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 5ce4039ce904..d2ce212b4d12 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.63 1999/01/21 13:41:12 peter Exp $
+ * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
*/
#ifndef _SYS_BUF_H_
@@ -127,6 +127,10 @@ struct buf {
struct vm_page *b_pages[btoc(MAXPHYS)];
int b_npages;
struct workhead b_dep; /* List of filesystem dependencies. */
+ struct chain_info { /* buffer chaining */
+ struct buf *parent;
+ int count;
+ } b_chain;
};
#define b_spc b_pager.pg_spc
@@ -184,12 +188,12 @@ struct buf {
#define B_RAM 0x10000000 /* Read ahead mark (flag) */
#define B_VMIO 0x20000000 /* VMIO flag */
#define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */
-#define B_AVAIL1 0x80000000 /* Available flag */
+#define B_AUTOCHAINDONE 0x80000000 /* Available flag */
-#define PRINT_BUF_FLAGS "\20\40avail1\37cluster\36vmio\35ram\34ordered" \
+#define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \
"\33paging\32xxx\31writeinprog\30wanted\27relbuf\26dirty" \
"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
- "\17locked\16inval\15avail2\14error\13eintr\12done\11freebuf" \
+ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \
"\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age"
/*
@@ -315,7 +319,6 @@ extern char *buffers; /* The buffer contents. */
extern int bufpages; /* Number of memory pages in the buffer pool. */
extern struct buf *swbuf; /* Swap I/O buffer headers. */
extern int nswbuf; /* Number of swap I/O buffer headers. */
-extern int needsbuffer, numdirtybuffers;
extern TAILQ_HEAD(swqueue, buf) bswlist;
extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
@@ -331,6 +334,7 @@ int bwrite __P((struct buf *));
void bdwrite __P((struct buf *));
void bawrite __P((struct buf *));
void bdirty __P((struct buf *));
+void bundirty __P((struct buf *));
int bowrite __P((struct buf *));
void brelse __P((struct buf *));
void bqrelse __P((struct buf *));
@@ -367,7 +371,6 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
-void vfs_bio_need_satisfy __P((void));
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
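The new b_chain member (identical in sys/sys/bio.h and sys/sys/buf.h, which carry the same content) and the B_AUTOCHAINDONE flag suggest a parent/count completion idiom for chained buffers. The actual consumer is not in the hunks shown here; the following is a hypothetical illustration of that idiom only, and chain_child_done() is an invented name.

/*
 * Hypothetical illustration of a parent/count completion pattern --
 * NOT code from this commit.  A parent buffer split into children
 * would track outstanding children in b_chain.count; the last child
 * to finish completes the parent.
 */
static void
chain_child_done(struct buf *child)
{
	struct buf *parent = child->b_chain.parent;

	if (parent != NULL && --parent->b_chain.count == 0 &&
	    (parent->b_flags & B_AUTOCHAINDONE)) {
		parent->b_flags &= ~B_AUTOCHAINDONE;
		biodone(parent);	/* last child finishes the parent I/O */
	}
}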
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 6f51c57c4922..910760bf199c 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)proc.h 8.15 (Berkeley) 5/19/95
- * $Id: proc.h,v 1.73 1999/03/03 18:15:29 julian Exp $
+ * $Id: proc.h,v 1.74 1999/03/05 16:38:12 bde Exp $
*/
#ifndef _SYS_PROC_H_
@@ -262,10 +262,11 @@ struct proc {
#define P_SWAPINREQ 0x80000 /* Swapin request due to wakeup */
/* Marked a kernel thread */
+#define P_FLSINPROG 0x100000 /* dirty buffers flush is in progress */
#define P_KTHREADP 0x200000 /* Process is really a kernel thread */
#define P_NOCLDWAIT 0x400000 /* No zombies if child dies */
-
+#define P_DEADLKTREAT 0x800000 /* lock aquisition - deadlock treatment */
/*
* MOVE TO ucred.h?
@@ -336,7 +337,7 @@ extern struct timeval switchtime; /* Uptime at last context switch */
LIST_HEAD(proclist, proc);
extern struct proclist allproc; /* List of all processes. */
extern struct proclist zombproc; /* List of zombie processes. */
-extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
+extern struct proc *initproc, *pageproc, *updateproc; /* Process slots for init, pager. */
#define NQS 32 /* 32 run queues. */
extern struct prochd qs[];
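Finally, the proc.h hunks add P_FLSINPROG ("dirty buffers flush is in progress") and export updateproc, tying the update daemon into the new flushing machinery in vfs_bio.c. Below is a hypothetical illustration of how a flush-in-progress flag is typically used to prevent recursive flushing; flush_dirty_buffers() is an invented name, and the real logic lives in the vfs_bio.c portion of the commit.

/*
 * Hypothetical illustration only -- not code from this commit.  A
 * per-process flag like P_FLSINPROG lets the flushing path detect
 * that the current process is already writing buffers out, so a
 * nested request does not recurse or deadlock.
 */
static void
flush_dirty_buffers(struct proc *p)
{
	if (p->p_flag & P_FLSINPROG)
		return;				/* already flushing */
	p->p_flag |= P_FLSINPROG;
	/* ... write out dirty buffers here ... */
	p->p_flag &= ~P_FLSINPROG;
}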