Diffstat (limited to 'sys/kern/vfs_cluster.c')
-rw-r--r--	sys/kern/vfs_cluster.c	651
1 files changed, 264 insertions, 387 deletions
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index e58dfc178d29..bca74820a785 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -1,6 +1,8 @@
 /*-
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
+ * Modifications/enhancements:
+ * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -31,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.6 1994/10/08 22:33:41 phk Exp $
+ * $Id: vfs_cluster.c,v 1.7 1994/12/18 03:05:49 davidg Exp $
  */
 
 #include <sys/param.h>
@@ -43,12 +45,15 @@
 #include <sys/trace.h>
 #include <sys/malloc.h>
 #include <sys/resourcevar.h>
+#include <sys/vmmeter.h>
+#include <miscfs/specfs/specdev.h>
 
 #ifdef DEBUG
 #include <vm/vm.h>
 #include <sys/sysctl.h>
 int doreallocblks = 0;
-struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
+struct ctldebug debug13 = {"doreallocblks", &doreallocblks};
+
 #else
 /* XXX for cluster_write */
 #define doreallocblks 0
@@ -57,14 +62,14 @@ struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
 /*
  * Local declarations
  */
-struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
-	    daddr_t, long, int));
 struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
-	    daddr_t, daddr_t, long, int, long));
-void cluster_wbuild __P((struct vnode *, struct buf *, long,
-	    daddr_t, int, daddr_t));
+	daddr_t, daddr_t, long, int, long));
+void cluster_wbuild __P((struct vnode *, struct buf *, long, daddr_t, int, daddr_t));
 struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
 
+int totreads;
+int totreadblocks;
+
 #ifdef DIAGNOSTIC
 /*
  * Set to 1 if reads of block zero should cause readahead to be done.
@@ -78,7 +83,8 @@ struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
  * blocks from the cache.  The former seems intuitive, but some quick tests
  * showed that the latter performed better from a system-wide point of view.
  */
-int	doclusterraz = 0;
+	int doclusterraz = 0;
+
 #define ISSEQREAD(vp, blk) \
 	(((blk) != 0 || doclusterraz) && \
 	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
@@ -92,17 +98,6 @@ int doclusterraz = 0;
  * lastr is 0, we assume this is the first read and we'll read up to two
  * blocks if they are sequential.  After that, we'll do regular read ahead
  * in clustered chunks.
- *
- * There are 4 or 5 cases depending on how you count:
- *	Desired block is in the cache:
- *	    1 Not sequential access (0 I/Os).
- *	    2 Access is sequential, do read-ahead (1 ASYNC).
- *	Desired block is not in cache:
- *	    3 Not sequential access (1 SYNC).
- *	    4 Sequential access, next block is contiguous (1 SYNC).
- *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
- *
- * There are potentially two buffers that require I/O.
  *	bp is the block requested.
  *	rbp is the read-ahead block.
  *	If either is NULL, then you don't have to do the I/O.
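
The ISSEQREAD() test above drives the whole read-ahead policy: an access is treated as sequential if it hits the block after the last one read, or the same block again (which catches files read sequentially in units smaller than the filesystem block). The following is a minimal userspace sketch of that policy and of the window adaptation introduced by this change; the ra_state struct, note_read() and the max_window cap are stand-ins invented for illustration, not kernel interfaces.

#include <stdio.h>

/* Hypothetical stand-in for the per-vnode read-ahead state (v_lastr, v_ralen). */
struct ra_state {
	long lastr;	/* last logical block read */
	long ralen;	/* current read-ahead window, in blocks */
};

/* Same idea as ISSEQREAD(): block 0 is never "sequential"; a repeat or +1 access is. */
static int
is_seq_read(const struct ra_state *ra, long blk)
{
	return (blk != 0 && (blk == ra->lastr + 1 || blk == ra->lastr));
}

/*
 * Model of the window adaptation in the reworked cluster_read(): halve the
 * window on a non-sequential access, grow it (up to a cap standing in for
 * MAXPHYS / block size) while sequential reads keep consuming the read-ahead.
 */
static void
note_read(struct ra_state *ra, long blk, long max_window)
{
	if (!is_seq_read(ra, blk)) {
		ra->ralen >>= 1;		/* back off */
	} else if (ra->ralen == 0) {
		ra->ralen = 1;			/* "if ralen is none, then try a little" */
	} else if (ra->ralen + 1 < max_window) {
		ra->ralen++;			/* still sequential: widen the window */
	}
	ra->lastr = blk;
}

int
main(void)
{
	struct ra_state ra = { 0, 0 };
	long blocks[] = { 1, 2, 3, 4, 10, 11, 12 };

	for (int i = 0; i < 7; i++) {
		note_read(&ra, blocks[i], 16);
		printf("read %ld -> window %ld\n", blocks[i], ra.ralen);
	}
	return 0;
}
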
@@ -117,156 +112,136 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp) struct buf **bpp; { struct buf *bp, *rbp; - daddr_t blkno, ioblkno; + daddr_t blkno, rablkno, origlblkno; long flags; int error, num_ra, alreadyincore; -#ifdef DIAGNOSTIC - if (size == 0) - panic("cluster_read: size = 0"); -#endif - + origlblkno = lblkno; error = 0; - flags = B_READ; + /* + * get the requested block + */ *bpp = bp = getblk(vp, lblkno, size, 0, 0); + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. + */ if (bp->b_flags & B_CACHE) { - /* - * Desired block is in cache; do any readahead ASYNC. - * Case 1, 2. - */ - trace(TR_BREADHIT, pack(vp, size), lblkno); - flags |= B_ASYNC; - ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1); - alreadyincore = (int)incore(vp, ioblkno); + int i; + + if (!ISSEQREAD(vp, origlblkno)) { + vp->v_ralen >>= 1; + return 0; + } bp = NULL; } else { - /* Block wasn't in cache, case 3, 4, 5. */ - trace(TR_BREADMISS, pack(vp, size), lblkno); + /* + * if it isn't in the cache, then get a chunk from disk if + * sequential, otherwise just get the block. + */ bp->b_flags |= B_READ; - ioblkno = lblkno; - alreadyincore = 0; - curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + lblkno += 1; + curproc->p_stats->p_ru.ru_inblock++; /* XXX */ } /* - * XXX - * Replace 1 with a window size based on some permutation of - * maxcontig and rot_delay. This will let you figure out how - * many blocks you should read-ahead (case 2, 4, 5). - * - * If the access isn't sequential, reset the window to 1. - * Note that a read to the same block is considered sequential. - * This catches the case where the file is being read sequentially, - * but at smaller than the filesystem block size. + * if ralen is "none", then try a little */ - rbp = NULL; - if (!ISSEQREAD(vp, lblkno)) { - vp->v_ralen = 0; - vp->v_maxra = lblkno; - } else if ((ioblkno + 1) * size <= filesize && !alreadyincore && - !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) && - blkno != -1) { + if (vp->v_ralen == 0) + vp->v_ralen = 1; + /* + * assume no read-ahead + */ + alreadyincore = 1; + rablkno = lblkno; + + /* + * if we have been doing sequential I/O, then do some read-ahead + */ + if (ISSEQREAD(vp, origlblkno)) { + int i; + /* - * Reading sequentially, and the next block is not in the - * cache. We are going to try reading ahead. + * this code makes sure that the stuff that we have read-ahead + * is still in the cache. If it isn't, we have been reading + * ahead too much, and we need to back-off, otherwise we might + * try to read more. */ - if (num_ra) { - /* - * If our desired readahead block had been read - * in a previous readahead but is no longer in - * core, then we may be reading ahead too far - * or are not using our readahead very rapidly. - * In this case we scale back the window. - */ - if (!alreadyincore && ioblkno <= vp->v_maxra) - vp->v_ralen = max(vp->v_ralen >> 1, 1); - /* - * There are more sequential blocks than our current - * window allows, scale up. Ideally we want to get - * in sync with the filesystem maxcontig value. - */ - else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr) - vp->v_ralen = vp->v_ralen ? 
- min(num_ra, vp->v_ralen << 1) : 1; - - if (num_ra > vp->v_ralen) - num_ra = vp->v_ralen; + for (i = 0; i < vp->v_ralen; i++) { + rablkno = lblkno + i; + alreadyincore = (int) incore(vp, rablkno); + if (!alreadyincore) { + if (rablkno < vp->v_maxra) { + vp->v_maxra = rablkno; + vp->v_ralen >>= 1; + alreadyincore = 1; + } else { + if (inmem(vp, rablkno)) + continue; + if ((vp->v_ralen + 1) < MAXPHYS / size) + vp->v_ralen++; + } + break; + } } + } + /* + * we now build the read-ahead buffer if it is desirable. + */ + rbp = NULL; + if (!alreadyincore && + (rablkno + 1) * size <= filesize && + !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra)) && + blkno != -1) { + if (num_ra > vp->v_ralen) + num_ra = vp->v_ralen; - if (num_ra) /* case 2, 4 */ + if (num_ra && + ((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_reserved)) { rbp = cluster_rbuild(vp, filesize, - bp, ioblkno, blkno, size, num_ra, flags); - else if (ioblkno == lblkno) { - bp->b_blkno = blkno; - /* Case 5: check how many blocks to read ahead */ - ++ioblkno; - if ((ioblkno + 1) * size > filesize || - incore(vp, ioblkno) || (error = VOP_BMAP(vp, - ioblkno, NULL, &blkno, &num_ra)) || blkno == -1) - goto skip_readahead; - /* - * Adjust readahead as above - */ - if (num_ra) { - if (!alreadyincore && ioblkno <= vp->v_maxra) - vp->v_ralen = max(vp->v_ralen >> 1, 1); - else if (num_ra > vp->v_ralen && - lblkno != vp->v_lastr) - vp->v_ralen = vp->v_ralen ? - min(num_ra,vp->v_ralen<<1) : 1; - if (num_ra > vp->v_ralen) - num_ra = vp->v_ralen; - } - flags |= B_ASYNC; - if (num_ra) - rbp = cluster_rbuild(vp, filesize, - NULL, ioblkno, blkno, size, num_ra, flags); - else { - rbp = getblk(vp, ioblkno, size, 0, 0); - rbp->b_flags |= flags; - rbp->b_blkno = blkno; - } + NULL, rablkno, blkno, size, num_ra, B_READ | B_ASYNC); } else { - /* case 2; read ahead single block */ - rbp = getblk(vp, ioblkno, size, 0, 0); - rbp->b_flags |= flags; + rbp = getblk(vp, rablkno, size, 0, 0); + rbp->b_flags |= B_READ | B_ASYNC; rbp->b_blkno = blkno; } - - if (rbp == bp) /* case 4 */ - rbp = NULL; - else if (rbp) { /* case 2, 5 */ - trace(TR_BREADMISSRA, - pack(vp, (num_ra + 1) * size), ioblkno); - curproc->p_stats->p_ru.ru_inblock++; /* XXX */ - } } - - /* XXX Kirk, do we need to make sure the bp has creds? */ skip_readahead: - if (bp) + /* + * if the synchronous read is a cluster, handle it, otherwise do a + * simple, non-clustered read. 
+ */ + if (bp) { if (bp->b_flags & (B_DONE | B_DELWRI)) panic("cluster_read: DONE bp"); - else + else { + vfs_busy_pages(bp, 0); error = VOP_STRATEGY(bp); - - if (rbp) - if (error || rbp->b_flags & (B_DONE | B_DELWRI)) { + vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; + totreads++; + totreadblocks += bp->b_bcount / size; + curproc->p_stats->p_ru.ru_inblock++; + } + } + /* + * and if we have read-aheads, do them too + */ + if (rbp) { + if (error || (rbp->b_flags & B_CACHE)) { rbp->b_flags &= ~(B_ASYNC | B_READ); brelse(rbp); - } else + } else { + vfs_busy_pages(rbp, 0); (void) VOP_STRATEGY(rbp); - - /* - * Recalculate our maximum readahead - */ - if (rbp == NULL) - rbp = bp; - if (rbp) - vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1; - + vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size; + totreads++; + totreadblocks += rbp->b_bcount / size; + curproc->p_stats->p_ru.ru_inblock++; + } + } if (bp) - return(biowait(bp)); - return(error); + return (biowait(bp)); + return (error); } /* @@ -288,12 +263,12 @@ cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) struct cluster_save *b_save; struct buf *tbp; daddr_t bn; - int i, inc; + int i, inc, j; #ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_rbuild: size %d != filesize %d\n", - size, vp->v_mount->mnt_stat.f_iosize); + size, vp->v_mount->mnt_stat.f_iosize); #endif if (size * (lbn + run + 1) > filesize) --run; @@ -303,97 +278,65 @@ cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) bp->b_blkno = blkno; bp->b_flags |= flags; } - return(bp); - } - - bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1); - if (bp->b_flags & (B_DONE | B_DELWRI)) return (bp); + } + tbp = bp; + if (!tbp) { + tbp = getblk(vp, lbn, size, 0, 0); + } + if (tbp->b_flags & B_CACHE) { + return (tbp); + } else if (bp == NULL) { + tbp->b_flags |= B_ASYNC; + } + bp = getpbuf(); + bp->b_flags = flags | B_CALL | B_BUSY | B_CLUSTER; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + pbgetvp(vp, bp); b_save = malloc(sizeof(struct buf *) * (run + 1) + sizeof(struct cluster_save), M_SEGMENT, M_WAITOK); - b_save->bs_bufsize = b_save->bs_bcount = size; b_save->bs_nchildren = 0; - b_save->bs_children = (struct buf **)(b_save + 1); - b_save->bs_saveaddr = bp->b_saveaddr; - bp->b_saveaddr = (caddr_t) b_save; + b_save->bs_children = (struct buf **) (b_save + 1); + bp->b_saveaddr = b_save; + + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + + if (tbp->b_flags & B_VMIO) + bp->b_flags |= B_VMIO; inc = btodb(size); - for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) { - if (incore(vp, lbn + i)) { - if (i == 1) { - bp->b_saveaddr = b_save->bs_saveaddr; - bp->b_flags &= ~B_CALL; - bp->b_iodone = NULL; - allocbuf(bp, size); - free(b_save, M_SEGMENT); - } else - allocbuf(bp, size * i); - break; - } - tbp = getblk(vp, lbn + i, 0, 0, 0); - /* - * getblk may return some memory in the buffer if there were - * no empty buffers to shed it to. If there is currently - * memory in the buffer, we move it down size bytes to make - * room for the valid pages that cluster_callback will insert. - * We do this now so we don't have to do it at interrupt time - * in the callback routine. 
- */ - if (tbp->b_bufsize != 0) { - caddr_t bdata = (char *)tbp->b_data; - - if (tbp->b_bufsize + size > MAXBSIZE) - panic("cluster_rbuild: too much memory"); - if (tbp->b_bufsize > size) { - /* - * XXX if the source and destination regions - * overlap we have to copy backward to avoid - * clobbering any valid pages (i.e. pagemove - * implementations typically can't handle - * overlap). - */ - bdata += tbp->b_bufsize; - while (bdata > (char *)tbp->b_data) { - bdata -= CLBYTES; - pagemove(bdata, bdata + size, CLBYTES); - } - } else - pagemove(bdata, bdata + size, tbp->b_bufsize); + for (bn = blkno, i = 0; i <= run; ++i, bn += inc) { + if (i != 0) { + if (inmem(vp, lbn + i)) { + break; + } + tbp = getblk(vp, lbn + i, size, 0, 0); + if ((tbp->b_flags & B_CACHE) || + (tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO)) { + brelse(tbp); + break; + } + tbp->b_blkno = bn; + tbp->b_flags |= flags | B_READ | B_ASYNC; + } else { + tbp->b_flags |= flags | B_READ; } - tbp->b_blkno = bn; - tbp->b_flags |= flags | B_READ | B_ASYNC; ++b_save->bs_nchildren; - b_save->bs_children[i - 1] = tbp; - } - return(bp); -} - -/* - * Either get a new buffer or grow the existing one. - */ -struct buf * -cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) - struct vnode *vp; - struct buf *bp; - long flags; - daddr_t blkno; - daddr_t lblkno; - long size; - int run; -{ - if (!bp) { - bp = getblk(vp, lblkno, size, 0, 0); - if (bp->b_flags & (B_DONE | B_DELWRI)) { - bp->b_blkno = blkno; - return(bp); + b_save->bs_children[i] = tbp; + for (j = 0; j < tbp->b_npages; j += 1) { + bp->b_pages[j + bp->b_npages] = tbp->b_pages[j]; } + bp->b_npages += tbp->b_npages; + bp->b_bcount += size; + bp->b_bufsize += size; } - allocbuf(bp, run * size); - bp->b_blkno = blkno; - bp->b_iodone = cluster_callback; - bp->b_flags |= flags | B_CALL; - return(bp); + pmap_qenter(bp->b_data, bp->b_pages, bp->b_npages); + return (bp); } /* @@ -408,7 +351,6 @@ cluster_callback(bp) { struct cluster_save *b_save; struct buf **bpp, *tbp; - long bsize; caddr_t cp; int error = 0; @@ -418,46 +360,22 @@ cluster_callback(bp) if (bp->b_flags & B_ERROR) error = bp->b_error; - b_save = (struct cluster_save *)(bp->b_saveaddr); - bp->b_saveaddr = b_save->bs_saveaddr; - - bsize = b_save->bs_bufsize; - cp = (char *)bp->b_data + bsize; + b_save = (struct cluster_save *) (bp->b_saveaddr); + pmap_qremove(bp->b_data, bp->b_npages); /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) { tbp = *bpp; - pagemove(cp, tbp->b_data, bsize); - tbp->b_bufsize += bsize; - tbp->b_bcount = bsize; if (error) { tbp->b_flags |= B_ERROR; tbp->b_error = error; } biodone(tbp); - bp->b_bufsize -= bsize; - cp += bsize; - } - /* - * If there was excess memory in the cluster buffer, - * slide it up adjacent to the remaining valid data. 
- */ - if (bp->b_bufsize != bsize) { - if (bp->b_bufsize < bsize) - panic("cluster_callback: too little memory"); - pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize); } - bp->b_bcount = bsize; - bp->b_iodone = NULL; free(b_save, M_SEGMENT); - if (bp->b_flags & B_ASYNC) - brelse(bp); - else { - bp->b_flags &= ~B_WANTED; - wakeup((caddr_t)bp); - } + relpbuf(bp); } /* @@ -472,78 +390,47 @@ cluster_callback(bp) */ void cluster_write(bp, filesize) - struct buf *bp; + struct buf *bp; u_quad_t filesize; { - struct vnode *vp; - daddr_t lbn; - int maxclen, cursize; + struct vnode *vp; + daddr_t lbn; + int maxclen, cursize; + int lblocksize; - vp = bp->b_vp; - lbn = bp->b_lblkno; + vp = bp->b_vp; + lblocksize = vp->v_mount->mnt_stat.f_iosize; + lbn = bp->b_lblkno; /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; - if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || - (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) { - maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1; + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = MAXPHYS / lblocksize; if (vp->v_clen != 0) { /* * Next block is not sequential. - * + * * If we are not writing at end of file, the process - * seeked to another point in the file since its - * last write, or we have reached our maximum - * cluster size, then push the previous cluster. - * Otherwise try reallocating to make it sequential. + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. */ cursize = vp->v_lastw - vp->v_cstart + 1; - if (!doreallocblks || - (lbn + 1) * bp->b_bcount != filesize || - lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { - cluster_wbuild(vp, NULL, bp->b_bcount, - vp->v_cstart, cursize, lbn); - } else { - struct buf **bpp, **endbp; - struct cluster_save *buflist; - - buflist = cluster_collectbufs(vp, bp); - endbp = &buflist->bs_children - [buflist->bs_nchildren - 1]; - if (VOP_REALLOCBLKS(vp, buflist)) { - /* - * Failed, push the previous cluster. - */ - for (bpp = buflist->bs_children; - bpp < endbp; bpp++) - brelse(*bpp); - free(buflist, M_SEGMENT); - cluster_wbuild(vp, NULL, bp->b_bcount, - vp->v_cstart, cursize, lbn); - } else { - /* - * Succeeded, keep building cluster. - */ - for (bpp = buflist->bs_children; - bpp <= endbp; bpp++) - bdwrite(*bpp); - free(buflist, M_SEGMENT); - vp->v_lastw = lbn; - vp->v_lasta = bp->b_blkno; - return; - } - } + cluster_wbuild(vp, NULL, lblocksize, + vp->v_cstart, cursize, lbn); } /* - * Consider beginning a cluster. - * If at end of file, make cluster as large as possible, - * otherwise find size of existing cluster. + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. 
*/ - if ((lbn + 1) * bp->b_bcount != filesize && + if ((lbn + 1) * lblocksize != filesize && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) || - bp->b_blkno == -1)) { + bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; @@ -551,13 +438,13 @@ cluster_write(bp, filesize) vp->v_lastw = lbn; return; } - vp->v_clen = maxclen; - if (maxclen == 0) { /* I/O not contiguous */ + vp->v_clen = maxclen; + if (maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; - bawrite(bp); - } else { /* Wait for rest of cluster */ + bawrite(bp); + } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; - bdwrite(bp); + bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* @@ -569,8 +456,7 @@ cluster_write(bp, filesize) vp->v_cstart = lbn + 1; } else /* - * In the middle of a cluster, so just delay the - * I/O for now. + * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); vp->v_lastw = lbn; @@ -591,17 +477,17 @@ cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) long size; daddr_t start_lbn; int len; - daddr_t lbn; + daddr_t lbn; { struct cluster_save *b_save; struct buf *bp, *tbp; - caddr_t cp; - int i, s; + caddr_t cp; + int i, j, s; #ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_wbuild: size %d != filesize %d\n", - size, vp->v_mount->mnt_stat.f_iosize); + size, vp->v_mount->mnt_stat.f_iosize); #endif redo: while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) { @@ -619,104 +505,95 @@ redo: } return; } - - bp = getblk(vp, start_lbn, size, 0, 0); - if (!(bp->b_flags & B_DELWRI)) { + tbp = getblk(vp, start_lbn, size, 0, 0); + if (!(tbp->b_flags & B_DELWRI)) { ++start_lbn; --len; - brelse(bp); + brelse(tbp); goto redo; } - /* - * Extra memory in the buffer, punt on this buffer. - * XXX we could handle this in most cases, but we would have to - * push the extra memory down to after our max possible cluster - * size and then potentially pull it back up if the cluster was - * terminated prematurely--too much hassle. + * Extra memory in the buffer, punt on this buffer. XXX we could + * handle this in most cases, but we would have to push the extra + * memory down to after our max possible cluster size and then + * potentially pull it back up if the cluster was terminated + * prematurely--too much hassle. */ - if (bp->b_bcount != bp->b_bufsize) { + if (tbp->b_bcount != tbp->b_bufsize) { ++start_lbn; --len; - bawrite(bp); + bawrite(tbp); goto redo; } - - --len; - b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save), + bp = getpbuf(); + b_save = malloc(sizeof(struct buf *) * (len + 1) + sizeof(struct cluster_save), M_SEGMENT, M_WAITOK); - b_save->bs_bcount = bp->b_bcount; - b_save->bs_bufsize = bp->b_bufsize; b_save->bs_nchildren = 0; - b_save->bs_children = (struct buf **)(b_save + 1); - b_save->bs_saveaddr = bp->b_saveaddr; - bp->b_saveaddr = (caddr_t) b_save; - - bp->b_flags |= B_CALL; + b_save->bs_children = (struct buf **) (b_save + 1); + bp->b_saveaddr = b_save; + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + + if (tbp->b_flags & B_VMIO) + bp->b_flags |= B_VMIO; + + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER; bp->b_iodone = cluster_callback; - cp = (char *)bp->b_data + size; - for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) { - /* - * Block is not in core or the non-sequential block - * ending our cluster was part of the cluster (in which - * case we don't want to write it twice). 
- */ - if (!incore(vp, start_lbn) || - (last_bp == NULL && start_lbn == lbn)) - break; + pbgetvp(vp, bp); - /* - * Get the desired block buffer (unless it is the final - * sequential block whose buffer was passed in explictly - * as last_bp). - */ - if (last_bp == NULL || start_lbn != lbn) { - tbp = getblk(vp, start_lbn, size, 0, 0); - if (!(tbp->b_flags & B_DELWRI)) { - brelse(tbp); + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { + /* + * Block is not in core or the non-sequential block + * ending our cluster was part of the cluster (in + * which case we don't want to write it twice). + */ + if (!(tbp = incore(vp, start_lbn)) || + (last_bp == NULL && start_lbn == lbn)) break; - } - } else - tbp = last_bp; - ++b_save->bs_nchildren; + if ((tbp->b_flags & (B_INVAL | B_BUSY | B_CLUSTEROK)) != B_CLUSTEROK) + break; - /* Move memory from children to parent */ - if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) { - printf("Clustered Block: %lu addr %lx bufsize: %ld\n", - (u_long)bp->b_lblkno, bp->b_blkno, bp->b_bufsize); - printf("Child Block: %lu addr: %lx\n", - (u_long)tbp->b_lblkno, tbp->b_blkno); - panic("Clustered write to wrong blocks"); + /* + * Get the desired block buffer (unless it is the + * final sequential block whose buffer was passed in + * explictly as last_bp). + */ + if (last_bp == NULL || start_lbn != lbn) { + tbp = getblk(vp, start_lbn, size, 0, 0); + if (!(tbp->b_flags & B_DELWRI) || + ((tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO))) { + brelse(tbp); + break; + } + } else + tbp = last_bp; } - - pagemove(tbp->b_data, cp, size); + for (j = 0; j < tbp->b_npages; j += 1) { + bp->b_pages[j + bp->b_npages] = tbp->b_pages[j]; + } + bp->b_npages += tbp->b_npages; bp->b_bcount += size; bp->b_bufsize += size; - tbp->b_bufsize -= size; tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); tbp->b_flags |= B_ASYNC; s = splbio(); - reassignbuf(tbp, tbp->b_vp); /* put on clean list */ + reassignbuf(tbp, tbp->b_vp); /* put on clean list */ ++tbp->b_vp->v_numoutput; splx(s); b_save->bs_children[i] = tbp; - - cp += size; - } - - if (i == 0) { - /* None to cluster */ - bp->b_saveaddr = b_save->bs_saveaddr; - bp->b_flags &= ~B_CALL; - bp->b_iodone = NULL; - free(b_save, M_SEGMENT); } + b_save->bs_nchildren = i; + pmap_qenter(bp->b_data, bp->b_pages, bp->b_npages); bawrite(bp); + if (i < len) { - len -= i + 1; - start_lbn += 1; + len -= i; goto redo; } } @@ -731,17 +608,17 @@ cluster_collectbufs(vp, last_bp) struct buf *last_bp; { struct cluster_save *buflist; - daddr_t lbn; + daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; - buflist->bs_children = (struct buf **)(buflist + 1); + buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) - (void)bread(vp, lbn, last_bp->b_bcount, NOCRED, - &buflist->bs_children[i]); + (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, + &buflist->bs_children[i]); buflist->bs_children[i] = last_bp; buflist->bs_nchildren = i + 1; return (buflist); |
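
Both cluster_rbuild() and the rewritten cluster_wbuild() above follow the same pattern: walk forward from a starting logical block, stop at the first block that breaks the cluster (not resident, not dirty, or with mismatched VMIO backing), and hand everything gathered so far to the driver as a single transfer. A rough userspace model of that gathering loop follows; the dirty[] array, can_cluster() predicate, MAX_RUN cap and the printed "I/O" are all stand-ins for the kernel's incore()/B_DELWRI checks and the pbuf that actually gets written.

#include <stdio.h>

#define MAX_RUN 8	/* stands in for MAXPHYS / block size */

/* Hypothetical per-block state: is this block eligible to join the cluster? */
static int
can_cluster(const int *dirty, int nblocks, int lbn)
{
	return (lbn < nblocks && dirty[lbn]);
}

/*
 * Model of the cluster_wbuild() outer loop: starting at start_lbn, gather as
 * many consecutive eligible blocks as allowed and "issue" them as one write,
 * then continue scanning after the run.
 */
static void
cluster_build(const int *dirty, int nblocks, int start_lbn, int len)
{
	while (len > 0) {
		if (!can_cluster(dirty, nblocks, start_lbn)) {
			start_lbn++;		/* skip an ineligible block */
			len--;
			continue;
		}
		int run = 0;
		while (run < len && run < MAX_RUN &&
		    can_cluster(dirty, nblocks, start_lbn + run))
			run++;
		printf("one I/O: blocks %d..%d (%d blocks)\n",
		    start_lbn, start_lbn + run - 1, run);
		start_lbn += run;
		len -= run;
	}
}

int
main(void)
{
	int dirty[] = { 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1 };

	cluster_build(dirty, 12, 0, 12);
	return 0;
}
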
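
The reworked cluster_callback() no longer shuffles memory around with pagemove(): the child buffers already share the cluster's pages, so completion just unmaps the cluster (pmap_qremove), propagates any error, and calls biodone() on each child before releasing the pbuf. A small model of that fan-out is sketched below; the mini_buf type and mini_biodone() are invented for illustration.

#include <stdio.h>

/* Hypothetical miniature buffer: an id, an error flag, nothing else. */
struct mini_buf {
	int id;
	int error;
};

static void
mini_biodone(struct mini_buf *bp)
{
	printf("block %d done%s\n", bp->id, bp->error ? " (error)" : "");
}

/*
 * Model of cluster_callback(): one cluster transfer has finished; push the
 * result (including any error) out to every component buffer.
 */
static void
cluster_done(struct mini_buf **children, int nchildren, int error)
{
	for (int i = 0; i < nchildren; i++) {
		if (error)
			children[i]->error = error;
		mini_biodone(children[i]);
	}
	/* the kernel code would free the bs_children list and relpbuf() here */
}

int
main(void)
{
	struct mini_buf a = { 10, 0 }, b = { 11, 0 }, c = { 12, 0 };
	struct mini_buf *kids[] = { &a, &b, &c };

	cluster_done(kids, 3, 0);
	return 0;
}
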
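
cluster_write() itself only decides what to do with each newly dirtied block: push the previous cluster when the block is not sequential with it, write immediately when the disk mapping is not contiguous, complete and push the cluster once it reaches its maximum length, and otherwise just delay the write. The sketch below is a compressed model of that decision tree, not the kernel routine: the wc_state struct stands in for the vnode fields v_cstart, v_clen and v_lastw, contiguity is passed in instead of being looked up with VOP_BMAP(), and the prints stand in for cluster_wbuild(), bawrite() and bdwrite().

#include <stdio.h>

/* Hypothetical stand-in for the vnode's write-cluster bookkeeping. */
struct wc_state {
	long cstart;	/* first block of the cluster being built (v_cstart) */
	long clen;	/* maximum cluster length minus one (v_clen) */
	long lastw;	/* last block written (v_lastw) */
};

/* Model of the cluster_write() decision for one dirtied block lbn. */
static void
write_decide(struct wc_state *wc, long lbn, int contiguous, long maxclen)
{
	if (wc->clen == 0 || lbn != wc->lastw + 1) {
		if (wc->clen != 0)
			printf("push blocks %ld..%ld\n", wc->cstart, wc->lastw);
		if (!contiguous) {
			printf("write %ld now\n", lbn);	/* bawrite() */
			wc->cstart = lbn + 1;
			wc->clen = 0;
		} else {
			printf("start cluster at %ld, delay\n", lbn);	/* bdwrite() */
			wc->cstart = lbn;
			wc->clen = maxclen;
		}
	} else if (lbn == wc->cstart + wc->clen) {
		printf("cluster full, push %ld..%ld\n", wc->cstart, lbn);
		wc->cstart = lbn + 1;
	} else {
		printf("delay %ld (inside cluster)\n", lbn);	/* bdwrite() */
	}
	wc->lastw = lbn;
}

int
main(void)
{
	struct wc_state wc = { 0, 0, -1 };

	for (long lbn = 0; lbn < 10; lbn++)
		write_decide(&wc, lbn, 1, 4);
	return 0;
}
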
