Diffstat (limited to 'sys/vm/vm_pageout.c')
 sys/vm/vm_pageout.c | 269 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 198 insertions(+), 71 deletions(-)
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 606981f819e4..06f24d63479e 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -65,7 +65,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_pageout.c,v 1.128 1998/10/25 17:44:59 phk Exp $
+ * $Id: vm_pageout.c,v 1.129 1998/10/31 17:21:31 peter Exp $
*/
/*
@@ -211,13 +211,10 @@ void pmap_collect(void);
* Clean the page and remove it from the laundry.
*
* We set the busy bit to cause potential page faults on this page to
- * block.
- *
- * And we set pageout-in-progress to keep the object from disappearing
- * during pageout. This guarantees that the page won't move from the
- * inactive queue. (However, any other page on the inactive queue may
- * move!)
+ * block.  Note the careful timing, however: the busy bit isn't set until
+ * late, and until it is we cannot do anything that will mess with the page.
*/
+
static int
vm_pageout_clean(m)
vm_page_t m;
@@ -231,12 +228,23 @@ vm_pageout_clean(m)
object = m->object;
/*
+ * It doesn't cost us anything to page out OBJT_DEFAULT or OBJT_SWAP
+ * objects with the new swapper, but we could have serious problems
+ * paging out other object types if there is insufficient memory.
+ *
+ * Unfortunately, checking free memory here is far too late, so the
+ * check has been moved up a procedural level (see the object-type
+ * test added to vm_pageout_scan() below).
+ */
+
+#if 0
+ /*
* If not OBJT_SWAP, additional memory may be needed to do the pageout.
* Try to avoid the deadlock.
*/
if ((object->type == OBJT_DEFAULT) &&
((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min))
return 0;
+#endif
/*
* Don't mess with the page if it's busy.
@@ -245,12 +253,21 @@ vm_pageout_clean(m)
((m->busy != 0) || (m->flags & PG_BUSY)))
return 0;
+#if 0
+ /*
+ * XXX REMOVED XXX. vm_object_collapse() can block, which can
+ * change the page state. Calling vm_object_collapse() might also
+ * destroy or rename the page because we have not busied it yet!!!
+ * So this code segment is removed.
+ */
/*
- * Try collapsing before it's too late.
+ * Try collapsing before it's too late. XXX huh? Why are we doing
+ * this here?
*/
if (object->backing_object) {
vm_object_collapse(object);
}
+#endif
mc[vm_pageout_page_count] = m;
pageout_count = 1;
@@ -351,6 +368,16 @@ do_backward:
return vm_pageout_flush(&mc[page_base], pageout_count, 0);
}
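For orientation, here is a minimal sketch of the clustering idea that feeds
vm_pageout_flush() above. The real forward/do_backward: scans elided by this
hunk carry more guards (page_base handling, object bounds), so the variable
names and exact tests here are illustrative only:

	vm_page_t p;
	int pageout_count = 1;

	mc[0] = m;		/* the anchor page */
	while (pageout_count < vm_pageout_page_count &&
	    (p = vm_page_lookup(object, m->pindex + pageout_count)) != NULL &&
	    p->dirty != 0 && p->busy == 0 && (p->flags & PG_BUSY) == 0 &&
	    p->queue == PQ_INACTIVE) {
		/* extend the run with an adjacent dirty, idle page */
		mc[pageout_count++] = p;
	}
	return vm_pageout_flush(mc, pageout_count, 0);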
+/*
+ * vm_pageout_flush() - launder the given pages
+ *
+ * The given pages are laundered.  Note that we set up for the start of
+ * I/O ( i.e. busy the page ), mark it read-only, and bump the object
+ * reference count all in here rather than in the parent.  If we want
+ * the parent to do more sophisticated things we may have to change
+ * the ordering.
+ */
+
int
vm_pageout_flush(mc, count, flags)
vm_page_t *mc;
@@ -362,6 +389,14 @@ vm_pageout_flush(mc, count, flags)
int numpagedout = 0;
int i;
+ /*
+ * Initiate I/O. Bump the vm_page_t->busy counter and
+ * mark the pages read-only.
+ *
+ * We do not have to fix up the clean/dirty bits here... we can
+ * allow the pager to do it after the I/O completes.
+ */
+
for (i = 0; i < count; i++) {
vm_page_io_start(mc[i]);
vm_page_protect(mc[i], VM_PROT_READ);
@@ -585,25 +620,24 @@ vm_pageout_map_deactivate_pages(map, desired)
}
#endif
+/*
+ * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore
+ * to vnode deadlocks.  We only take the temporary object reference for
+ * OBJT_DEFAULT and OBJT_SWAP objects, which we know can be trivially
+ * freed; the reference keeps the object from being torn down while we
+ * free what may be its last resident page.
+ */
+
void
vm_pageout_page_free(vm_page_t m) {
- struct vnode *vp;
- vm_object_t object;
-
- object = m->object;
- object->ref_count++;
-
- if (object->type == OBJT_VNODE) {
- vp = object->handle;
- vp->v_usecount++;
- if (VSHOULDBUSY(vp))
- vbusy(vp);
- }
+ vm_object_t object = m->object;
+ int type = object->type;
+ if (type == OBJT_SWAP || type == OBJT_DEFAULT)
+ vm_object_reference(object);
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
- vm_object_deallocate(object);
+ if (type == OBJT_SWAP || type == OBJT_DEFAULT)
+ vm_object_deallocate(object);
}
/*
@@ -613,9 +647,10 @@ static int
vm_pageout_scan()
{
vm_page_t m, next;
- int page_shortage, addl_page_shortage, maxscan, pcount;
+ int page_shortage, maxscan, pcount;
+ int addl_page_shortage, addl_page_shortage_init;
int maxlaunder;
- int pages_freed;
+ int launder_loop = 0;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
@@ -629,31 +664,53 @@ vm_pageout_scan()
*/
pmap_collect();
- /*
- * Start scanning the inactive queue for pages we can free. We keep
- * scanning until we have enough free pages or we have scanned through
- * the entire queue. If we encounter dirty pages, we start cleaning
- * them.
- */
-
- pages_freed = 0;
- addl_page_shortage = vm_pageout_deficit;
+ addl_page_shortage_init = vm_pageout_deficit;
vm_pageout_deficit = 0;
if (max_page_launder == 0)
max_page_launder = 1;
- maxlaunder = (cnt.v_inactive_target > max_page_launder) ?
- max_page_launder : cnt.v_inactive_target;
-rescan0:
- maxscan = cnt.v_inactive_count;
- for( m = TAILQ_FIRST(&vm_page_queue_inactive);
+ /*
+ * Calculate the number of pages we want to either free or move
+ * to the cache.
+ */
+
+ page_shortage = (cnt.v_free_target + cnt.v_cache_min) -
+ (cnt.v_free_count + cnt.v_cache_count);
+ page_shortage += addl_page_shortage_init;
+
+ /*
+ * Figure out what to do with dirty pages when they are encountered.
+ * Assume that 1/3 of the pages on the inactive list are clean. If
+ * we think we can reach our target, disable laundering (do not
+ * clean any dirty pages). If we miss the target we will loop back
+ * up and do a laundering run.
+ */
- (m != NULL) && (maxscan-- > 0) &&
- ((cnt.v_cache_count + cnt.v_free_count) <
- (cnt.v_cache_min + cnt.v_free_target));
+ if (cnt.v_inactive_count / 3 > page_shortage) {
+ maxlaunder = 0;
+ launder_loop = 0;
+ } else {
+ maxlaunder =
+ (cnt.v_inactive_target > max_page_launder) ?
+ max_page_launder : cnt.v_inactive_target;
+ launder_loop = 1;
+ }
- m = next) {
+ /*
+ * Start scanning the inactive queue for pages we can move to the
+ * cache or free. The scan will stop when the target is reached or
+ * we have scanned the entire inactive queue.
+ */
+
+rescan0:
+ addl_page_shortage = addl_page_shortage_init;
+ maxscan = cnt.v_inactive_count;
+ for (
+ m = TAILQ_FIRST(&vm_page_queue_inactive);
+ m != NULL && maxscan-- > 0 && page_shortage > 0;
+ m = next
+ ) {
cnt.v_pdpages++;
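A worked example of the laundering heuristic above, with made-up counter
values:

	/*
	 * Suppose v_free_target = 1000, v_cache_min = 500,
	 * v_free_count = 600, v_cache_count = 400, and no carried-over
	 * deficit:
	 *
	 *	page_shortage = (1000 + 500) - (600 + 400) = 500
	 *
	 * With v_inactive_count = 3000 the clean-page estimate is
	 * 3000 / 3 = 1000 > 500, so the first pass runs with
	 * maxlaunder = 0 and only frees or caches clean pages.  A
	 * laundering pass happens only if the shortage survives the
	 * first scan.
	 */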
@@ -681,19 +738,21 @@ rescan0:
}
/*
- * If the object is not being used, we ignore previous references.
+ * If the object is not being used, we ignore previous
+ * references.
*/
if (m->object->ref_count == 0) {
vm_page_flag_clear(m, PG_REFERENCED);
pmap_clear_reference(VM_PAGE_TO_PHYS(m));
/*
- * Otherwise, if the page has been referenced while in the inactive
- * queue, we bump the "activation count" upwards, making it less
- * likely that the page will be added back to the inactive queue
- * prematurely again. Here we check the page tables (or emulated
- * bits, if any), given the upper level VM system not knowing anything
- * about existing references.
+ * Otherwise, if the page has been referenced while in the
+ * inactive queue, we bump the "activation count" upwards,
+ * making it less likely that the page will be added back to
+ * the inactive queue prematurely again. Here we check the
+ * page tables (or emulated bits, if any), since the upper
+ * level VM system knows nothing about existing
+ * references.
*/
} else if (((m->flags & PG_REFERENCED) == 0) &&
(actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m)))) {
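The body of this branch is elided by the hunk; presumably it reactivates the
page and credits the hardware-observed references, roughly:

			vm_page_activate(m);
			m->act_count += (actcount + ACT_ADVANCE);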
@@ -703,10 +762,10 @@ rescan0:
}
/*
- * If the upper level VM system knows about any page references,
- * we activate the page. We also set the "activation count" higher
- * than normal so that we will less likely place pages back onto the
- * inactive queue again.
+ * If the upper level VM system knows about any page
+ * references, we activate the page.  We also set the
+ * "activation count" higher than normal so that we are less
+ * likely to place pages back onto the inactive queue again.
*/
if ((m->flags & PG_REFERENCED) != 0) {
vm_page_flag_clear(m, PG_REFERENCED);
@@ -717,9 +776,10 @@ rescan0:
}
/*
- * If the upper level VM system doesn't know anything about the
- * page being dirty, we have to check for it again. As far as the
- * VM code knows, any partially dirty pages are fully dirty.
+ * If the upper level VM system doesn't know anything about
+ * the page being dirty, we have to check for it again. As
+ * far as the VM code knows, any partially dirty pages are
+ * fully dirty.
*/
if (m->dirty == 0) {
vm_page_test_dirty(m);
@@ -733,14 +793,14 @@ rescan0:
if (m->valid == 0) {
vm_pageout_page_free(m);
cnt.v_dfree++;
- pages_freed++;
+ --page_shortage;
/*
* Clean pages can be placed onto the cache queue.
*/
} else if (m->dirty == 0) {
vm_page_cache(m);
- pages_freed++;
+ --page_shortage;
/*
* Dirty pages need to be paged out. Note that we clean
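The dirty-page path is cut off at this hunk boundary. The gist, hedged, is
that laundering is attempted only while the maxlaunder budget computed
earlier lasts, along the lines of:

		} else if (maxlaunder > 0) {
			int written;

			/* ... the object and vnode checks shown in the
			   following hunks happen here ... */

			written = vm_pageout_clean(m);
			maxlaunder -= written;
		}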
@@ -763,8 +823,8 @@ rescan0:
}
/*
- * We don't bother paging objects that are "dead". Those
- * objects are in a "rundown" state.
+ * We don't bother paging objects that are "dead".
+ * Those objects are in a "rundown" state.
*/
if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
s = splvm();
@@ -774,10 +834,61 @@ rescan0:
continue;
}
- if ((object->type == OBJT_VNODE) &&
- (object->flags & OBJ_DEAD) == 0) {
+ /*
+ * For now we protect against potential memory
+ * deadlocks by requiring significant memory to be
+ * free if the object is not OBJT_DEFAULT or OBJT_SWAP.
+ * We do not 'trust' any other object type to operate
+ * with low memory, not even OBJT_DEVICE. The VM
+ * allocator will special-case allocations done by
+ * the pageout daemon so the check below actually
+ * does have some hysteresis in it. It isn't the best
+ * solution, though.
+ */
+
+ if (
+ object->type != OBJT_DEFAULT &&
+ object->type != OBJT_SWAP &&
+ cnt.v_free_count < cnt.v_free_reserved
+ ) {
+ s = splvm();
+ TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq);
+ TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
+ splx(s);
+ continue;
+ }
+
+ /*
+ * Presumably we have sufficient free memory to do
+ * the more sophisticated checks and locking required
+ * for vnodes.
+ *
+ * The object is already known NOT to be dead. The
+ * vget() may still block, though, because
+ * VOP_ISLOCKED() doesn't check to see if an inode
+ * (v_data) is associated with the vnode. If it isn't,
+ * vget() will load it in from disk.  Worse, vget()
+ * may actually get stuck waiting on "inode" if another
+ * process is already busy bringing the inode in.
+ * This is bad news for us either way.
+ *
+ * So for the moment we check v_data == NULL as a
+ * workaround. This means that vnodes which do not
+ * use v_data in the way we expect probably will not
+ * wind up being paged out by the pager and it will be
+ * up to the syncer to get them. That's better than
+ * us blocking here.
+ *
+ * This whole code section is bogus - we need to fix
+ * the vnode pager to handle vm_page_t's without us
+ * having to do any sophisticated VOP tests.
+ */
+
+ if (object->type == OBJT_VNODE) {
vp = object->handle;
+
if (VOP_ISLOCKED(vp) ||
+ vp->v_data == NULL ||
vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
if ((m->queue == PQ_INACTIVE) &&
(m->hold_count == 0) &&
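The rest of this failure branch is elided by the hunk; presumably it follows
the requeue idiom used earlier in this function, roughly:

			s = splvm();
			TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq);
			TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
			splx(s);
			continue;

i.e. rotate the page to the tail of the inactive queue and move on rather
than letting the daemon block on the vnode.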
@@ -844,19 +955,34 @@ rescan0:
}
/*
- * Compute the page shortage. If we are still very low on memory be
- * sure that we will move a minimal amount of pages from active to
- * inactive.
+ * If we still have a page shortage and we didn't launder anything,
+ * run the inactive scan again and launder something this time.
+ */
+
+ if (launder_loop == 0 && page_shortage > 0) {
+ launder_loop = 1;
+ maxlaunder =
+ (cnt.v_inactive_target > max_page_launder) ?
+ max_page_launder : cnt.v_inactive_target;
+ goto rescan0;
+ }
+
+ /*
+ * Compute the page shortage from the point of view of having to
+ * move pages from the active queue to the inactive queue.
*/
+
page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
page_shortage += addl_page_shortage;
- if (page_shortage <= 0) {
- page_shortage = 0;
- }
+
+ /*
+ * Scan the active queue for things we can deactivate
+ */
pcount = cnt.v_active_count;
m = TAILQ_FIRST(&vm_page_queue_active);
+
while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
/*
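The body of this while loop is elided by the diff. As a hedged sketch of the
deactivation logic (the real code also handles busy pages, wirings, and the
PG_REFERENCED flag), the per-page work is roughly:

		vm_page_t next = TAILQ_NEXT(m, pageq);
		int refs = pmap_ts_referenced(VM_PAGE_TO_PHYS(m));

		if (refs) {
			/* recently used: credit the page, keep it active */
			m->act_count += ACT_ADVANCE + refs;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
		} else if (m->act_count > 0) {
			/* idle: decay the activation count */
			m->act_count -= ACT_DECLINE;
		} else {
			/* no activity credit left: feed the inactive queue */
			vm_page_deactivate(m);
			--page_shortage;
		}
		m = next;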
@@ -943,10 +1069,14 @@ rescan0:
}
s = splvm();
+
/*
* We try to maintain some *really* free pages, this allows interrupt
- * code to be guaranteed space.
+ * code to be guaranteed space. Since both cache and free queues
+ * are considered basically 'free', moving pages from cache to free
+ * does not affect other calculations.
*/
+
while (cnt.v_free_count < cnt.v_free_reserved) {
static int cache_rover = 0;
m = vm_page_list_find(PQ_CACHE, cache_rover);
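The loop body is elided here; a plausible completion, assuming the
PQ_PRIME2/PQ_L2_MASK striding constants this era of vm_page.h provides:

		if (m == NULL)
			break;
		/* spread successive frees across the PQ_CACHE queues */
		cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK;
		vm_pageout_page_free(m);
		cnt.v_dfree++;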
@@ -995,7 +1125,6 @@ rescan0:
#endif
}
-
/*
* make sure that we have swap space -- if we are low on memory and
* swap -- then kill the biggest process.
@@ -1242,10 +1371,8 @@ vm_pageout()
cnt.v_pdwakeups++;
vm_pages_needed = 0;
splx(s);
- vm_pager_sync();
vm_pageout_scan();
vm_pageout_deficit = 0;
- vm_pager_sync();
wakeup(&cnt.v_free_count);
}
}