aboutsummaryrefslogtreecommitdiff
path: root/share/doc/papers
diff options
context:
space:
mode:
Diffstat (limited to 'share/doc/papers')
-rw-r--r--share/doc/papers/Makefile18
-rw-r--r--share/doc/papers/beyond4.3/Makefile6
-rw-r--r--share/doc/papers/beyond4.3/beyond43.ms512
-rw-r--r--share/doc/papers/bsdreferences.bib363
-rw-r--r--share/doc/papers/bufbio/Makefile12
-rw-r--r--share/doc/papers/bufbio/bio.ms828
-rw-r--r--share/doc/papers/bufbio/bufsize.eps479
-rw-r--r--share/doc/papers/contents/Makefile6
-rw-r--r--share/doc/papers/contents/contents.ms216
-rw-r--r--share/doc/papers/devfs/Makefile7
-rw-r--r--share/doc/papers/devfs/paper.me1276
-rw-r--r--share/doc/papers/diskperf/Makefile8
-rw-r--r--share/doc/papers/diskperf/abs.ms170
-rw-r--r--share/doc/papers/diskperf/appendix.ms96
-rw-r--r--share/doc/papers/diskperf/conclusions.ms121
-rw-r--r--share/doc/papers/diskperf/equip.ms171
-rw-r--r--share/doc/papers/diskperf/methodology.ms105
-rw-r--r--share/doc/papers/diskperf/motivation.ms87
-rw-r--r--share/doc/papers/diskperf/results.ms331
-rw-r--r--share/doc/papers/diskperf/tests.ms102
-rw-r--r--share/doc/papers/fsinterface/Makefile6
-rw-r--r--share/doc/papers/fsinterface/abstract.ms67
-rw-r--r--share/doc/papers/fsinterface/fsinterface.ms1169
-rw-r--r--share/doc/papers/fsinterface/slides.t312
-rw-r--r--share/doc/papers/jail/Makefile12
-rw-r--r--share/doc/papers/jail/future.ms102
-rw-r--r--share/doc/papers/jail/implementation.ms124
-rw-r--r--share/doc/papers/jail/jail01.eps234
-rw-r--r--share/doc/papers/jail/jail01.fig85
-rw-r--r--share/doc/papers/jail/mgt.ms214
-rw-r--r--share/doc/papers/jail/paper.ms436
-rw-r--r--share/doc/papers/kernmalloc/Makefile11
-rw-r--r--share/doc/papers/kernmalloc/alloc.fig109
-rw-r--r--share/doc/papers/kernmalloc/appendix.ms268
-rw-r--r--share/doc/papers/kernmalloc/appendix.t131
-rw-r--r--share/doc/papers/kernmalloc/kernmalloc.t646
-rw-r--r--share/doc/papers/kernmalloc/spell.ok57
-rw-r--r--share/doc/papers/kernmalloc/usage.tbl69
-rw-r--r--share/doc/papers/kerntune/0.t123
-rw-r--r--share/doc/papers/kerntune/1.t42
-rw-r--r--share/doc/papers/kerntune/2.t228
-rw-r--r--share/doc/papers/kerntune/3.t284
-rw-r--r--share/doc/papers/kerntune/4.t93
-rw-r--r--share/doc/papers/kerntune/Makefile11
-rw-r--r--share/doc/papers/kerntune/fig2.pic51
-rw-r--r--share/doc/papers/malloc/Makefile7
-rw-r--r--share/doc/papers/malloc/abs.ms33
-rw-r--r--share/doc/papers/malloc/alternatives.ms43
-rw-r--r--share/doc/papers/malloc/conclusion.ms46
-rw-r--r--share/doc/papers/malloc/implementation.ms223
-rw-r--r--share/doc/papers/malloc/intro.ms72
-rw-r--r--share/doc/papers/malloc/kernel.ms54
-rw-r--r--share/doc/papers/malloc/malloc.ms70
-rw-r--r--share/doc/papers/malloc/performance.ms111
-rw-r--r--share/doc/papers/malloc/problems.ms52
-rw-r--r--share/doc/papers/newvm/0.t80
-rw-r--r--share/doc/papers/newvm/1.t371
-rw-r--r--share/doc/papers/newvm/Makefile6
-rw-r--r--share/doc/papers/newvm/a.t233
-rw-r--r--share/doc/papers/newvm/spell.ok56
-rw-r--r--share/doc/papers/relengr/0.t85
-rw-r--r--share/doc/papers/relengr/1.t63
-rw-r--r--share/doc/papers/relengr/2.t140
-rw-r--r--share/doc/papers/relengr/3.t384
-rw-r--r--share/doc/papers/relengr/Makefile12
-rw-r--r--share/doc/papers/relengr/ref.bib26
-rw-r--r--share/doc/papers/relengr/spell.ok15
-rw-r--r--share/doc/papers/sysperf/0.t241
-rw-r--r--share/doc/papers/sysperf/1.t75
-rw-r--r--share/doc/papers/sysperf/2.t252
-rw-r--r--share/doc/papers/sysperf/3.t688
-rw-r--r--share/doc/papers/sysperf/4.t768
-rw-r--r--share/doc/papers/sysperf/5.t279
-rw-r--r--share/doc/papers/sysperf/6.t64
-rw-r--r--share/doc/papers/sysperf/7.t158
-rw-r--r--share/doc/papers/sysperf/Makefile9
-rw-r--r--share/doc/papers/sysperf/a1.t662
-rw-r--r--share/doc/papers/sysperf/a2.t111
-rw-r--r--share/doc/papers/sysperf/appendix.ms1026
-rw-r--r--share/doc/papers/timecounter/Makefile18
-rw-r--r--share/doc/papers/timecounter/fig1.eps227
-rw-r--r--share/doc/papers/timecounter/fig2.eps150
-rw-r--r--share/doc/papers/timecounter/fig3.eps126
-rw-r--r--share/doc/papers/timecounter/fig4.eps259
-rw-r--r--share/doc/papers/timecounter/fig5.eps211
-rw-r--r--share/doc/papers/timecounter/gps.ps1488
-rw-r--r--share/doc/papers/timecounter/intr.ps1501
-rw-r--r--share/doc/papers/timecounter/timecounter.ms1074
-rw-r--r--share/doc/papers/timecounter/tmac.usenix952
89 files changed, 22319 insertions, 0 deletions
diff --git a/share/doc/papers/Makefile b/share/doc/papers/Makefile
new file mode 100644
index 000000000000..3c702b2aeaaa
--- /dev/null
+++ b/share/doc/papers/Makefile
@@ -0,0 +1,18 @@
+SUBDIR= beyond4.3 \
+ bufbio \
+ contents \
+ devfs \
+ diskperf \
+ fsinterface \
+ jail \
+ kernmalloc \
+ kerntune \
+ malloc \
+ newvm \
+ relengr \
+ sysperf \
+ timecounter
+
+SUBDIR_PARALLEL=
+
+.include <bsd.subdir.mk>
diff --git a/share/doc/papers/beyond4.3/Makefile b/share/doc/papers/beyond4.3/Makefile
new file mode 100644
index 000000000000..acc2048503f8
--- /dev/null
+++ b/share/doc/papers/beyond4.3/Makefile
@@ -0,0 +1,6 @@
+VOLUME= papers
+DOC= beyond43
+SRCS= beyond43.ms
+MACROS= -ms
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/beyond4.3/beyond43.ms b/share/doc/papers/beyond4.3/beyond43.ms
new file mode 100644
index 000000000000..36c9115bb4e9
--- /dev/null
+++ b/share/doc/papers/beyond4.3/beyond43.ms
@@ -0,0 +1,512 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" *troff -ms
+.rm CM
+.sp 2
+.ce 100
+\fB\s+2Current Research by
+The Computer Systems Research Group
+of Berkeley\s-2\fP
+.ds DT "February 10, 1989
+.\" \fBDRAFT of \*(DT\fP
+.sp 2
+.nf
+Marshall Kirk McKusick
+Michael J Karels
+Keith Sklower
+Kevin Fall
+Marc Teitelbaum
+Keith Bostic
+.fi
+.sp 2
+.ce 1
+\fISummary\fP
+.ce 0
+.PP
+The release of 4.3BSD in April of 1986 addressed many of the
+performance problems and unfinished interfaces
+present in 4.2BSD [Leffler84] [McKusick85].
+The Computer Systems Research Group at Berkeley
+has now embarked on a new development phase to
+update other major components of the system, as well as to offer
+new functionality.
+There are five major ongoing projects.
+The first is to develop an OSI network protocol suite and to integrate
+existing ISO applications into Berkeley UNIX.
+The second is to develop and support an interface compliant with the
+P1003.1 POSIX standard recently approved by the IEEE.
+The third is to refine the TCP/IP networking to improve
+its performance and limit congestion on slow and/or lossy networks.
+The fourth is to provide a standard interface to file systems
+so that multiple local and remote file systems can be supported,
+much as multiple networking protocols are supported by 4.3BSD.
+The fifth is to evaluate alternate access control mechanisms and
+audit the existing security features of the system, particularly
+with respect to network services.
+Other areas of work include multi-architecture support,
+a general purpose kernel memory allocator, disk labels, and
+extensions to the 4.2BSD fast filesystem.
+.PP
+We are planning to finish implementation prototypes for each of the
+five main areas of work over the next year, and provide an informal
+test release sometime next year for interested developers.
+After incorporating feedback and refinements from the testers,
+they will appear in the next full Berkeley release, which is typically
+made about a year after the test release.
+.br
+.ne 10
+.sp 2
+.NH
+Recently Completed Projects
+.PP
+There have been several changes in the system that were included
+in the recent 4.3BSD Tahoe release.
+.NH 2
+Multi-architecture support
+.PP
+Support has been added for the DEC VAX 8600/8650, VAX 8200/8250,
+MicroVAXII and MicroVAXIII.
+.PP
+The largest change has been the incorporation of support for the first
+non-VAX processor, the CCI Power 6/32 and 6/32SX. (This addition also
+supports the
+Harris HCX-7 and HCX-9, as well as the Sperry 7000/40 and ICL machines.)
+The Power 6 version of 4.3BSD is largely based on the compilers and
+device drivers done for CCI's 4.2BSD UNIX,
+and is otherwise similar to the VAX release of 4.3BSD.
+The entire source tree, including all kernel and user-level sources,
+has been merged using a structure that will easily accommodate the addition
+of other processor families. A MIPS R2000 has been donated to us,
+making the MIPS architecture a likely candidate for inclusion into a future
+BSD release.
+.NH 2
+Kernel Memory Allocator
+.PP
+The 4.3BSD UNIX kernel used 10 different memory allocation mechanisms,
+each designed for the particular needs of the utilizing subsystem.
+These mechanisms have been replaced by a general purpose dynamic
+memory allocator that can be used by all of the kernel subsystems.
+The design of this allocator takes advantage of known memory usage
+patterns in the UNIX kernel and a hybrid strategy that is time-efficient
+for small allocations and space-efficient for large allocations.
+This allocator replaces the multiple memory allocation interfaces
+with a single easy-to-program interface,
+results in more efficient use of global memory by eliminating
+partitioned and specialized memory pools,
+and is quick enough (approximately 15 VAX instructions) that no
+performance loss is observed relative to the current implementations.
+[McKusick88].
+.NH 2
+Disk Labels
+.PP
+During the work on the CCI machine,
+it became obvious that disk geometry and filesystem layout information
+must be stored on each disk in a pack label.
+Disk labels were implemented for the CCI disks and for the most common
+types of disk controllers on the VAX.
+A utility was written to create and maintain the disk information,
+and other user-level programs that use such information now obtain
+it from the disk label.
+The use of this facility has allowed improvements in the file system's
+knowledge of irregular disk geometries such as track-to-track skew.
+.NH 2
+Fat Fast File System
+.PP
+The 4.2 fast file system [McKusick84]
+contained several statically sized structures,
+imposing limits on the number of cylinders per cylinder group,
+inodes per cylinder group,
+and number of distinguished rotational positions.
+The new ``fat'' filesystem allows these limits to be set at filesystem
+creation time.
+Old kernels will treat the new filesystems as read-only,
+and new kernels
+will accommodate both formats.
+The filesystem check facility, \fBfsck\fP, has also been modified to check
+either type.
+.br
+.ne 10
+.sp 2
+.NH
+Current UNIX Research at Berkeley
+.PP
+Since the release of 4.3BSD in mid 1986,
+we have begun work on several new major areas of research.
+Our goal is to apply leading edge research ideas into a stable
+and reliable implementation that solves current problems in
+operating systems development.
+.NH 2
+OSI network protocol development
+.PP
+The network architecture of 4.2BSD was designed to accommodate
+multiple network protocol families and address formats,
+and an implementation of the ISO OSI network protocols
+should enter into this framework without much difficulty.
+We plan to
+implement the OSI connectionless internet protocol (CLNP),
+and device drivers for X.25, 802.3, and possibly 802.5 interfaces, and
+to integrate these with an OSI transport class 4 (TP-4) implementation.
+We will also incorporate into the Berkeley Software Distribution an
+updated ISO Development Environment (ISODE)
+featuring International Standard (IS) versions of utilities.
+ISODE implements the session and presentation layers of the OSI protocol suite,
+and will include an implementation of the file transfer protocol (FTAM).
+It is also possible that an X.400 implementation now being done at
+University College, London and the University of Nottingham
+will be available for testing and distribution.
+.LP
+This implementation is comprised of four areas.
+.IP 1)
+We are updating the University of
+Wisconsin TP-4 to match GOSIP requirements.
+The University of Wisconsin developed a transport class 4
+implementation for the 4.2BSD kernel under contract to Mitre.
+This implementation must be updated to reflect the National Institute
+of Standards and Technology (NIST, formerly NBS) workshop agreements,
+GOSIP, and 4.3BSD requirements.
+We will make this TP-4 operate with an OSI IP,
+as the original implementation was built to run over the DoD IP.
+.IP 2)
+A kernel version of the OSI IP and ES-IS protocols must be produced.
+We will implement the kernel version of these protocols.
+.IP 3)
+The required device drivers need to be integrated into a BSD kernel.
+4.3BSD has existing device drivers for many Ethernet devices; future
+BSD versions may also support X.25 devices as well as token ring
+networks.
+These device drivers must be integrated
+into the kernel OSI protocol implementations.
+.IP 4)
+The existing OSINET interoperability test network is available so
+that the interoperability of the ISODE and BSD kernel protocols
+can be established through tests with several vendors.
+Testing is crucial because an openly available version of GOSIP protocols
+that does not interoperate with DEC, IBM, SUN, ICL, HIS, and other
+major vendors would be embarrassing.
+To allow testing of the integrated pieces the most desirable
+approach is to provide access to OSINET at UCB.
+A second approach is to do the interoperability testing at
+the site of an existing OSINET member, such as the NBS.
+.NH 2
+Compliance with POSIX 1003
+.PP
+Berkeley became involved several months ago in the development
+of the IEEE POSIX P1003.1 system interface standard.
+Since then, we have been participating in the working groups
+of P1003.2 (shell and application utility interface),
+P1003.6 (security), P1003.7 (system administration), and P1003.8
+(networking).
+.PP
+The IEEE published the POSIX P1003.1 standard in late 1988.
+POSIX related changes to the BSD system have included a new terminal
+driver, support for POSIX sessions and job control, expanded signal
+functionality, restructured directory access routines, and new set-user
+and set-group id facilities.
+We currently have a prototype implementation of the
+POSIX driver with extensions to provide binary compatibility with
+applications developed for the old Berkeley terminal driver.
+We also have a prototype implementation of the 4.2BSD-based POSIX
+job control facility.
+.PP
+The P1003.2 draft is currently being voted on by the IEEE
+P1003.2 balloting group.
+Berkeley is particularly interested in the results of this standard,
+as it will profoundly influence the user environment.
+The other groups are in comparatively early phases, with drafts
+coming to ballot sometime in the 90's.
+Berkeley will continue to participate in these groups, and
+move in the near future toward a P1003.1 and P1003.2 compliant
+system.
+We have many of the utilities outlined in the current P1003.2 draft
+already implemented, and have other parties willing to contribute
+additional implementations.
+.NH 2
+Improvements to the TCP/IP Networking Protocols
+.PP
+The Internet and the Berkeley collection of local-area networks
+have both grown at high rates in the last year.
+The Bay Area Regional Research Network (BARRNet),
+connecting several UC campuses, Stanford and NASA-Ames
+has recently become operational, increasing the complexity
+of the network connectivity.
+Both Internet and local routing algorithms are showing the strain
+of continued growth.
+We have made several changes in the local routing algorithm
+to keep accommodating the current topology,
+and are participating in the development of new routing algorithms
+and standard protocols.
+.PP
+Recent work in collaboration with Van Jacobson of the Lawrence Berkeley
+Laboratory has led to the design and implementation of several new algorithms
+for TCP that improve throughput on both local and long-haul networks
+while reducing unnecessary retransmission.
+The improvement is especially striking when connections must traverse
+slow and/or lossy networks.
+The new algorithms include ``slow-start,''
+a technique for opening the TCP flow control window slowly
+and using the returning stream of acknowledgements as a clock
+to drive the connection at the highest speed tolerated by the intervening
+network.
+A modification of this technique allows the sender to dynamically modify
+the send window size to adjust to changing network conditions.
+In addition, the round-trip timer has been modified to estimate the variance
+in round-trip time, thus allowing earlier retransmission of lost packets
+with less spurious retransmission due to increasing network delay.
+Along with a scheme proposed by Phil Karn of Bellcore,
+these changes reduce unnecessary retransmission over difficult paths
+such as Satnet by nearly two orders of magnitude
+while improving throughput dramatically.
+.PP
+The current TCP implementation is now being readied
+for more widespread distribution via the network and as a
+standard Berkeley distribution unencumbered by any commercial licensing.
+We are continuing to refine the TCP and IP implementations
+using the ARPANET, BARRNet, the NSF network
+and local campus nets as testbeds.
+In addition, we are incorporating applicable algorithms from this work
+into the TP-4 protocol implementation.
+.NH 2
+Toward a Compatible File System Interface
+.PP
+The most critical shortcoming of the 4.3BSD UNIX system was in the
+area of distributed file systems.
+As with networking protocols,
+there is no single distributed file system
+that provides sufficient speed and functionality for all problems.
+It is frequently necessary to support several different remote
+file system protocols, just as it is necessary to run several
+different network protocols.
+.PP
+As network or remote file systems have been implemented for UNIX,
+several stylized interfaces between the file system implementation
+and the rest of the kernel have been developed.
+Among these are Sun Microsystems' Virtual File System interface (VFS)
+using \fBvnodes\fP [Sandberg85] [Kleiman86],
+Digital Equipment's Generic File System (GFS) architecture [Rodriguez86],
+AT&T's File System Switch (FSS) [Rifkin86],
+the LOCUS distributed file system [Walker85],
+and Masscomp's extended file system [Cole85].
+Other remote file systems have been implemented in research or
+university groups for internal use,
+notably the network file system in the Eighth Edition UNIX
+system [Weinberger84] and two different file systems used at Carnegie Mellon
+University [Satyanarayanan85].
+Numerous other remote file access methods have been devised for use
+within individual UNIX processes,
+many of them by modifications to the C I/O library
+similar to those in the Newcastle Connection [Brownbridge82].
+.PP
+Each design attempts to isolate file system-dependent details
+below a generic interface and to provide a framework within which
+new file systems may be incorporated.
+However, each of these interfaces is different from
+and incompatible with the others.
+Each addresses somewhat different design goals,
+having been based on a different version of UNIX,
+having targeted a different set of file systems with varying characteristics,
+and having selected a different set of file system primitive operations.
+.PP
+Our effort in this area is aimed at providing a common framework to
+support these different distributed file systems simultaneously rather than to
+simply implement yet another protocol.
+This requires a detailed study of the existing protocols,
+and discussion with their implementors to determine whether
+they could modify their implementation to fit within our proposed
+framework. We have studied the various file system interfaces to determine
+their generality, completeness, robustness, efficiency, and aesthetics
+and are currently working on a file system interface
+that we believe includes the best features of
+each of the existing implementations.
+This work and the rationale underlying its development
+have been presented to major software vendors as an early step
+toward convergence on a standard compatible file system interface.
+Briefly, the proposal adopts the 4.3BSD calling convention for file
+name lookup but otherwise is closely related to Sun's VFS
+and DEC's GFS. [Karels86].
+.NH 2
+System Security
+.PP
+The recent invasion of the DARPA Internet by a quickly reproducing ``worm''
+highlighted the need for a thorough review of the access
+safeguards built into the system.
+Until now, we have taken a passive approach to dealing with
+weaknesses in the system access mechanisms, rather than actively
+searching for possible weaknesses.
+When we are notified of a problem or loophole in a system utility
+by one of our users,
+we have a well defined procedure for fixing the problem and
+expeditiously disseminating the fix to the BSD mailing list.
+This procedure has proven itself to be effective in
+solving known problems as they arise
+(witness its success in handling the recent worm).
+However, we feel that it would be useful to take a more active
+role in identifying problems before they are reported (or exploited).
+We will make a complete audit of the system
+utilities and network servers to find unintended system access mechanisms.
+.PP
+As a part of the work to make the system more resistant to attack
+from local users or via the network, it will be necessary to produce
+additional documentation on the configuration and operation of the system.
+This documentation will cover such topics as file and directory ownership
+and access, network and server configuration,
+and control of privileged operations such as file system backups.
+.PP
+We are investigating the addition of access control lists (ACLs) for
+filesystem objects.
+ACLs provide a much finer granularity of control over file access permissions
+than the current
+discretionary access control mechanism (mode bits).
+Furthermore, they are necessary
+in environments where C2 level security or better, as defined in the DoD
+TCSEC [DoD83], is required.
+The POSIX P1003.6 security group has made notable progress in determining
+how an ACL mechanism should work, and several vendors have implemented
+ACLs for their commercial systems.
+Berkeley will investigate the existing implementations and determine
+how to best integrate ACLs with the existing mechanism.
+.PP
+A major shortcoming of the present system is that authentication
+over the network is based solely on the privileged port mechanism
+between trusting hosts and users.
+Although privileged ports can only be created by processes running as root
+on a UNIX system,
+such processes are easy for a workstation user to obtain;
+they simply reboot their workstation in single user mode.
+Thus, a better authentication mechanism is needed.
+At present, we believe that the MIT Kerberos authentication
+server [Steiner88] provides the best solution to this problem.
+We propose to investigate Kerberos further as well as other
+authentication mechanisms and then to integrate
+the best one into Berkeley UNIX.
+Part of this integration would be the addition of the
+authentication mechanism into utilities such as
+telnet, login, remote shell, etc.
+We will add support for telnet (eventually replacing rlogin),
+the X window system, and the mail system within an authentication
+domain (a Kerberos \fIrealm\fP).
+We hope to replace the existing password authentication on each host
+with the network authentication system.
+.NH
+References
+.sp
+.IP Brownbridge82
+Brownbridge, D.R., L.F. Marshall, B. Randell,
+``The Newcastle Connection, or UNIXes of the World Unite!,''
+\fISoftware\- Practice and Experience\fP, Vol. 12, pp. 1147-1162, 1982.
+.sp
+.IP Cole85
+.br
+Cole, C.T., P.B. Flinn, A.B. Atlas,
+``An Implementation of an Extended File System for UNIX,''
+\fIUsenix Conference Proceedings\fP,
+pp. 131-150, June, 1985.
+.sp
+.IP DoD83
+.br
+Department of Defense,
+``Trusted Computer System Evaluation Criteria,''
+\fICSC-STD-001-83\fP,
+DoD Computer Security Center, August, 1983.
+.sp
+.IP Karels86
+Karels, M., M. McKusick,
+``Towards a Compatible File System Interface,''
+\fIProceedings of the European UNIX Users Group Meeting\fP,
+Manchester, England, pp. 481-496, September 1986.
+.sp
+.IP Kleiman86
+Kleiman, S.,
+``Vnodes: An Architecture for Multiple File System Types in Sun UNIX,''
+\fIUsenix Conference Proceedings\fP,
+pp. 238-247, June, 1986.
+.sp
+.IP Leffler84
+Leffler, S., M.K. McKusick, M. Karels,
+``Measuring and Improving the Performance of 4.2BSD,''
+\fIUsenix Conference Proceedings\fP, pp. 237-252, June, 1984.
+.sp
+.IP McKusick84
+McKusick, M.K., W. Joy, S. Leffler, R. Fabry,
+``A Fast File System for UNIX'',
+\fIACM Transactions on Computer Systems 2\fP, 3,
+pp. 181-197, August 1984.
+.sp
+.IP McKusick85
+McKusick, M.K., M. Karels, S. Leffler,
+``Performance Improvements and Functional Enhancements in 4.3BSD,''
+\fIUsenix Conference Proceedings\fP, pp. 519-531, June, 1985.
+.sp
+.IP McKusick86
+McKusick, M.K., M. Karels,
+``A New Virtual Memory Implementation for Berkeley UNIX,''
+\fIProceedings of the European UNIX Users Group Meeting\fP,
+Manchester, England, pp. 451-460, September 1986.
+.sp
+.IP McKusick88
+McKusick, M.K., M. Karels,
+``Design of a General Purpose Memory Allocator for the 4.3BSD UNIX Kernel,''
+\fIUsenix Conference Proceedings\fP,
+pp. 295-303, June, 1988.
+.sp
+.IP Rifkin86
+Rifkin, A.P., M.P. Forbes, R.L. Hamilton, M. Sabrio, S. Shah, K. Yueh,
+``RFS Architectural Overview,'' \fIUsenix Conference Proceedings\fP,
+pp. 248-259, June, 1986.
+.sp
+.IP Rodriguez86
+Rodriguez, R., M. Koehler, R. Hyde,
+``The Generic File System,''
+\fIUsenix Conference Proceedings\fP,
+pp. 260-269, June, 1986.
+.sp
+.IP Sandberg85
+Sandberg, R., D. Goldberg, S. Kleiman, D. Walsh, B. Lyon,
+``Design and Implementation of the Sun Network File System,''
+\fIUsenix Conference Proceedings\fP,
+pp. 119-130, June, 1985.
+.sp
+.IP Satyanarayanan85
+Satyanarayanan, M., \fIet al.\fP,
+``The ITC Distributed File System: Principles and Design,''
+\fIProc. 10th Symposium on Operating Systems Principles\fP, pp. 35-50,
+ACM, December, 1985.
+.sp
+.IP Steiner88
+Steiner, J., C. Newman, J. Schiller,
+``\fIKerberos:\fP An Authentication Service for Open Network Systems,''
+\fIUsenix Conference Proceedings\fP, pp. 191-202, February, 1988.
+.sp
+.IP Walker85
+Walker, B.J. and S.H. Kiser, ``The LOCUS Distributed File System,''
+\fIThe LOCUS Distributed System Architecture\fP,
+G.J. Popek and B.J. Walker, ed., The MIT Press, Cambridge, MA, 1985.
+.sp
+.IP Weinberger84
+Weinberger, P.J., ``The Version 8 Network File System,''
+\fIUsenix Conference presentation\fP,
+June, 1984.
diff --git a/share/doc/papers/bsdreferences.bib b/share/doc/papers/bsdreferences.bib
new file mode 100644
index 000000000000..2b12deda7ad0
--- /dev/null
+++ b/share/doc/papers/bsdreferences.bib
@@ -0,0 +1,363 @@
+@Comment{A Bibliography of papers that either use or extend FreeBSD.}
+@Comment{NOTE: Entries are alphabetical by primary key (author)}
+@article{Anderson2014a,
+author = {Anderson, J and Watson, Rnm and Chisnall, D and Gudka, K. and Marinos, I and Davis, B},
+file = {:Users/gnn/Documents/Mendeley Desktop/Anderson et al/Proceedings of the European Conference on Computer Systems/Anderson et al. - 2014 - TESLA temporally enhanced system logic assertions.pdf:pdf},
+isbn = {9781450327046},
+journal = {Proceedings of the European Conference on Computer Systems},
+mendeley-groups = {CADETS},
+title = {{TESLA: temporally enhanced system logic assertions}},
+url = {http://dl.acm.org/citation.cfm?id=2592801},
+year = {2014}
+}
+
+@article{Armitage:2003:MSE:956993.957010,
+ author = {Armitage, Grenville},
+ title = {Maximising Student Exposure to Networking Using FreeBSD Virtual Hosts},
+ journal = {SIGCOMM Comput. Commun. Rev.},
+ issue_date = {July 2003},
+ volume = {33},
+ number = {3},
+ month = jul,
+ year = {2003},
+ issn = {0146-4833},
+ pages = {137--143},
+ numpages = {7},
+ url = {http://doi.acm.org/10.1145/956993.957010},
+ doi = {10.1145/956993.957010},
+ acmid = {957010},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {FreeBSD, IP, Unix, networking, students, teaching, virtual hosts},
+}
+
+@article{Armitage:2008:IHI:1384609.1384613,
+ author = {Armitage, Grenville and Stewart, Lawrence and Welzl, Michael and Healy, James},
+ title = {An Independent H-TCP Implementation Under FreeBSD 7.0: Description and Observed Behaviour},
+ journal = {SIGCOMM Comput. Commun. Rev.},
+ issue_date = {July 2008},
+ volume = {38},
+ number = {3},
+ month = jul,
+ year = {2008},
+ issn = {0146-4833},
+ pages = {27--38},
+ numpages = {12},
+ url = {http://doi.acm.org/10.1145/1384609.1384613},
+ doi = {10.1145/1384609.1384613},
+ acmid = {1384613},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {FreeBSD, H-TCP, TCP, congestion control},
+}
+
+@inproceedings{Bless:2004:IFT:1161734.1162020,
+ author = {Bless, Roland and Doll, Mark},
+ title = {Integration of the FreeBSD TCP/IP-stack into the Discrete Event Simulator OMNet++},
+ booktitle = {Proceedings of the 36th Conference on Winter Simulation},
+ series = {WSC '04},
+ year = {2004},
+ isbn = {0-7803-8786-4},
+ location = {Washington, D.C.},
+ pages = {1556--1561},
+ numpages = {6},
+ url = {http://dl.acm.org/citation.cfm?id=1161734.1162020},
+ acmid = {1162020},
+ publisher = {Winter Simulation Conference},
+}
+
+@inproceedings{Canfora:2011:SIA:1985441.1985463,
+ author = {Canfora, Gerardo and Cerulo, Luigi and Cimitile, Marta and Di Penta, Massimiliano},
+ title = {Social Interactions Around Cross-system Bug Fixings: The Case of FreeBSD and OpenBSD},
+ booktitle = {Proceedings of the 8th Working Conference on Mining Software Repositories},
+ series = {MSR '11},
+ year = {2011},
+ isbn = {978-1-4503-0574-7},
+ location = {Waikiki, Honolulu, HI, USA},
+ pages = {143--152},
+ numpages = {10},
+ url = {http://doi.acm.org/10.1145/1985441.1985463},
+ doi = {10.1145/1985441.1985463},
+ acmid = {1985463},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {bug fixing, code migration, empirical study, social network analysis},
+}
+
+
+@inproceedings{Chang:2008:ESC:1370750.1370766,
+ author = {Chang, Hung-Fu and Mockus, Audris},
+ title = {Evaluation of Source Code Copy Detection Methods on FreeBSD},
+ booktitle = {Proceedings of the 2008 International Working Conference on Mining Software Repositories},
+ series = {MSR '08},
+ year = {2008},
+ isbn = {978-1-60558-024-1},
+ location = {Leipzig, Germany},
+ pages = {61--66},
+ numpages = {6},
+ url = {http://doi.acm.org/10.1145/1370750.1370766},
+ doi = {10.1145/1370750.1370766},
+ acmid = {1370766},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {clone detection, cloning, code copying, open source, version control},
+}
+
+@article{Chisnall:2015:BPA:2786763.2694367,
+ author = {Chisnall, David and Rothwell, Colin and Watson, Robert N.M. and Woodruff, Jonathan and Vadera, Munraj and Moore, Simon W. and Roe, Michael and Davis, Brooks and Neumann, Peter G.},
+ title = {Beyond the PDP-11: Architectural Support for a Memory-Safe C Abstract Machine},
+ journal = {SIGARCH Comput. Archit. News},
+ issue_date = {March 2015},
+ volume = {43},
+ number = {1},
+ month = mar,
+ year = {2015},
+ issn = {0163-5964},
+ pages = {117--130},
+ numpages = {14},
+ url = {http://doi.acm.org/10.1145/2786763.2694367},
+ doi = {10.1145/2786763.2694367},
+ acmid = {2694367},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {C language, bounds checking, capabilities, compilers, memory protection, memory safety, processor design, security},
+}
+
+@article{Chisnall:2015:BPA:2775054.2694367,
+ author = {Chisnall, David and Rothwell, Colin and Watson, Robert N.M. and Woodruff, Jonathan and Vadera, Munraj and Moore, Simon W. and Roe, Michael and Davis, Brooks and Neumann, Peter G.},
+ title = {Beyond the PDP-11: Architectural Support for a Memory-Safe C Abstract Machine},
+ journal = {SIGPLAN Not.},
+ issue_date = {April 2015},
+ volume = {50},
+ number = {4},
+ month = mar,
+ year = {2015},
+ issn = {0362-1340},
+ pages = {117--130},
+ numpages = {14},
+ url = {http://doi.acm.org/10.1145/2775054.2694367},
+ doi = {10.1145/2775054.2694367},
+ acmid = {2694367},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {C language, bounds checking, capabilities, compilers, memory protection, memory safety, processor design, security},
+}
+
+@inproceedings{Chisnall:2015:BPA:2694344.2694367,
+ author = {Chisnall, David and Rothwell, Colin and Watson, Robert N.M. and Woodruff, Jonathan and Vadera, Munraj and Moore, Simon W. and Roe, Michael and Davis, Brooks and Neumann, Peter G.},
+ title = {Beyond the PDP-11: Architectural Support for a Memory-Safe C Abstract Machine},
+ booktitle = {Proceedings of the Twentieth International Conference on Architectural Support for Programming Languages and Operating Systems},
+ series = {ASPLOS '15},
+ year = {2015},
+ isbn = {978-1-4503-2835-7},
+ location = {Istanbul, Turkey},
+ pages = {117--130},
+ numpages = {14},
+ url = {http://doi.acm.org/10.1145/2694344.2694367},
+ doi = {10.1145/2694344.2694367},
+ acmid = {2694367},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {C language, bounds checking, capabilities, compilers, memory protection, memory safety, processor design, security},
+}
+
+@article{Frenger:2000:FFB:360271.360272,
+ author = {Frenger, Paul},
+ title = {Forth and the FreeBSD Bootloader},
+ journal = {SIGPLAN Not.},
+ issue_date = {Aug., 2000},
+ volume = {35},
+ number = {8},
+ month = aug,
+ year = {2000},
+ issn = {0362-1340},
+ pages = {15--17},
+ numpages = {3},
+ url = {http://doi.acm.org/10.1145/360271.360272},
+ doi = {10.1145/360271.360272},
+ acmid = {360272},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+}
+
+@inproceedings{Hsu:2003:RSF:1250972.1250973,
+ author = {Hsu, Jeffrey},
+ title = {Reasoning About SMP in FreeBSD},
+ booktitle = {Proceedings of the BSD Conference 2003 on BSD Conference},
+ series = {BSDC'03},
+ year = {2003},
+ location = {San Mateo, California},
+ pages = {1--1},
+ numpages = {1},
+ url = {http://dl.acm.org/citation.cfm?id=1250972.1250973},
+ acmid = {1250973},
+ publisher = {USENIX Association},
+ address = {Berkeley, CA, USA},
+}
+
+@inproceedings{Izurieta:2006:EFL:1159733.1159765,
+ author = {Izurieta, Clemente and Bieman, James},
+ title = {The Evolution of FreeBSD and Linux},
+ booktitle = {Proceedings of the 2006 ACM/IEEE International Symposium on Empirical Software Engineering},
+ series = {ISESE '06},
+ year = {2006},
+ isbn = {1-59593-218-6},
+ location = {Rio de Janeiro, Brazil},
+ pages = {204--211},
+ numpages = {8},
+ url = {http://doi.acm.org/10.1145/1159733.1159765},
+ doi = {10.1145/1159733.1159765},
+ acmid = {1159765},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {FreeBSD, evolution, linux, open source software, replication study, software engineering},
+}
+
+@inproceedings{Li:2009:OBR:1592631.1592641,
+ author = {Li, Qing and Macy, Kip},
+ title = {Optimizing the BSD Routing System for Parallel Processing},
+ booktitle = {Proceedings of the 2Nd ACM SIGCOMM Workshop on Programmable Routers for Extensible Services of Tomorrow},
+ series = {PRESTO '09},
+ year = {2009},
+ isbn = {978-1-60558-446-1},
+ location = {Barcelona, Spain},
+ pages = {37--42},
+ numpages = {6},
+ url = {http://doi.acm.org/10.1145/1592631.1592641},
+ doi = {10.1145/1592631.1592641},
+ acmid = {1592641},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {arp, flow table, freebsd, ip, ipv6, mp, neighbor cache, routing, server load balancing (slb), smp, synchronization},
+}
+
+@book{McKusick:1996:DIO:231070,
+ author = {McKusick, Marshall Kirk and Bostic, Keith and Karels, Michael J. and Quarterman, John S.},
+ title = {The Design and Implementation of the 4.4BSD Operating System},
+ year = {1996},
+ isbn = {0-201-54979-4},
+ publisher = {Addison Wesley Longman Publishing Co., Inc.},
+ address = {Redwood City, CA, USA},
+}
+
+@book{McKusick:2004:DIF:1014910,
+ author = {McKusick, Marshall Kirk and Neville-Neil, George V.},
+ title = {The Design and Implementation of the FreeBSD Operating System},
+ year = {2004},
+ isbn = {0201702452},
+ publisher = {Pearson Education},
+}
+
+@book{McKusick:2014:DIF:2659919,
+ author = {McKusick, Marshall Kirk and Neville-Neil, George and Watson, Robert N.M.},
+ title = {The Design and Implementation of the FreeBSD Operating System},
+ year = {2014},
+ isbn = {0321968972, 9780321968975},
+ edition = {2nd},
+ publisher = {Addison-Wesley Professional},
+}
+
+@article{McKusick:2004:TSF:1035594.1035622,
+ author = {McKusick, Marshall Kirk and Neville-Neil, George V.},
+ title = {Thread Scheduling in FreeBSD 5.2},
+ journal = {Queue},
+ issue_date = {October 2004},
+ volume = {2},
+ number = {7},
+ month = oct,
+ year = {2004},
+ issn = {1542-7730},
+ pages = {58--64},
+ numpages = {7},
+ url = {http://doi.acm.org/10.1145/1035594.1035622},
+ doi = {10.1145/1035594.1035622},
+ acmid = {1035622},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+}
+
+@inproceedings{Murray:2002:IYP:1250894.1250900,
+ author = {Murray, Mark R. V.},
+ title = {An Implementation of the Yarrow PRNG for FreeBSD},
+ booktitle = {Proceedings of the BSD Conference 2002 on BSD Conference},
+ series = {BSDC'02},
+ year = {2002},
+ location = {San Francisco, California},
+ pages = {6--6},
+ numpages = {1},
+ url = {http://dl.acm.org/citation.cfm?id=1250894.1250900},
+ acmid = {1250900},
+ publisher = {USENIX Association},
+ address = {Berkeley, CA, USA},
+}
+
+@inproceedings{Spinellis:2006:GSD:1138506.1138524,
+ author = {Spinellis, Diomidis},
+ title = {Global Software Development in the FreeBSD Project},
+ booktitle = {Proceedings of the 2006 International Workshop on Global Software Development for the Practitioner},
+ series = {GSD '06},
+ year = {2006},
+ isbn = {1-59593-404-9},
+ location = {Shanghai, China},
+ pages = {73--79},
+ numpages = {7},
+ url = {http://doi.acm.org/10.1145/1138506.1138524},
+ doi = {10.1145/1138506.1138524},
+ acmid = {1138524},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {global development, open source, quantitative analysis},
+}
+
+@inproceedings{Spinellis:2008:TFK:1368088.1368140,
+ author = {Spinellis, Diomidis},
+ title = {A Tale of Four Kernels},
+ booktitle = {Proceedings of the 30th International Conference on Software Engineering},
+ series = {ICSE '08},
+ year = {2008},
+ isbn = {978-1-60558-079-1},
+ location = {Leipzig, Germany},
+ pages = {381--390},
+ numpages = {10},
+ url = {http://doi.acm.org/10.1145/1368088.1368140},
+ doi = {10.1145/1368088.1368140},
+ acmid = {1368140},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {comparison, freebsd, linux, open source, opensolaris, proprietary software, wrk},
+}
+
+@inproceedings{Woodruff:2014:CCM:2665671.2665740,
+ author = {Woodruff, Jonathan and Watson, Robert N.M. and Chisnall, David and Moore, Simon W. and Anderson, Jonathan and Davis, Brooks and Laurie, Ben and Neumann, Peter G. and Norton, Robert and Roe, Michael},
+ title = {The CHERI Capability Model: Revisiting RISC in an Age of Risk},
+ booktitle = {Proceeding of the 41st Annual International Symposium on Computer Architecture},
+ series = {ISCA '14},
+ year = {2014},
+ isbn = {978-1-4799-4394-4},
+ location = {Minneapolis, Minnesota, USA},
+ pages = {457--468},
+ numpages = {12},
+ url = {http://dl.acm.org/citation.cfm?id=2665671.2665740},
+ acmid = {2665740},
+ publisher = {IEEE Press},
+ address = {Piscataway, NJ, USA},
+}
+
+@article{Woodruff:2014:CCM:2678373.2665740,
+ author = {Woodruff, Jonathan and Watson, Robert N.M. and Chisnall, David and Moore, Simon W. and Anderson, Jonathan and Davis, Brooks and Laurie, Ben and Neumann, Peter G. and Norton, Robert and Roe, Michael},
+ title = {The CHERI Capability Model: Revisiting RISC in an Age of Risk},
+ journal = {SIGARCH Comput. Archit. News},
+ issue_date = {June 2014},
+ volume = {42},
+ number = {3},
+ month = jun,
+ year = {2014},
+ issn = {0163-5964},
+ pages = {457--468},
+ numpages = {12},
+ url = {http://doi.acm.org/10.1145/2678373.2665740},
+ doi = {10.1145/2678373.2665740},
+ acmid = {2665740},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+}
diff --git a/share/doc/papers/bufbio/Makefile b/share/doc/papers/bufbio/Makefile
new file mode 100644
index 000000000000..a486124fac51
--- /dev/null
+++ b/share/doc/papers/bufbio/Makefile
@@ -0,0 +1,12 @@
+VOLUME= papers
+DOC= bio
+SRCS= bio.ms-patched
+EXTRA= bufsize.eps
+MACROS= -ms
+USE_PIC=
+CLEANFILES= bio.ms-patched
+
+bio.ms-patched: bio.ms
+ sed "s;bufsize\.eps;${.CURDIR}/&;" ${.ALLSRC} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/bufbio/bio.ms b/share/doc/papers/bufbio/bio.ms
new file mode 100644
index 000000000000..677312ac5628
--- /dev/null
+++ b/share/doc/papers/bufbio/bio.ms
@@ -0,0 +1,828 @@
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.if n .ftr C R
+.nr PI 2n
+.TL
+The case for struct bio
+.br
+- or -
+.br
+A road map for a stackable BIO subsystem in FreeBSD
+.AU
+Poul-Henning Kamp <phk@FreeBSD.org>
+.AI
+The FreeBSD Project
+.AB
+Historically, the only translation performed on I/O requests after
+they left the file-system layer was the logical sub disk implementation
+done in the device driver. No universal standard for how sub disks are
+configured and implemented exists, in fact pretty much every single platform
+and operating system have done it their own way. As FreeBSD migrates to
+other platforms it needs to understand these local conventions to be
+able to co-exist with other operating systems on the same disk.
+.PP
+Recently a number of technologies like RAID have expanded the
+concept of "a disk" a fair bit and while these technologies initially
+were implemented in separate hardware they increasingly migrate into
+the operating systems as standard functionality.
+.PP
+Both of these factors indicate the need for a structured approach to
+systematic "geometry manipulation" facilities in FreeBSD.
+.PP
+This paper contains the road-map for a stackable "BIO" system in
+FreeBSD, which will support these facilities.
+.AE
+.NH
+The miseducation of \f(CWstruct buf\fP.
+.PP
+To fully appreciate the topic, I include a little historic overview
+of struct buf, it is a most enlightening case of not exactly bit-rot
+but more appropriately design-rot.
+.PP
+In the beginning, which for this purpose extends until virtual
+memory was introduced into UNIX, all disk I/O was done from or
+to a struct buf. In the 6th edition sources, as printed in Lions
+Book, struct buf looks like this:
+.DS
+.ft CW
+.ps -1
+struct buf
+{
+ int b_flags; /* see defines below */
+ struct buf *b_forw; /* headed by devtab of b_dev */
+ struct buf *b_back; /* ' */
+ struct buf *av_forw; /* position on free list, */
+ struct buf *av_back; /* if not BUSY*/
+ int b_dev; /* major+minor device name */
+ int b_wcount; /* transfer count (usu. words) */
+ char *b_addr; /* low order core address */
+ char *b_xmem; /* high order core address */
+ char *b_blkno; /* block # on device */
+ char b_error; /* returned after I/O */
+ char *b_resid; /* words not transferred after
+ error */
+} buf[NBUF];
+.ps +1
+.ft P
+.DE
+.PP
+At this point in time, struct buf had only two functions:
+To act as a cache
+and to transport I/O operations to device drivers. For the purpose of
+this document, the cache functionality is uninteresting and will be
+ignored.
+.PP
+The I/O operations functionality consists of three parts:
+.IP "" 5n
+\(bu Where in Ram/Core is the data located (b_addr, b_xmem, b_wcount).
+.IP
+\(bu Where on disk is the data located (b_dev, b_blkno)
+.IP
+\(bu Request and result information (b_flags, b_error, b_resid)
+.PP
+In addition to this, the av_forw and av_back elements are
+used by the disk device drivers to put requests on a linked list.
+All in all the majority of struct buf is involved with the I/O
+aspect and only a few fields relate exclusively to the cache aspect.
+.PP
+If we step forward to the BSD 4.4-Lite-2 release, struct buf has grown
+a bit here or there:
+.DS
+.ft CW
+.ps -1
+struct buf {
+ LIST_ENTRY(buf) b_hash; /* Hash chain. */
+ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */
+ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */
+ struct buf *b_actf, **b_actb; /* Device driver queue when active. */
+ struct proc *b_proc; /* Associated proc; NULL if kernel. */
+ volatile long b_flags; /* B_* flags. */
+ int b_error; /* Errno value. */
+ long b_bufsize; /* Allocated buffer size. */
+ long b_bcount; /* Valid bytes in buffer. */
+ long b_resid; /* Remaining I/O. */
+ dev_t b_dev; /* Device associated with buffer. */
+ struct {
+ caddr_t b_addr; /* Memory, superblocks, indirect etc. */
+ } b_un;
+ void *b_saveaddr; /* Original b_addr for physio. */
+ daddr_t b_lblkno; /* Logical block number. */
+ daddr_t b_blkno; /* Underlying physical block number. */
+ /* Function to call upon completion. */
+ void (*b_iodone) __P((struct buf *));
+ struct vnode *b_vp; /* Device vnode. */
+ long b_pfcent; /* Center page when swapping cluster. */
+ /* XXX pfcent should be int; overld. */
+ int b_dirtyoff; /* Offset in buffer of dirty region. */
+ int b_dirtyend; /* Offset of end of dirty region. */
+ struct ucred *b_rcred; /* Read credentials reference. */
+ struct ucred *b_wcred; /* Write credentials reference. */
+ int b_validoff; /* Offset in buffer of valid region. */
+ int b_validend; /* Offset of end of valid region. */
+};
+.ps +1
+.ft P
+.DE
+.PP
+The main piece of action is the addition of vnodes, a VM system and a
+prototype LFS filesystem, all of which needed some handles on struct
+buf. Comparison will show that the I/O aspect of struct buf is in
+essence unchanged, the length field is now in bytes instead of words,
+the linked list the drivers can use has been renamed (b_actf,
+b_actb) and a b_iodone pointer for callback notification has been added
+but otherwise there is no change to the fields which
+represent the I/O aspect. All the new fields relate to the cache
+aspect, link buffers to the VM system, provide hacks for file-systems
+(b_lblkno) etc etc.
+.PP
+By the time we get to FreeBSD 3.0 more stuff has grown on struct buf:
+.DS
+.ft CW
+.ps -1
+struct buf {
+ LIST_ENTRY(buf) b_hash; /* Hash chain. */
+ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */
+ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */
+ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */
+ struct proc *b_proc; /* Associated proc; NULL if kernel. */
+ long b_flags; /* B_* flags. */
+ unsigned short b_qindex; /* buffer queue index */
+ unsigned char b_usecount; /* buffer use count */
+ int b_error; /* Errno value. */
+ long b_bufsize; /* Allocated buffer size. */
+ long b_bcount; /* Valid bytes in buffer. */
+ long b_resid; /* Remaining I/O. */
+ dev_t b_dev; /* Device associated with buffer. */
+ caddr_t b_data; /* Memory, superblocks, indirect etc. */
+ caddr_t b_kvabase; /* base kva for buffer */
+ int b_kvasize; /* size of kva for buffer */
+ daddr_t b_lblkno; /* Logical block number. */
+ daddr_t b_blkno; /* Underlying physical block number. */
+ off_t b_offset; /* Offset into file */
+ /* Function to call upon completion. */
+ void (*b_iodone) __P((struct buf *));
+ /* For nested b_iodone's. */
+ struct iodone_chain *b_iodone_chain;
+ struct vnode *b_vp; /* Device vnode. */
+ int b_dirtyoff; /* Offset in buffer of dirty region. */
+ int b_dirtyend; /* Offset of end of dirty region. */
+ struct ucred *b_rcred; /* Read credentials reference. */
+ struct ucred *b_wcred; /* Write credentials reference. */
+ int b_validoff; /* Offset in buffer of valid region. */
+ int b_validend; /* Offset of end of valid region. */
+ daddr_t b_pblkno; /* physical block number */
+ void *b_saveaddr; /* Original b_addr for physio. */
+ caddr_t b_savekva; /* saved kva for transfer while bouncing */
+ void *b_driver1; /* for private use by the driver */
+ void *b_driver2; /* for private use by the driver */
+ void *b_spc;
+ union cluster_info {
+ TAILQ_HEAD(cluster_list_head, buf) cluster_head;
+ TAILQ_ENTRY(buf) cluster_entry;
+ } b_cluster;
+ struct vm_page *b_pages[btoc(MAXPHYS)];
+ int b_npages;
+ struct workhead b_dep; /* List of filesystem dependencies. */
+};
+.ps +1
+.ft P
+.DE
+.PP
+Still we find that the I/O aspect of struct buf is in essence unchanged. A couple of fields which allow the driver to hang local data off the buf while working on it have been added (b_driver1, b_driver2) and a "physical block number" (b_pblkno) has been added.
+.PP
+This b_pblkno is relevant; it has been added because the disklabel/slice
+code has been abstracted out of the device drivers: the filesystem
+asks for b_blkno, the slice/label code translates this into b_pblkno
+which the device driver operates on.
+.PP
+After this point some minor cleanups have happened, some unused fields
+have been removed etc but the I/O aspect of struct buf is still only
+a fraction of the entire structure: less than a quarter of the
+bytes in a struct buf are used for the I/O aspect and struct buf
+seems to continue to grow and grow.
+.PP
+Since version 6 as documented in Lions book, three significant pieces
+of code have emerged which need to do non-trivial translations of
+the I/O request before it reaches the device drivers: CCD, slice/label
+and Vinum. They all basically do the same: they map I/O requests from
+a logical space to a physical space, and the mappings they perform
+can be 1:1 or 1:N. \**
+.FS
+It is interesting to note that Lions in his comments to the \f(CWrkaddr\fP
+routine (p. 16-2) writes \fIThe code in this procedure incorporates
+a special feature for files which extend over more than one disk
+drive. This feature is described in the UPM Section "RK(IV)". Its
+usefulness seems to be restricted.\fP This more than hints at the
+presence already then of various hacks to stripe/span multiple devices.
+.FE
+.PP
+The 1:1 mapping of the slice/label code is rather trivial, and the
+addition of the b_pblkno field catered for the majority of the issues
+this resulted in, leaving but one: Reads or writes to the magic "disklabel"
+or equally magic "MBR" sectors on a disk must be caught, examined and in
+some cases modified before being passed on to the device driver. This need
+resulted in the addition of the b_iodone_chain field which adds a limited
+ability to stack I/O operations.
+.PP
+The 1:N mapping of CCD and Vinum is far more interesting. These two
+subsystems look like a device driver, but rather than drive some piece
+of hardware, they allocate new struct buf data structures, populate
+these and pass them on to other device drivers.
+.PP
+Apart from it being inefficient to lug about a 348-byte data structure
+when 80 bytes would have done, it also leads to significant code rot
+when programmers don't know what to do about the remaining fields or
+even worse: "borrow" a field or two for their own uses.
+.PP
+.ID
+.if t .PSPIC bufsize.eps
+.if n [graph not available in this format]
+.DE
+.I
+Conclusions:
+.IP "" 5n
+\(bu Struct buf is victim of chronic bloat.
+.IP
+\(bu The I/O aspect of
+struct buf is practically constant and only about \(14 of the total bytes.
+.IP
+\(bu Struct buf currently has several users, vinum, ccd and to a
+limited extent diskslice/label, which
+need only the I/O aspect, not the vnode, caching or VM linkage.
+.IP
+.I
+The I/O aspect of struct buf should be put in a separate \f(CWstruct bio\fP.
+.R
+.NH 1
+Implications for future struct buf improvements
+.PP
+Concerns have been raised about the implications this separation
+will have for future work on struct buf, I will try to address
+these concerns here.
+.PP
+As the existence and popularity of vinum and ccd proves, there is
+a legitimate and valid requirement to be able to do I/O operations
+which are not initiated by a vnode or filesystem operation.
+In other words, an I/O request is a fully valid entity in its own
+right and should be treated like that.
+.PP
+Without doubt, the I/O request has to be tuned to fit the needs
+of struct buf users in the best possible way, and consequently
+any future changes in struct buf are likely to affect the I/O request
+semantics.
+.PP
+One particular change which has been proposed is to drop the present
+requirement that a struct buf be mapped contiguously into kernel
+address space. The argument goes that since many modern drivers use
+physical address DMA to transfer the data maintaining such a mapping
+is needless overhead.
+.PP
+Of course some drivers will still need to be able to access the
+buffer in kernel address space and some kind of compatibility
+must be provided there.
+.PP
+The question is, if such a change is made impossible by the
+separation of the I/O aspect into its own data structure?
+.PP
+The answer to this is ``no''.
+Anything that could be added to or done with
+the I/O aspect of struct buf can also be added to or done
+with the I/O aspect if it lives in a new "struct bio".
+.NH 1
+Implementing a \f(CWstruct bio\fP
+.PP
+The first decision to be made was who got to use the name "struct buf",
+and considering the fact that it is the I/O aspect which gets separated
+out and that it only covers about \(14 of the bytes in struct buf,
+obviously the new structure for the I/O aspect gets a new name.
+Examining the naming in the kernel, the "bio" prefix seemed a given,
+for instance, the function to signal completion of an I/O request is
+already named "biodone()".
+.PP
+Making the transition smooth is obviously also a priority and after
+some prototyping \**
+.FS
+The software development technique previously known as "Trial & Error".
+.FE
+it was found that a totally transparent transition could be made by
+embedding a copy of the new "struct bio" as the first element of "struct buf"
+and by using cpp(1) macros to alias the fields to the legacy struct buf
+names.
+.NH 2
+The b_flags problem.
+.PP
+Struct bio was defined by examining all code existing in the driver tree
+and finding all the struct buf fields which were legitimately used (as
+opposed to "hi-jacked" fields).
+One field was found to have "dual-use": the b_flags field.
+This required special attention.
+Examination showed that b_flags were used for three things:
+.IP "" 5n
+\(bu Communication of the I/O command (READ, WRITE, FORMAT, DELETE)
+.IP
+\(bu Communication of ordering and error status
+.IP
+\(bu General status for non I/O aspect consumers of struct buf.
+.PP
+For historic reasons B_WRITE was defined to be zero, which led to
+confusion and bugs; this pushed the decision to have a separate
+"b_iocmd" field in struct buf and struct bio for communicating
+only the action to be performed.
+.PP
+The ordering and error status bits were put in a new flag field "b_ioflag".
+This has left sufficiently many now unused bits in b_flags that the b_xflags element
+can now be merged back into b_flags.
+.NH 2
+Definition of struct bio
+.PP
+With the cleanup of b_flags in place, the definition of struct bio looks like this:
+.DS
+.ft CW
+.ps -1
+struct bio {
+ u_int bio_cmd; /* I/O operation. */
+ dev_t bio_dev; /* Device to do I/O on. */
+ daddr_t bio_blkno; /* Underlying physical block number. */
+ off_t bio_offset; /* Offset into file. */
+ long bio_bcount; /* Valid bytes in buffer. */
+ caddr_t bio_data; /* Memory, superblocks, indirect etc. */
+ u_int bio_flags; /* BIO_ flags. */
+ struct buf *_bio_buf; /* Parent buffer. */
+ int bio_error; /* Errno for BIO_ERROR. */
+ long bio_resid; /* Remaining I/O in bytes. */
+ void (*bio_done) __P((struct buf *));
+ void *bio_driver1; /* Private use by the callee. */
+ void *bio_driver2; /* Private use by the callee. */
+ void *bio_caller1; /* Private use by the caller. */
+ void *bio_caller2; /* Private use by the caller. */
+ TAILQ_ENTRY(bio) bio_queue; /* Disksort queue. */
+ daddr_t bio_pblkno; /* physical block number */
+ struct iodone_chain *bio_done_chain;
+};
+.ps +1
+.ft P
+.DE
+.NH 2
+Definition of struct buf
+.PP
+After adding a struct bio to struct buf and the fields aliased into it
+struct buf looks like this:
+.DS
+.ft CW
+.ps -1
+struct buf {
+ /* XXX: b_io must be the first element of struct buf for now /phk */
+ struct bio b_io; /* "Builtin" I/O request. */
+#define b_bcount b_io.bio_bcount
+#define b_blkno b_io.bio_blkno
+#define b_caller1 b_io.bio_caller1
+#define b_caller2 b_io.bio_caller2
+#define b_data b_io.bio_data
+#define b_dev b_io.bio_dev
+#define b_driver1 b_io.bio_driver1
+#define b_driver2 b_io.bio_driver2
+#define b_error b_io.bio_error
+#define b_iocmd b_io.bio_cmd
+#define b_iodone b_io.bio_done
+#define b_iodone_chain b_io.bio_done_chain
+#define b_ioflags b_io.bio_flags
+#define b_offset b_io.bio_offset
+#define b_pblkno b_io.bio_pblkno
+#define b_resid b_io.bio_resid
+ LIST_ENTRY(buf) b_hash; /* Hash chain. */
+ TAILQ_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */
+ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */
+ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */
+ long b_flags; /* B_* flags. */
+ unsigned short b_qindex; /* buffer queue index */
+ unsigned char b_xflags; /* extra flags */
+[...]
+.ps +1
+.ft P
+.DE
+.PP
+Putting the struct bio as the first element in struct buf during a transition
+period allows a pointer to either to be cast to a pointer of the other,
+which means that certain pieces of code can be left un-converted with the
+use of a couple of casts while the remaining pieces of code are tested.
+The ccd and vinum modules have been left un-converted like this for now.
+.PP
+This is basically where FreeBSD-current stands today.
+.PP
+The next step is to substitute struct bio for struct buf in all the code
+which only care about the I/O aspect: device drivers, diskslice/label.
+The patch to do this is up for review. \**
+.FS
+And can be found at http://phk.freebsd.dk/misc
+.FE
+and consists mainly of systematic substitutions like these
+.DS
+.ft CW
+s/struct buf/struct bio/
+s/b_flags/bio_flags/
+s/b_bcount/bio_bcount/
+&c &c
+.ft P
+.DE
+.NH 2
+Future work
+.PP
+It can be successfully argued that the cpp(1) macros used for aliasing
+above are ugly and should be expanded in place. It would certainly
+be trivial to do so, but not by definition worthwhile.
+.PP
+Retaining the aliasing for the b_* and bio_* name-spaces this way
+leaves us with considerable flexibility in modifying the future
+interaction between the two. The DEV_STRATEGY() macro is the single
+point where a struct buf is turned into a struct bio and launched
+into the drivers to fulfill the I/O request and this provides us
+with a single isolated location for performing non-trivial translations.
+.PP
+As an example of this flexibility: It has been proposed to essentially
+drop the b_blkno field and use the b_offset field to communicate the
+on-disk location of the data. b_blkno is a 32bit offset of B_DEVSIZE
+(512) byte sectors, which allows us to address two terabytes worth
+of data. Using b_offset as a 64 bit byte-address would not only allow
+us to address 8 million times larger disks, it would also make it
+possible to accommodate disks which use non-power-of-two sector-size,
+Audio CD-ROMs for instance.
+.PP
+The above mentioned flexibility makes an implementation almost trivial:
+.IP "" 5n
+\(bu Add code to DEV_STRATEGY() to populate b_offset from b_blkno in the
+cases where it is not valid. Today it is only valid for a struct buf
+marked B_PHYS.
+.IP
+\(bu Change diskslice/label, ccd, vinum and device drivers to use b_offset
+instead of b_blkno.
+.IP
+\(bu Remove the bio_blkno field from struct bio, add it to struct buf as
+b_blkno and remove the cpp(1) macro which aliased it into struct bio.
+.PP
+Another possible transition could be to not have a "built-in" struct bio
+in struct buf. If for some reason struct bio grows fields of no relevance
+to struct buf it might be cheaper to remove struct bio from struct buf,
+un-alias the fields and have DEV_STRATEGY() allocate a struct bio and populate
+the relevant fields from struct buf.
+This would also be entirely transparent to both users of struct buf and
+struct bio as long as we retain the aliasing mechanism and DEV_STRATEGY().
+.bp
+.NH 1
+Towards a stackable BIO subsystem.
+.PP
+Considering that we now have three distinct pieces of code living
+in the nowhere between DEV_STRATEGY() and the device drivers:
+diskslice/label, ccd and vinum, it is not unreasonable to start
+to look for a more structured and powerful API for these pieces
+of code.
+.PP
+In traditional UNIX semantics a "disk" is a one-dimensional array of
+512 byte sectors which can be read or written. Support for sectors
+of multiples of 512 bytes was implemented with a sort of "don't ask-don't tell" policy where the system administrator would specify a larger minimum sector-size
+to the filesystem, and things would "just work", but no formal communication about the size of the smallest transfer possible was exchanged between the disk driver and the filesystem.
+.PP
+A truly generalised concept of a disk needs to be more flexible and more
+expressive. For instance, a user of a disk will want to know:
+.IP "" 5n
+\(bu What is the sector size. Sector-size these days may not be a power
+of two, for instance Audio CDs have 2352 byte "sectors".
+.IP
+\(bu How many sectors are there.
+.IP
+\(bu Is writing of sectors supported.
+.IP
+\(bu Is freeing of sectors supported. This is important for flash based
+devices where a wear-distribution software or hardware function uses
+the information about which sectors are actually in use to optimise the
+usage of the slow erase function to a minimum.
+.IP
+\(bu Is opening this device in a specific mode, (read-only or read-write)
+allowed. The VM system and the file-systems generally assume that nobody
+writes to "their storage" under their feet, and therefore opens which
+would make that possible should be rejected.
+.IP
+\(bu What is the "native" geometry of this device (Sectors/Heads/Cylinders).
+This is useful for staying compatible with badly designed on-disk formats
+from other operating systems.
+.PP
+Obviously, all of these properties are dynamic in the sense that in
+these days disks are removable devices, and they may therefore change
+at any time. While some devices like CD-ROMs can lock the media in
+place with a special command, this cannot be done for all devices,
+in particular it cannot be done with normal floppy disk drives.
+.PP
+If we adopt such a model for disk, retain the existing "strategy/biodone" model of I/O scheduling and decide to use a modular or stackable approach to
+geometry translations we find that nearly endless flexibility emerges:
+Mirroring, RAID, striping, interleaving, disk-labels and sub-disks, all of
+these techniques would get a common framework to operate in.
+.PP
+In practice of course, such a scheme must not complicate the use of or
+installation of FreeBSD. The code will have to act and react exactly
+like the current code but fortunately the current behaviour is not at
+all hard to emulate so implementation-wise this is a non-issue.
+.PP
+But let's look at some drawings to see what this means in practice.
+.PP
+Today the plumbing might look like this on a machine:
+.DS
+.PS
+ Ad0: box "disk (ad0)"
+ arrow up from Ad0.n
+ SL0: box "slice/label"
+ Ad1: box "disk (ad1)" with .w at Ad0.e + (.2,0)
+ arrow up from Ad1.n
+ SL1: box "slice/label"
+ Ad2: box "disk (ad2)" with .w at Ad1.e + (.2,0)
+ arrow up from Ad2.n
+ SL2: box "slice/label"
+ Ad3: box "disk (ad3)" with .w at Ad2.e + (.2,0)
+ arrow up from Ad3.n
+ SL3: box "slice/label"
+ DML: box dashed width 4i height .9i with .sw at SL0.sw + (-.2,-.2)
+ "Disk-mini-layer" with .n at DML.s + (0, .1)
+
+ V: box "vinum" at 1/2 <SL1.n, SL2.n> + (0,1.2)
+
+ A0A: arrow up from 1/4 <SL0.nw, SL0.ne>
+ A0B: arrow up from 2/4 <SL0.nw, SL0.ne>
+ A0E: arrow up from 3/4 <SL0.nw, SL0.ne>
+ A1C: arrow up from 2/4 <SL1.nw, SL1.ne>
+ arrow to 1/3 <V.sw, V.se>
+ A2C: arrow up from 2/4 <SL2.nw, SL2.ne>
+ arrow to 2/3 <V.sw, V.se>
+ A3A: arrow up from 1/4 <SL3.nw, SL3.ne>
+ A3E: arrow up from 2/4 <SL3.nw, SL3.ne>
+ A3F: arrow up from 3/4 <SL3.nw, SL3.ne>
+
+ "ad0s1a" with .s at A0A.n + (0, .1)
+ "ad0s1b" with .s at A0B.n + (0, .3)
+ "ad0s1e" with .s at A0E.n + (0, .5)
+ "ad1s1c" with .s at A1C.n + (0, .1)
+ "ad2s1c" with .s at A2C.n + (0, .1)
+ "ad3s4a" with .s at A3A.n + (0, .1)
+ "ad3s4e" with .s at A3E.n + (0, .3)
+ "ad3s4f" with .s at A3F.n + (0, .5)
+
+ V1: arrow up from 1/4 <V.nw, V.ne>
+ V2: arrow up from 2/4 <V.nw, V.ne>
+ V3: arrow up from 3/4 <V.nw, V.ne>
+ "V1" with .s at V1.n + (0, .1)
+ "V2" with .s at V2.n + (0, .1)
+ "V3" with .s at V3.n + (0, .1)
+
+.PE
+.DE
+.PP
+And while this drawing looks nice and clean, the code underneath isn't.
+With a stackable BIO implementation, the picture would look like this:
+.DS
+.PS
+ Ad0: box "disk (ad0)"
+ arrow up from Ad0.n
+ M0: box "MBR"
+ arrow up
+ B0: box "BSD"
+
+ A0A: arrow up from 1/4 <B0.nw, B0.ne>
+ A0B: arrow up from 2/4 <B0.nw, B0.ne>
+ A0E: arrow up from 3/4 <B0.nw, B0.ne>
+
+ Ad1: box "disk (ad1)" with .w at Ad0.e + (.2,0)
+ Ad2: box "disk (ad2)" with .w at Ad1.e + (.2,0)
+ Ad3: box "disk (ad3)" with .w at Ad2.e + (.2,0)
+ arrow up from Ad3.n
+ SL3: box "MBR"
+ arrow up
+ B3: box "BSD"
+
+ V: box "vinum" at 1/2 <Ad1.n, Ad2.n> + (0,.8)
+ arrow from Ad1.n to 1/3 <V.sw, V.se>
+ arrow from Ad2.n to 2/3 <V.sw, V.se>
+
+ A3A: arrow from 1/4 <B3.nw, B3.ne>
+ A3E: arrow from 2/4 <B3.nw, B3.ne>
+ A3F: arrow from 3/4 <B3.nw, B3.ne>
+
+ "ad0s1a" with .s at A0A.n + (0, .1)
+ "ad0s1b" with .s at A0B.n + (0, .3)
+ "ad0s1e" with .s at A0E.n + (0, .5)
+ "ad3s4a" with .s at A3A.n + (0, .1)
+ "ad3s4e" with .s at A3E.n + (0, .3)
+ "ad3s4f" with .s at A3F.n + (0, .5)
+
+ V1: arrow up from 1/4 <V.nw, V.ne>
+ V2: arrow up from 2/4 <V.nw, V.ne>
+ V3: arrow up from 3/4 <V.nw, V.ne>
+ "V1" with .s at V1.n + (0, .1)
+ "V2" with .s at V2.n + (0, .1)
+ "V3" with .s at V3.n + (0, .1)
+
+.PE
+.DE
+.PP
+The first thing we notice is that the disk mini-layer is gone, instead
+separate modules for the Microsoft style MBR and the BSD style disklabel
+are now stacked over the disk. We can also see that Vinum no longer
+needs to go through the BSD/MBR layers if it wants access to the entire
+physical disk, it can be stacked right over the disk.
+.PP
+Now, imagine that a ZIP drive is connected to the machine, and the
+user loads a ZIP disk in it. First the device driver notices the
+new disk and instantiates a new disk:
+.DS
+.PS
+ box "disk (da0)"
+.PE
+.DE
+.PP
+A number of the geometry modules have registered as "auto-discovering"
+and will be polled sequentially to see if any of them recognise what
+is on this disk. The MBR module finds an MBR in sector 0 and attaches
+an instance of itself to the disk:
+.DS
+.PS
+ D: box "disk (da0)"
+ arrow up from D.n
+ M: box "MBR"
+ M1: arrow up from 1/3 <M.nw, M.ne>
+ M2: arrow up from 2/3 <M.nw, M.ne>
+.PE
+.DE
+.PP
+It finds two "slices" in the MBR and creates two new "disks" one for
+each of these. The polling of modules is repeated and this time the
+BSD label module recognises a FreeBSD label on one of the slices and
+attaches itself:
+.DS
+.PS
+ D: box "disk (da0)"
+ arrow "O" up from D.n
+ M: box "MBR"
+ M1: line up .3i from 1/3 <M.nw, M.ne>
+ arrow "O" left
+ M2: arrow "O" up from 2/3 <M.nw, M.ne>
+ B: box "BSD"
+ B1: arrow "O" up from 1/4 <B.nw, B.ne>
+ B2: arrow "O" up from 2/4 <B.nw, B.ne>
+ B3: arrow "O" up from 3/4 <B.nw, B.ne>
+
+.PE
+.DE
+.PP
+The BSD module finds three partitions, creates them as disks and the
+polling is repeated for each of these. No modules recognise these
+and the process ends. In theory one could have a module recognise
+the UFS superblock and extract from there the path to mount the disk
+on, but this is probably better implemented in a general "device-daemon"
+in user-land.
+.PP
+On this last drawing I have marked with "O" the "disks" which can be
+accessed from user-land or kernel. The VM and file-systems generally
+prefer to have exclusive write access to the disk sectors they use,
+so we need to enforce this policy. Since we cannot know what transformation
+a particular module implements, we need to ask the modules if the open
+is OK, and they may need to ask their neighbours before they can answer.
+.PP
+We decide to mount a filesystem on one of the BSD partitions at the very top.
+The open request is passed to the BSD module, which finds that none of
+the other open partitions (there are none) overlap this one, so far no
+objections. It then passes the open to the MBR module, which goes through
+basically the same procedure, finds no objections, and passes the request to
+the disk driver, which since it was not previously open approves of the
+open.
+.PP
+Next we mount a filesystem on the next BSD partition. The
+BSD module again checks for overlapping open partitions and finds none.
+This time however, it finds that it has already opened the "downstream"
+in R/W mode so it does not need to ask for permission for that again
+so the open is OK.
+.PP
+Next we mount a msdos filesystem on the other MBR slice. This is the
+same case, the MBR finds no overlapping open slices and has already
+opened "downstream" so the open is OK.
+.PP
+If we now try to open the other slice for writing, the one which has the
+BSD module attached already. The open is passed to the MBR module which
+notes that the device is already opened for writing by a module (the BSD
+module) and consequently the open is refused.
+.PP
+While this sounds complicated it actually took less than 200 lines of
+code to implement in a prototype implementation.
+.PP
+Now, the user ejects the ZIP disk. If the hardware can give a notification
+of intent to eject, a call-up from the driver can try to get devices synchronised
+and closed, this is pretty trivial. If the hardware just disappears like
+an unplugged parallel zip drive, a floppy disk or a PC-card, we have no
+choice but to dismantle the setup. The device driver sends a "gone" notification to the MBR module, which replicates this upwards to the mounted msdosfs
+and the BSD module. The msdosfs unmounts forcefully, invalidates any blocks
+in the buf/vm system and returns. The BSD module replicates the "gone" to
+the two mounted file-systems which in turn unmount forcefully, invalidate
+blocks and return, after which the BSD module releases any resources held
+and returns, the MBR module releases any resources held and returns and all
+traces of the device have been removed.
+.PP
+Now, let us get a bit more complicated. We add another disk and mirror
+two of the MBR slices:
+.DS
+.PS
+ D0: box "disk (da0)"
+
+ arrow "O" up from D0.n
+ M0: box "MBR"
+ M01: line up .3i from 1/3 <M0.nw, M0.ne>
+ arrow "O" left
+ M02: arrow "O" up from 2/3 <M0.nw, M0.ne>
+
+ D1: box "disk (da1)" with .w at D0.e + (.2,0)
+ arrow "O" up from D1.n
+ M1: box "MBR"
+ M11: line up .3i from 1/3 <M1.nw, M1.ne>
+ line "O" left
+ M11a: arrow up .2i
+
+ I: box "Mirror" with .s at 1/2 <M02.n, M11a.n>
+ arrow "O" up
+ BB: box "BSD"
+ BB1: arrow "O" up from 1/4 <BB.nw, BB.ne>
+ BB2: arrow "O" up from 2/4 <BB.nw, BB.ne>
+ BB3: arrow "O" up from 3/4 <BB.nw, BB.ne>
+
+ M12: arrow "O" up from 2/3 <M1.nw, M1.ne>
+ B: box "BSD"
+ B1: arrow "O" up from 1/4 <B.nw, B.ne>
+ B2: arrow "O" up from 2/4 <B.nw, B.ne>
+ B3: arrow "O" up from 3/4 <B.nw, B.ne>
+.PE
+.DE
+.PP
+Now assuming that we lose disk da0, the notification goes up like before
+but the mirror module still has a valid mirror from disk da1, so it
+doesn't propagate the "gone" notification further up and the three
+file-systems mounted are not affected.
+.PP
+It is possible to modify the graph while in action, as long as the
+modules know that they will not affect any I/O in progress. This is
+very handy for moving things around. At any of the arrows we can
+insert a mirroring module, since it has a 1:1 mapping from input
+to output. Next we can add another copy to the mirror, give the
+mirror time to sync the two copies. Detach the first mirror copy
+and remove the mirror module. We have now in essence moved a partition
+from one disk to another transparently.
+.NH 1
+Getting stackable BIO layers from where we are today.
+.PP
+Most of the infrastructure is in place now to implement stackable
+BIO layers:
+.IP "" 5n
+\(bu The dev_t change gave us a public structure where
+information about devices can be put. This enabled us to get rid
+of all the NFOO limits on the number of instances of a particular
+driver/device, and significantly cleaned up the vnode aliasing for
+device vnodes.
+.IP
+\(bu The disk-mini-layer has
+taken the knowledge about diskslice/labels out of the
+majority of the disk-drivers, saving on average 100 lines of code per
+driver.
+.IP
+\(bu The struct bio/buf divorce is giving us an IO request of manageable
+size which can be modified without affecting all the filesystem and
+VM system users of struct buf.
+.PP
+The missing bits are:
+.IP "" 5n
+\(bu changes to struct bio to make it more
+stackable. This mostly relates to the handling of the biodone()
+event, something which will be transparent to all current users
+of struct buf/bio.
+.IP
+\(bu code to stitch modules together and to pass events and notifications
+between them.
+.NH 1
+An Implementation plan for stackable BIO layers
+.PP
+My plan for implementing stackable BIO layers is to first complete
+the struct bio/buf divorce with the already mentioned patch.
+.PP
+The next step is to re-implement the monolithic disk-mini-layer so
+that it becomes the stackable BIO system. Vinum and CCD and all
+other consumers should be unable to tell the difference between
+the current and the new disk-mini-layer. The new implementation
+will initially use a static stacking to remain compatible with the
+current behaviour. This will be the next logical checkpoint commit.
+.PP
+The next step is to make the stackable layers configurable,
+to provide the means to initialise the stacking and to subsequently
+change it. This will be the next logical checkpoint commit.
+.PP
+At this point new functionality can be added inside the stackable
+BIO system: CCD can be re-implemented as a mirror module and a stripe
+module. Vinum can be integrated either as one "macro-module" or
+as separate functions in separate modules. Also modules for other
+purposes can be added, sub-disk handling for Solaris, MacOS, etc
+etc. These modules can be committed one at a time.
diff --git a/share/doc/papers/bufbio/bufsize.eps b/share/doc/papers/bufbio/bufsize.eps
new file mode 100644
index 000000000000..2396ac62aa40
--- /dev/null
+++ b/share/doc/papers/bufbio/bufsize.eps
@@ -0,0 +1,479 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: a.ps
+%%Creator: $FreeBSD$
+%%CreationDate: Sat Apr 8 08:32:58 2000
+%%DocumentFonts: (atend)
+%%BoundingBox: 50 50 410 302
+%%Orientation: Portrait
+%%EndComments
+/gnudict 256 dict def
+gnudict begin
+/Color false def
+/Solid false def
+/gnulinewidth 5.000 def
+/userlinewidth gnulinewidth def
+/vshift -46 def
+/dl {10 mul} def
+/hpt_ 31.5 def
+/vpt_ 31.5 def
+/hpt hpt_ def
+/vpt vpt_ def
+/M {moveto} bind def
+/L {lineto} bind def
+/R {rmoveto} bind def
+/V {rlineto} bind def
+/vpt2 vpt 2 mul def
+/hpt2 hpt 2 mul def
+/Lshow { currentpoint stroke M
+ 0 vshift R show } def
+/Rshow { currentpoint stroke M
+ dup stringwidth pop neg vshift R show } def
+/Cshow { currentpoint stroke M
+ dup stringwidth pop -2 div vshift R show } def
+/UP { dup vpt_ mul /vpt exch def hpt_ mul /hpt exch def
+ /hpt2 hpt 2 mul def /vpt2 vpt 2 mul def } def
+/DL { Color {setrgbcolor Solid {pop []} if 0 setdash }
+ {pop pop pop Solid {pop []} if 0 setdash} ifelse } def
+/BL { stroke gnulinewidth 2 mul setlinewidth } def
+/AL { stroke gnulinewidth 2 div setlinewidth } def
+/UL { gnulinewidth mul /userlinewidth exch def } def
+/PL { stroke userlinewidth setlinewidth } def
+/LTb { BL [] 0 0 0 DL } def
+/LTa { AL [1 dl 2 dl] 0 setdash 0 0 0 setrgbcolor } def
+/LT0 { PL [] 1 0 0 DL } def
+/LT1 { PL [4 dl 2 dl] 0 1 0 DL } def
+/LT2 { PL [2 dl 3 dl] 0 0 1 DL } def
+/LT3 { PL [1 dl 1.5 dl] 1 0 1 DL } def
+/LT4 { PL [5 dl 2 dl 1 dl 2 dl] 0 1 1 DL } def
+/LT5 { PL [4 dl 3 dl 1 dl 3 dl] 1 1 0 DL } def
+/LT6 { PL [2 dl 2 dl 2 dl 4 dl] 0 0 0 DL } def
+/LT7 { PL [2 dl 2 dl 2 dl 2 dl 2 dl 4 dl] 1 0.3 0 DL } def
+/LT8 { PL [2 dl 2 dl 2 dl 2 dl 2 dl 2 dl 2 dl 4 dl] 0.5 0.5 0.5 DL } def
+/Pnt { stroke [] 0 setdash
+ gsave 1 setlinecap M 0 0 V stroke grestore } def
+/Dia { stroke [] 0 setdash 2 copy vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath stroke
+ Pnt } def
+/Pls { stroke [] 0 setdash vpt sub M 0 vpt2 V
+ currentpoint stroke M
+ hpt neg vpt neg R hpt2 0 V stroke
+ } def
+/Box { stroke [] 0 setdash 2 copy exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath stroke
+ Pnt } def
+/Crs { stroke [] 0 setdash exch hpt sub exch vpt add M
+ hpt2 vpt2 neg V currentpoint stroke M
+ hpt2 neg 0 R hpt2 vpt2 V stroke } def
+/TriU { stroke [] 0 setdash 2 copy vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath stroke
+ Pnt } def
+/Star { 2 copy Pls Crs } def
+/BoxF { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath fill } def
+/TriUF { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath fill } def
+/TriD { stroke [] 0 setdash 2 copy vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath stroke
+ Pnt } def
+/TriDF { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath fill} def
+/DiaF { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath fill } def
+/Pent { stroke [] 0 setdash 2 copy gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath stroke grestore Pnt } def
+/PentF { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath fill grestore } def
+/Circle { stroke [] 0 setdash 2 copy
+ hpt 0 360 arc stroke Pnt } def
+/CircleF { stroke [] 0 setdash hpt 0 360 arc fill } def
+/C0 { BL [] 0 setdash 2 copy moveto vpt 90 450 arc } bind def
+/C1 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C2 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C3 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C4 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 180 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C5 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc
+ 2 copy moveto
+ 2 copy vpt 180 270 arc closepath fill
+ vpt 0 360 arc } bind def
+/C6 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C7 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C8 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 270 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C9 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 270 450 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C10 { BL [] 0 setdash 2 copy 2 copy moveto vpt 270 360 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 90 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C11 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 180 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 270 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C12 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 180 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C13 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 180 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C14 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 360 arc closepath fill
+ vpt 0 360 arc } bind def
+/C15 { BL [] 0 setdash 2 copy vpt 0 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/Rec { newpath 4 2 roll moveto 1 index 0 rlineto 0 exch rlineto
+ neg 0 rlineto closepath } bind def
+/Square { dup Rec } bind def
+/Bsquare { vpt sub exch vpt sub exch vpt2 Square } bind def
+/S0 { BL [] 0 setdash 2 copy moveto 0 vpt rlineto BL Bsquare } bind def
+/S1 { BL [] 0 setdash 2 copy vpt Square fill Bsquare } bind def
+/S2 { BL [] 0 setdash 2 copy exch vpt sub exch vpt Square fill Bsquare } bind def
+/S3 { BL [] 0 setdash 2 copy exch vpt sub exch vpt2 vpt Rec fill Bsquare } bind def
+/S4 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt Square fill Bsquare } bind def
+/S5 { BL [] 0 setdash 2 copy 2 copy vpt Square fill
+ exch vpt sub exch vpt sub vpt Square fill Bsquare } bind def
+/S6 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill Bsquare } bind def
+/S7 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill
+ 2 copy vpt Square fill
+ Bsquare } bind def
+/S8 { BL [] 0 setdash 2 copy vpt sub vpt Square fill Bsquare } bind def
+/S9 { BL [] 0 setdash 2 copy vpt sub vpt vpt2 Rec fill Bsquare } bind def
+/S10 { BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt Square fill
+ Bsquare } bind def
+/S11 { BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt2 vpt Rec fill
+ Bsquare } bind def
+/S12 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill Bsquare } bind def
+/S13 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+ 2 copy vpt Square fill Bsquare } bind def
+/S14 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+ 2 copy exch vpt sub exch vpt Square fill Bsquare } bind def
+/S15 { BL [] 0 setdash 2 copy Bsquare fill Bsquare } bind def
+/D0 { gsave translate 45 rotate 0 0 S0 stroke grestore } bind def
+/D1 { gsave translate 45 rotate 0 0 S1 stroke grestore } bind def
+/D2 { gsave translate 45 rotate 0 0 S2 stroke grestore } bind def
+/D3 { gsave translate 45 rotate 0 0 S3 stroke grestore } bind def
+/D4 { gsave translate 45 rotate 0 0 S4 stroke grestore } bind def
+/D5 { gsave translate 45 rotate 0 0 S5 stroke grestore } bind def
+/D6 { gsave translate 45 rotate 0 0 S6 stroke grestore } bind def
+/D7 { gsave translate 45 rotate 0 0 S7 stroke grestore } bind def
+/D8 { gsave translate 45 rotate 0 0 S8 stroke grestore } bind def
+/D9 { gsave translate 45 rotate 0 0 S9 stroke grestore } bind def
+/D10 { gsave translate 45 rotate 0 0 S10 stroke grestore } bind def
+/D11 { gsave translate 45 rotate 0 0 S11 stroke grestore } bind def
+/D12 { gsave translate 45 rotate 0 0 S12 stroke grestore } bind def
+/D13 { gsave translate 45 rotate 0 0 S13 stroke grestore } bind def
+/D14 { gsave translate 45 rotate 0 0 S14 stroke grestore } bind def
+/D15 { gsave translate 45 rotate 0 0 S15 stroke grestore } bind def
+/DiaE { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath stroke } def
+/BoxE { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath stroke } def
+/TriUE { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath stroke } def
+/TriDE { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath stroke } def
+/PentE { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath stroke grestore } def
+/CircE { stroke [] 0 setdash
+ hpt 0 360 arc stroke } def
+/Opaque { gsave closepath 1 setgray fill grestore 0 setgray closepath } def
+/DiaW { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V Opaque stroke } def
+/BoxW { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V Opaque stroke } def
+/TriUW { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V Opaque stroke } def
+/TriDW { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V Opaque stroke } def
+/PentW { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ Opaque stroke grestore } def
+/CircW { stroke [] 0 setdash
+ hpt 0 360 arc Opaque stroke } def
+/BoxFill { gsave Rec 1 setgray fill grestore } def
+end
+%%EndProlog
+gnudict begin
+gsave
+50 50 translate
+0.050 0.050 scale
+0 setgray
+newpath
+(Helvetica) findfont 140 scalefont setfont
+1.000 UL
+LTb
+630 420 M
+63 0 V
+6269 0 R
+-63 0 V
+546 420 M
+(0) Rshow
+630 1020 M
+63 0 V
+6269 0 R
+-63 0 V
+-6353 0 R
+(50) Rshow
+630 1620 M
+63 0 V
+6269 0 R
+-63 0 V
+-6353 0 R
+(100) Rshow
+630 2220 M
+63 0 V
+6269 0 R
+-63 0 V
+-6353 0 R
+(150) Rshow
+630 2820 M
+63 0 V
+6269 0 R
+-63 0 V
+-6353 0 R
+(200) Rshow
+630 3420 M
+63 0 V
+6269 0 R
+-63 0 V
+-6353 0 R
+(250) Rshow
+630 4020 M
+63 0 V
+6269 0 R
+-63 0 V
+-6353 0 R
+(300) Rshow
+630 4620 M
+63 0 V
+6269 0 R
+-63 0 V
+-6353 0 R
+(350) Rshow
+630 420 M
+0 63 V
+0 4137 R
+0 -63 V
+630 280 M
+(0) Cshow
+1263 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(10) Cshow
+1896 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(20) Cshow
+2530 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(30) Cshow
+3163 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(40) Cshow
+3796 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(50) Cshow
+4429 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(60) Cshow
+5062 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(70) Cshow
+5696 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(80) Cshow
+6329 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(90) Cshow
+6962 420 M
+0 63 V
+0 4137 R
+0 -63 V
+0 -4277 R
+(100) Cshow
+1.000 UL
+LTb
+630 420 M
+6332 0 V
+0 4200 V
+-6332 0 V
+630 420 L
+140 2520 M
+currentpoint gsave translate 90 rotate 0 0 M
+(Bytes) Cshow
+grestore
+3796 70 M
+(CVS revision of <sys/buf.h>) Cshow
+3796 4830 M
+(Sizeof\(struct buf\)) Cshow
+1.000 UL
+LT0
+693 1764 M
+64 384 V
+63 0 V
+63 0 V
+64 -96 V
+63 0 V
+63 0 V
+64 816 V
+63 0 V
+63 0 V
+64 768 V
+63 48 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 48 V
+63 96 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 -48 V
+63 0 V
+63 -48 V
+64 0 V
+63 0 V
+63 96 V
+64 0 V
+63 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 48 V
+64 0 V
+63 48 V
+63 96 V
+64 -48 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+63 0 V
+64 96 V
+63 -96 V
+63 -48 V
+64 48 V
+63 0 V
+63 384 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+64 0 V
+63 0 V
+63 0 V
+63 48 V
+64 0 V
+63 0 V
+63 96 V
+64 96 V
+63 0 V
+stroke
+grestore
+end
+showpage
+%%Trailer
+%%DocumentFonts: Helvetica
diff --git a/share/doc/papers/contents/Makefile b/share/doc/papers/contents/Makefile
new file mode 100644
index 000000000000..454fff5ee5a9
--- /dev/null
+++ b/share/doc/papers/contents/Makefile
@@ -0,0 +1,6 @@
+VOLUME= papers
+DOC= contents
+SRCS= contents.ms
+MACROS= -ms
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/contents/contents.ms b/share/doc/papers/contents/contents.ms
new file mode 100644
index 000000000000..271b05166111
--- /dev/null
+++ b/share/doc/papers/contents/contents.ms
@@ -0,0 +1,216 @@
+.\" Copyright (c) 1996 FreeBSD Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.OH '''Papers Contents'
+.EH 'Papers Contents'''
+.TL
+UNIX Papers coming with FreeBSD
+.PP
+These papers are of both historic and current interest, but most of them are
+many years old.
+More recent documentation is available from
+.>> <a href="https://www.freebsd.org/docs/">
+https://www.FreeBSD.org/docs/
+.>> </a>
+
+.IP
+.tl '\fBBerkeley Pascal''px\fP'
+.if !r.U .nr .U 0
+.if \n(.U \{\
+.br
+.>> <a href="px.html">px.html</a>
+.\}
+.QP
+Berkeley Pascal
+PX Implementation Notes
+.br
+Version 2.0
+.sp
+.\" NOTE(review): a stray copy of the Disk Performance paper's title
+.\" (duplicated from the diskperf entry below) was removed here.
+.sp
+William N. Joy, M. Kirk McKusick.
+.sp
+Revised January, 1979.
+
+.sp
+.IP
+.tl '\fBDisk Performance''diskperf\fP'
+.if \n(.U \{\
+.br
+.>> <a href="diskperf.html">diskperf.html</a>
+.\}
+.QP
+Performance Effects of Disk Subsystem Choices
+for VAX\(dg Systems Running 4.2BSD UNIX.
+.sp
+Bob Kridle, Marshall Kirk McKusick.
+.sp
+Revised July 27, 1983.
+
+.sp
+.IP
+.tl '\fBTune the 4.2BSD Kernel''kerntune\fP'
+.if \n(.U \{\
+.br
+.>> <a href="kerntune.html">kerntune.html</a>
+.\}
+.QP
+Using gprof to Tune the 4.2BSD Kernel.
+.sp
+Marshall Kirk McKusick.
+.sp
+Revised May 21, 1984 (?).
+
+.sp
+.IP
+.tl '\fBNew Virtual Memory''newvm\fP'
+.if \n(.U \{\
+.br
+.>> <a href="newvm.html">newvm.html</a>
+.\}
+.QP
+A New Virtual Memory Implementation for Berkeley.
+.sp
+Marshall Kirk McKusick, Michael J. Karels.
+.sp
+Revised 1986.
+
+.sp
+.IP
+.tl '\fBKernel Malloc''kernmalloc\fP'
+.if \n(.U \{\
+.br
+.>> <a href="kernmalloc.html">kernmalloc.html</a>
+.\}
+.QP
+Design of a General Purpose Memory Allocator for the 4.3BSD UNIX Kernel.
+.sp
+Marshall Kirk McKusick, Michael J. Karels.
+.sp
+Reprinted from:
+\fIProceedings of the San Francisco USENIX Conference\fP,
+pp. 295-303, June 1988.
+
+.sp
+.IP
+.tl '\fBRelease Engineering''relengr\fP'
+.if \n(.U \{\
+.br
+.>> <a href="releng.html">releng.html</a>
+.\}
+.QP
+The Release Engineering of 4.3\s-1BSD\s0.
+.sp
+Marshall Kirk McKusick, Michael J. Karels, Keith Bostic.
+.sp
+Revised 1989.
+
+.sp
+.IP
+.tl '\fBBeyond 4.3BSD''beyond4.3\fP'
+.if \n(.U \{\
+.br
+.>> <a href="beyond43.html">beyond43.html</a>
+.\}
+.QP
+Current Research by The Computer Systems Research Group of Berkeley.
+.sp
+Marshall Kirk McKusick, Michael J Karels, Keith Sklower, Kevin Fall,
+Marc Teitelbaum, Keith Bostic.
+.sp
+Revised February 2, 1989.
+
+.sp
+.IP
+.tl '\fBFilesystem Interface''fsinterface\fP'
+.if \n(.U \{\
+.br
+.>> <a href="fsinterface.html">fsinterface.html</a>
+.\}
+.QP
+Toward a Compatible Filesystem Interface.
+.sp
+Michael J. Karels, Marshall Kirk McKusick.
+.sp
+Conference of the European Users' Group, September 1986.
+Last modified April 16, 1991.
+
+.sp
+.IP
+.tl '\fBSystem Performance''sysperf\fP'
+.if \n(.U \{\
+.br
+.>> <a href="sysperf.html">sysperf.html</a>
+.\}
+.QP
+Measuring and Improving the Performance of Berkeley UNIX.
+.sp
+Marshall Kirk McKusick, Samuel J. Leffler, Michael J. Karels.
+.sp
+Revised April 17, 1991.
+
+.sp
+.IP
+.tl '\fBNot Quite NFS''nqnfs\fP'
+.if \n(.U \{\
+.br
+.>> <a href="nqnfs.html">nqnfs.html</a>
+.\}
+.QP
+Not Quite NFS, Soft Cache Consistency for NFS.
+.sp
+Rick Macklem.
+.sp
+Reprinted with permission from the "Proceedings of the Winter 1994 Usenix
+Conference", January 1994, San Francisco.
+
+.sp
+.IP
+.tl '\fBMalloc(3)''malloc\fP'
+.if \n(.U \{\
+.br
+.>> <a href="malloc.html">malloc.html</a>
+.\}
+.QP
+Malloc(3) in modern Virtual Memory environments.
+.sp
+Poul-Henning Kamp.
+.sp
+Revised April 5, 1996.
+
+.sp
+.IP
+.tl '\fBJails: Confining the omnipotent root''jail\fP'
+.if \n(.U \{\
+.br
+.>> <a href="jail.html">jail.html</a>
+.\}
+.QP
+The jail system call sets up a jail and locks the current process in it.
+.sp
+Poul-Henning Kamp, Robert N. M. Watson.
+.sp
+This paper was presented at the 2nd International System Administration
+and Networking Conference "SANE 2000" May 22-25, 2000 in Maastricht,
+The Netherlands and is published in the proceedings.
diff --git a/share/doc/papers/devfs/Makefile b/share/doc/papers/devfs/Makefile
new file mode 100644
index 000000000000..656e3cd89c2c
--- /dev/null
+++ b/share/doc/papers/devfs/Makefile
@@ -0,0 +1,7 @@
+VOLUME= papers
+DOC= devfs
+SRCS= paper.me
+MACROS= -me
+USE_PIC=
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/devfs/paper.me b/share/doc/papers/devfs/paper.me
new file mode 100644
index 000000000000..9cfae72bdd2c
--- /dev/null
+++ b/share/doc/papers/devfs/paper.me
@@ -0,0 +1,1276 @@
+.\" format with ditroff -me
+.\" format made to look as a paper for the proceedings is to look
+.\" (as specified in the text)
+.if n \{ .po 0
+. ll 78n
+. na
+.\}
+.if t \{ .po 1.0i
+. ll 6.5i
+. nr pp 10 \" text point size
+. nr sp \n(pp+2 \" section heading point size
+. nr ss 1.5v \" spacing before section headings
+.\}
+.nr tm 1i
+.nr bm 1i
+.nr fm 2v
+.he ''''
+.de bu
+.ip \0\s-2\(bu\s+2
+..
+.lp
+.rs
+.ce 5
+.sp
+.sz 14
+.b "Rethinking /dev and devices in the UNIX kernel"
+.sz 12
+.sp
+.i "Poul-Henning Kamp"
+.sp .1
+.i "<phk@FreeBSD.org>"
+.i "The FreeBSD Project"
+.i
+.sp 1.5
+.b Abstract
+.lp
+An outstanding novelty in UNIX at its introduction was the notion
+of ``a file is a file is a file and even a device is a file.''
+Going from ``hardware only changes when the DEC Field engineer is here''
+to ``my toaster has USB'' has put serious strain on the rather crude
+implementation of the ``devices as files'' concept, an implementation which
+has survived practically unchanged for 30 years in most UNIX variants.
+Starting from a high-level view of devices and the semantics that
+have grown around them over the years, this paper takes the audience on a
+grand tour of the redesigned FreeBSD device-I/O system,
+to convey an overview of how it all fits together, and to explain why
+things ended up as they did, how to use the new features and
+in particular how not to.
+.sp
+.if t \{
+.2c
+.\}
+.\" end boilerplate... paper starts here.
+.sh 1 "Introduction"
+.sp
+There are really only two fundamental ways to conceptualise
+I/O devices in an operating system:
+The usual way and the UNIX way.
+.lp
+The usual way is to treat I/O devices as their own class of things,
+possibly several classes of things, and provide APIs tailored
+to the semantics of the devices.
+In practice this means that a program must know what it is dealing
+with, it has to interact with disks one way, tapes another and
+rodents yet a third way, all of which are different from how it
+interacts with a plain disk file.
+.lp
+The UNIX way has never been described better than in the very first
+paper
+published on UNIX by Ritchie and Thompson [Ritchie74]:
+.(q
+Special files constitute the most unusual feature of the UNIX filesystem.
+Each supported I/O device is associated with at least one such file.
+Special files are read and written just like ordinary disk files,
+but requests to read or write result in activation of the associated device.
+An entry for each special file resides in directory /dev,
+although a link may be made to one of these files just as it may to an
+ordinary file.
+Thus, for example, to write on a magnetic tape one may write on the file /dev/mt.
+
+Special files exist for each communication line, each disk, each tape drive,
+and for physical main memory.
+Of course, the active disks and the memory special files are protected from indiscriminate access.
+
+There is a threefold advantage in treating I/O devices this way:
+file and device I/O are as similar as possible;
+file and device names have the same syntax and meaning,
+so that a program expecting a file name as a parameter can be passed a device name;
+finally, special files are subject to the same protection mechanism as regular files.
+.)q
+.lp
+.\" (Why was this so special at the time?)
+At the time, this was quite a strange concept; it was totally accepted
+for instance, that neither the system administrator nor the users were
+able to interact with a disk as a disk.
+Operating systems simply
+did not provide access to disk other than as a filesystem.
+Most vendors did not even release a program to initialise a
+disk-pack with a filesystem: selling pre-initialised and ``quality
+tested'' disk-packs was quite a profitable business.
+.lp
+In many cases some kind of API for reading and
+writing individual sectors on a disk pack
+did exist in the operating system,
+but more often than not
+it was not listed in the public documentation.
+.sh 2 "The traditional implementation"
+.lp
+.\" (Explain how opening /dev/lpt0 lands you in the right device driver)
+The initial implementation used hardcoded inode numbers [Ritchie98].
+The console
+device would be inode number 5, the paper-tape-punch number 6 and so on,
+even if those inodes were also actual regular files in the filesystem.
+.lp
+For reasons one can only too vividly imagine, this was changed and
+Thompson
+[Thompson78]
+describes how the implementation now used ``major and minor''
+device numbers to index though the devsw array to the correct device driver.
+.lp
+For all intents and purposes, this is the implementation which survives
+in most UNIX-like systems even to this day.
+Apart from the access control and timestamp information which is
+found in all inodes, the special inodes in the filesystem contain only
+one piece of information: the major and minor device numbers, often
+logically OR'ed to one field.
+.lp
+When a program opens a special file, the kernel uses the major number
+to find the entry points in the device driver, and passes the combined
+major and minor numbers as a parameter to the device driver.
+.sh 1 "The challenge"
+.lp
+Now, we did not talk much about where the special inodes came from
+to begin with.
+They were created by hand, using the
+mknod(2) system call, usually through the mknod(8) program.
+.lp
+In those days a
+computer had a very static hardware configuration\**
+.(f
+\** Unless your assigned field engineer was present on site.
+.)f
+and it certainly did not
+change while the system was up and running, so creating device nodes
+by hand was certainly an acceptable solution.
+.lp
+The first sign that this would not hold up as a solution came with
+the advent of TCP/IP and the telnet(1) program, or more precisely
+with the telnetd(8) daemon.
+In order to support remote login a ``pseudo-tty'' device driver was implemented,
+basically as tty driver which instead of hardware had another device which
+would allow a process to ``act as hardware'' for the tty.
+The telnetd(8) daemon would read and write data on the ``master'' side of
+the pseudo-tty and the user would be running on the ``slave'' side,
+which would act just like any other tty: you could change the erase
+character if you wanted to and all the signals and all that stuff worked.
+.lp
+Obviously with a device requiring no hardware, you can compile as many
+instances into the kernel as you like, as long as you do not use
+too much memory.
+As system after system was connected
+to the ARPANet, ``increasing number of ptys'' became a regular task
+for system administrators, and part of this task was to create
+more special nodes in the filesystem.
+.lp
+Several UNIX vendors also noticed an issue when they sold minicomputers
+in many different configurations: explaining to system administrators
+just which special nodes they would need and how to create them were
+a significant documentation hassle. Some opted for the simple solution
+and pre-populated /dev with every conceivable device node, resulting
+in a predictable slowdown on access to filenames in /dev.
+.lp
+System V UNIX provided a band-aid solution:
+a special boot sequence would take effect if the kernel or
+the hardware had changed since last reboot.
+This boot procedure would
+amongst other things create the necessary special files in the filesystem,
+based on an intricate system of per device driver configuration files.
+.lp
+In the recent years, we have become used to hardware which changes
+configuration at any time: people plug USB, Firewire and PCCard
+devices into their computers.
+These devices can be anything from modems and disks to GPS receivers
+and fingerprint authentication hardware.
+Suddenly maintaining the
+correct set of special devices in ``/dev'' became a major headache.
+.lp
+Along the way, UNIX kernels had learned to deal with multiple filesystem
+types [Heidemann91a] and a ``device-pseudo-filesystem'' was a pretty
+obvious idea.
+The device drivers have a pretty good idea which
+devices they have found in the configuration, so all that is needed is
+to present this information as a filesystem filled with just the right
+special files.
+Experience has shown that this, like most other ``pseudo
+filesystems'', sounds a lot simpler in theory than in practice.
+.sh 1 "Truly understanding devices"
+.lp
+Before we continue, we need to fully understand the
+``device special file'' in UNIX.
+.lp
+First we need to realize that a special file has the nature of
+a pointer from the filesystem into a different namespace;
+a little understood fact with far reaching consequences.
+.lp
+One implication of this is that several special files can
+exist in the filename namespace all pointing to the same device
+but each having their own access and timestamp attributes:
+.lp
+.(b M
+.vs -3
+\fC\s-3guest# ls -l /dev/fd0 /tmp/fd0
+crw-r----- 1 root operator 9, 0 Sep 27 19:21 /dev/fd0
+crw-rw-rw- 1 root wheel 9, 0 Sep 27 19:24 /tmp/fd0\fP\s+3
+.vs +3
+.)b
+Obviously, the administrator needs to be on top of this:
+one popular way to exploit an unguarded root prompt is
+to create a replica of the special file /dev/kmem
+in a location where it will not be noticed.
+Since /dev/kmem gives access to the kernel memory,
+gaining any particular
+privilege can be arranged by suitably modifying the kernel's
+data structures through the illicit special file.
+.lp
+When NFS appeared it opened a new avenue for this attack:
+People may have root privilege on one machine but not another.
+Since device nodes are not interpreted on the NFS server
+but rather on the local computer,
+a user with root privilege on a NFS client
+computer can create a device node to his liking on a filesystem
+mounted from an NFS server.
+This device node can in turn be used to
+circumvent the security of other computers which mount that filesystem,
+including the server, unless they protect themselves by not
+trusting any device entries on untrusted filesystems by mounting such
+filesystems with the \fCnodev\fP mount-option.
+.lp
+The fact that the device itself does not actually exist inside the
+filesystem which holds the special file makes it possible
+to perform boot-strapping stunts in the spirit
+of Baron Von Münchausen [raspe1785],
+where a filesystem is (re)mounted using one of its own
+device vnodes:
+.(b M
+.vs -3
+\fC\s-2guest# mount -o ro /dev/fd0 /mnt
+guest# fsck /mnt/dev/fd0
+guest# mount -u -o rw /mnt/dev/fd0 /mnt\fP\s+2
+.vs +3
+.)b
+.lp
+Other interesting details are chroot(2) and jail(2) [Kamp2000] which
+provide filesystem isolation for process-trees.
+Whereas chroot(2) was not implemented as a security tool [Mckusick1999]
+(although it has been widely used as such), the jail(2) security
+facility in FreeBSD provides a pretty convincing ``virtual machine''
+where even the root privilege is isolated and restricted to the designated
+area of the machine.
+Obviously chroot(2) and jail(2) may require access to a well-defined
+subset of devices like /dev/null, /dev/zero and /dev/tty,
+whereas access to other devices such as /dev/kmem
+or any disks could be used to compromise the integrity of the jail(2)
+confinement.
+.lp
+For a long time FreeBSD, like almost all UNIX-like systems had two kinds
+of devices, ``block'' and
+``character'' special files, the difference being that ``block''
+devices would provide caching and alignment for disk device access.
+This was one of those minor architectural mistakes which took
+forever to correct.
+.lp
+The argument that block devices were a mistake is really very
+very simple: Many devices other than disks have multiple modes
+of access which you select by choosing which special file to use.
+.lp
+Pick any old timer and he will be able to recite painful
+sagas about the crucial difference between the /dev/rmt
+and /dev/nrmt devices for tape access.\**
+.(f
+\** Make absolutely sure you know the difference before you take
+important data on a multi-file 9-track tape to remote locations.
+.)f
+.lp
+Tapes, asynchronous ports, line printer ports and many other devices
+have implemented submodes, selectable by the user
+at a special filename level, but that has not earned them their
+own special file types.
+Only disks\**
+.(f
+\** Well, OK: and some 9-track tapes.
+.)f
+have enjoyed the privilege of getting an entire file type dedicated to
+a minor device mode.
+.lp
+Caching and alignment modes should have been enabled by setting
+some bit in the minor device number on the disk special file,
+not by polluting the filesystem code with another file type.
+.lp
+In FreeBSD block devices were not even implemented in a fashion
+which would be of any use, since any write errors would never be
+reported to the writing process. For this reason, and since no
+applications
+were found to be in existence which relied on block devices
+and since historical usage was indeed historical [Mckusick2000],
+block devices were removed from the FreeBSD system.
+This greatly simplified the task of keeping track of open(2)
+reference counts for disks and
+removed much magic special-case code throughout.
+.lp
+.sh 1 "Files, sockets, pipes, SVID IPC and devices"
+.sp
+It is an instructive lesson in inconsistency to look at the
+various types of ``things'' a process can access in UNIX-like
+systems today.
+.lp
+First there are normal files, which are our reference yardstick here:
+they are accessed with open(2), read(2), write(2), mmap(2), close(2)
+and various other auxiliary system calls.
+.lp
+Sockets and pipes are also accessed via file handles but each has
+its own namespace. That means you cannot open(2) a socket,\**
+.(f
+\** This is particularly bizarre in the case of UNIX domain sockets
+which use the filesystem as their namespace and appear in directory
+listings.
+.)f
+but you can read(2) and write(2) to it.
+Sockets and pipes vector off at the file descriptor level and do
+not get in touch with the vnode based part of the kernel at all.
+.lp
+Devices land somewhere in the middle between pipes and sockets on
+one side and normal files on the other.
+They use the filesystem
+namespace, are implemented with vnodes, and can be operated
+on like normal files, but don't actually live in the filesystem.
+.lp
+Devices are in fact special-cased all the way through the vnode system.
+For one thing devices break the ``one file-one vnode''
+rule, making it necessary to chain all vnodes for the same
+device together in
+order to be able to find ``the canonical vnode for this device node'',
+but more importantly, many operations have to be specifically denied
+on special file vnodes since they do not make any sense.
+.lp
+For true inconsistency, consider the SVID IPC mechanisms - not
+only do they not operate via file handles,
+but they also sport a singularly
+ill-conceived 32 bit numeric namespace and a dedicated set of
+system calls for access.
+.lp
+Several people have convincingly argued that this is an inconsistent
+mess, and have proposed and implemented more consistent operating systems
+like the Plan9 from Bell Labs [Pike90a] [Pike92a].
+Unfortunately reality is that people are not interested in learning a new
+operating system when the one they have is pretty darn good, and
+consequently research into better and more consistent ways is
+a pretty frustrating [Pike2000] but by no means irrelevant topic.
+.sh 1 "Solving the /dev maintenance problem"
+.lp
+There are a number of obvious, simple but wrong ways one could
+go about solving the ``/dev'' maintenance problem.
+.lp
+The very straightforward way is to hack the namei() kernel function
+responsible for filename translation and lookup.
+It is only a minor matter of programming to
+add code to special-case any lookup which ends up in ``/dev''.
+But this leads to problems: in the case of chroot(2) or jail(2), the
+administrator will want to present only a subset of the available
+devices in ``/dev'', so some kind of state will have to be kept per
+chroot(2)/jail(2) about which devices are visible and
+which devices are hidden, but no obvious location for this information
+is available in the absence of a mount data structure.
+.lp
+It also leads to some unpleasant issues
+because of the fact that ``/dev/foo'' is a synthesised directory
+entry which may or may not actually be present on the filesystem
+which seems to provide ``/dev''.
+The vnodes either have to belong to a filesystem or they
+must be special-cased throughout the vnode layer of the kernel.
+.lp
+Finally there is the simple matter of generality:
+hardcoding the string "/dev" in the kernel is very general.
+.lp
+A cruder solution is to leave it to a daemon: make a special
+device driver, have a daemon read messages from it and create and
+destroy nodes in ``/dev'' in response to these messages.
+.lp
+The main drawback to this idea is that now we have added IPC
+to the mix introducing new and interesting race conditions.
+.lp
+Otherwise this solution is surprisingly effective,
+but chroot(2)/jail(2) requirements prevent a simple implementation
+and running a daemon per jail would become an administrative
+nightmare.
+.lp
+Another pitfall of
+this approach is that we are not able to remount the root filesystem
+read-write at boot until we have a device node for the root device,
+but if this node is missing we cannot create it with a daemon since
+the root filesystem (and hence /dev) is read-only.
+Adding a read-write memory-filesystem mount /dev to solve this problem
+does not improve
+the architectural qualities further and certainly the KISS principle has
+been violated by now.
+.lp
+The final and in the end only satisfactory solution is to write a ``DEVFS''
+which mounts on ``/dev''.
+.lp
+The good news is that it does solve the problem with chroot(2) and jail(2):
+just mount a DEVFS instance on the ``dev'' directory inside the filesystem
+subtree where the chroot or jail lives. Having a mountpoint gives us
+a convenient place to keep track of the local state of this DEVFS mount.
+.lp
+The bad news is that it takes a lot of cleanup and care to implement
+a DEVFS into a UNIX kernel.
+.sh 1 "DEVFS architectural decisions"
+.lp
+Before implementing a DEVFS, it is necessary to decide on a range
+of corner cases in behaviour, and some of these choices have proved
+surprisingly hard to settle for the FreeBSD project.
+.sh 2 "The ``persistence'' issue"
+.lp
+When DEVFS in FreeBSD was initially presented at a BoF at the 1995
+USENIX Technical Conference in New Orleans,
+a group of people demanded that it provide ``persistence''
+for administrative changes.
+.lp
+When trying to get a definition of ``persistence'', people can generally
+agree that if the administrator changes the access control bits of
+a device node, they want that mode to survive across reboots.
+.lp
+Once more tricky examples of the sort of manipulations one can do
+on special files are proposed, people rapidly disagree about what
+should be supported and what should not.
+.lp
+For instance, imagine a
+system with one floppy drive which appears in DEVFS as ``/dev/fd0''.
+Now the administrator, in order to get some badly written software
+to run, links this to ``/dev/fd1'':
+.(b M
+\fC\s-2ln /dev/fd0 /dev/fd1\fP\s+2
+.)b
+This works as expected and with persistence in DEVFS, the link is
+still there after a reboot.
+But what if after a reboot another floppy drive has been connected
+to the system?
+This drive would naturally have the name ``/dev/fd1'',
+but this name is now occupied by the administrator's hard link.
+Should the link be broken?
+Should the new floppy drive be called
+``/dev/fd2''? Nobody can agree on anything but the ugliness of the
+situation.
+.lp
+Given that we are no longer dependent on DEC Field engineers to
+change all four wheels to see which one is flat, the basic assumption
+that the machine has a constant hardware configuration is simply no
+longer true.
+The new assumption one should start from when analysing this
+issue is that when the system boots, we cannot know what devices we
+will find, and we can not know if the devices we do find
+are the same ones we had when the system was last shut down.
+.lp
+And in fact, this is very much the case with laptops today: if I attach
+my IOmega Zip drive to my laptop it appears like a SCSI disk named
+``/dev/da0'', but so does the RAID-5 array attached to the PCI SCSI controller
+installed in my laptop's docking station. If I change mode to ``a+rw''
+on the Zip drive, do I want that mode to apply to the RAID-5 as well?
+Unlikely.
+.lp
+And what if we have persistent information about the mode of
+device ``/dev/sio0'', but we boot and do not find any sio devices?
+Do we keep the information in our device-persistence registry?
+How long do we keep it? If I borrow a modem card,
+set the permissions to some non-standard value like 0666,
+and then attach some other serial device a year from now - do I
+want some old permissions changes to come back and haunt me,
+just because they both happened to be ``/dev/sio0''?
+Unlikely.
+.lp
+The fact that more people have laptop computers today than
+five years ago, and the fact that nobody has been able to credibly
+propose where a persistent DEVFS would actually store the
+information about these things in the first place has settled the issue.
+.lp
+Persistence may be the right answer, but to the
+wrong question: persistence is not a desirable property for a DEVFS
+when the hardware configuration may change literally at any time.
+.sh 2 "Who decides on the names?"
+.lp
+In a DEVFS-enabled system, the responsibility for creating nodes in
+/dev shifts to the device drivers, and consequently the device
+drivers get to choose the names of the device files.
+In addition an initial value for owner, group and mode bits are
+provided by the device driver.
+.lp
+But should it be possible to rename ``/dev/lpt0'' to ``/dev/myprinter''?
+While the obvious affirmative answer is easy to arrive at, it leaves
+a lot to be desired once the implications are unmasked.
+.lp
+Most device drivers know their own name and use it purposefully in
+their debug and log messages to identify themselves.
+Furthermore, the ``NewBus'' [NewBus] infrastructure facility,
+which ties hardware to device drivers, identifies things by name
+and unit numbers.
+.lp
+A very common way to report errors in fact:
+.(b M
+.vs -3
+\fC\s-2#define LPT_NAME "lpt" /* our official name */
+[...]
+printf(LPT_NAME
+ ": cannot alloc ppbus (%d)!", error);\fP\s+2
+.vs +3
+.)b
+.lp
+So despite the user renaming the device node pointing to the printer
+to ``myprinter'', this has absolutely no effect in the kernel and can
+be considered a userland aliasing operation.
+.lp
+The decision was therefore made that it should not be possible to rename
+device nodes since it would only lead to confusion and because the desired
+effect could be attained by giving the user the ability to create
+symlinks in DEVFS.
+.sh 2 "On-demand device creation"
+.lp
+Pseudo-devices like pty, tun and bpf,
+but also some real devices, may not pre-emptively create entries for all
+possible device nodes. It would be a pointless waste of resources
+to always create 1000 ptys just in case they are needed,
+and in the worst case more than 1800 device nodes would be needed per
+physical disk to represent all possible slices and partitions.
+.lp
+For pseudo-devices the task at hand is to make a magic device node,
+``/dev/pty'', which when opened will magically transmogrify into the
+first available pty subdevice, maybe ``/dev/pty123''.
+.lp
+Device submodes, on the other hand, work by having multiple
+entries in /dev, each with a different minor number, as a way to instruct
+the device driver in aspects of its operation. The most widespread
+example is probably ``/dev/mt0'' and ``/dev/nmt0'', where the node
+with the extra ``n''
+instructs the tape device driver to not rewind on close.\**
+.(f
+\** This is the answer to the question in footnote number 2.
+.)f
+.lp
+Some UNIX systems have solved the problem for pseudo-devices by
+creating magic cloning devices like ``/dev/tcp''.
+When a cloning device is opened,
+it finds a free instance and through vnode and file descriptor mangling
+return this new device to the opening process.
+.lp
+This scheme has two disadvantages: the complexity of switching vnodes
+in midstream is non-trivial, but even worse is the fact that it
+does not work for
+submodes for a device because it only reacts to one particular /dev entry.
+.lp
+The solution for both needs is a more flexible on-demand device
+creation, implemented in FreeBSD as a two-level lookup.
+When a
+filename is looked up in DEVFS, a match in the existing device nodes is
+sought first and if found, returned.
+If no match is found, device drivers are polled in turn to ask if
+they would be able to synthesise a device node of the given name.
+.lp
+The device driver gets a chance to modify the name
+and create a device with make_dev().
+If one of the drivers succeeds in this, the lookup is started over and
+the newly found device node is returned:
+.(b M
+.vs -3
+\fC\s-2pty_clone()
+ if (name != "pty")
+ return(NULL); /* no luck */
+ n = find_next_unit();
+ dev = make_dev(...,n,"pty%d",n);
+ name = dev->name;
+ return(dev);\fP\s+2
+.vs +3
+.)b
+.lp
+An interesting mixed use of this mechanism is with the sound device drivers.
+Modern sound devices have multiple channels, presumably to allow the
+user to listen to CNN, Napstered MP3 files and Quake sound effects at
+the same time.
+The only problem is that all applications attempt to open ``/dev/dsp''
+since they have no concept of multiple sound devices.
+The sound device drivers use the cloning facility to direct ``/dev/dsp''
+to the first available sound channel completely transparently to the
+process.
+.lp
+There are very few drawbacks to this mechanism, the major one being
+that ``ls /dev'' now errs on the sparse side instead of the rich when used
+as a system device inventory, a practice which has always been
+of dubious precision at best.
+.sh 2 "Deleting and recreating devices"
+.lp
+Deleting device nodes is no problem to implement, but as likely as not,
+some people will want a method to get them back.
+Since only the device driver knows how to create a given device,
+recreation cannot be performed solely on the basis of the parameters
+provided by a process in userland.
+.lp
+In order to not complicate the code which updates the directory
+structure for a mountpoint to reflect changes in the DEVFS inode list,
+a deleted entry is merely marked with DE_WHITEOUT instead of being
+removed entirely.
+Otherwise a separate list would be needed for inodes which we had
+deleted so that they would not be mistaken for new inodes.
+.lp
+The obvious way to recreate deleted devices is to let mknod(2) do it
+by matching the name and disregarding the major/minor arguments.
+Recreating the device with mknod(2) will simply remove the DE_WHITEOUT
+flag.
+.sh 2 "Jail(2), chroot(2) and DEVFS"
+.lp
+The primary requirement from facilities like jail(2) and chroot(2)
+is that it must be possible to control the contents of a DEVFS mount
+point.
+.lp
+Obviously, it would not be desirable for dynamic devices to pop
+into existence in the carefully pruned /dev of jails so it must be
+possible to mark a DEVFS mountpoint as ``no new devices''.
+And in the same way, the jailed root should not be able to recreate
+device nodes which the real root has removed.
+.lp
+These behaviours will be controlled with mount options, but these have not
+yet been implemented because FreeBSD has run out of bitmap flags for
+mount options, and a new unlimited mount option implementation is
+still not in place at the time of writing.
+.lp
+One mount option ``jaildevfs'', will restrict the contents of the
+DEVFS mountpoint to the ``normal set'' of devices for a jail and
+automatically hide all future devices and make it impossible
+for a jailed root to un-hide hidden entries while letting an un-jailed
+root do so.
+.lp
+Mounting or remounting read-only, will prevent all future
+devices from appearing and will make it impossible to
+hide or un-hide entries in the mountpoint.
+This is probably only useful for chroots or jails where no tty
+access is intended since cloning will not work either.
+.lp
+More mount options may be needed as more experience is gained.
+.sh 2 "Default mode, owner & group"
+.lp
+When a device driver creates a device node, and a DEVFS mount adds it
+to its directory tree, it needs to have some values for the access
+control fields: mode, owner and group.
+.lp
+Currently, the device driver specifies the initial values in the
+make_dev() call, but this is far from optimal.
+For one thing, embedding magic UIDs and GIDs in the kernel is simply
+bad style unless they are numerically zero.
+More seriously, they represent compile-time defaults which in these
+enlightened days is rather old-fashioned.
+.lp
+.sh 1 "Cleaning up before we build: struct specinfo and dev_t"
+.lp
+Most of the rest of the paper will be about the various challenges
+and issues in the implementation of DEVFS in FreeBSD.
+All of this should be applicable to other systems derived from
+4.4BSD-Lite as well.
+.lp
+POSIX has defined a type called ``dev_t'' which is the identity of a device.
+This is mainly for use in the few system calls which know about devices:
+stat(2), fstat(2) and mknod(2).
+A dev_t is constructed by logically OR'ing
+the major# and minor# for the device.
+Since those have been defined
+as having no overlapping bits, the major# and minor#
+can be retrieved from the dev_t by a simple masking operation.
+.lp
+Although the kernel had a well-defined concept of any particular
+device it did not have a data structure to represent "a device".
+The device driver has such a structure, traditionally called ``softc''
+but the high kernel does not (and should not!) have access to the
+device driver's private data structures.
+.lp
+It is an interesting tale how things got to be this way,\**
+.(f
+\** Basically, devices should have been moved up with sockets and
+pipes at the file descriptor level when the VFS layering was introduced,
+rather than have all the special casing throughout the vnode system.
+.)f
+but for now just record as
+a fact how the actual relationship between the data structures was
+in the 4.4BSD release (Fig. 1). [44BSDBook]
+.(z
+.PS 3
+F: box "file" "handle"
+arrow down from F.s
+V: box "vnode"
+arrow right from V.e
+S: box "specinfo"
+arrow down from V.s
+I: box "inode"
+arrow right from I.e
+C: box invis "devsw[]" "[major#]"
+arrow down from C.s
+D: box "device" "driver"
+line right from D.e
+box invis "softc[]" "[minor#]"
+F2: box "file" "handle" at F + (2.5,0)
+arrow down from F2.s
+V2: box "vnode"
+arrow right from V2.e
+S2: box "specinfo"
+arrow down from V2.s
+I2: box "inode"
+arrow left from I2.w
+.PE
+.ce 1
+Fig. 1 - Data structures in 4.4BSD
+.)z
+.lp
+As for all other files, a vnode references a filesystem inode, but
+in addition it points to a ``specinfo'' structure. In the inode
+we find the dev_t which is used to reference the device driver.
+.lp
+Access to the device driver happens by extracting the major# from
+the dev_t, indexing through the global devsw[] array to locate
+the device driver's entry point.
+.lp
+The device driver will extract the minor# from the dev_t and use
+that as the index into the softc array of private data per device.
+.lp
+The ``specinfo'' structure is a little sidekick vnodes grew underway,
+and is used to find all vnodes which reference the same device (i.e.
+they have the same major# and minor#).
+This linkage is used to determine
+which vnode is the ``chosen one'' for this device, and to keep track of
+open(2)/close(2) against this device.
+The actual implementation was an inefficient hash implementation,
+which depending on the vnode reclamation rate and /dev directory lookup
+traffic, may become a measurable performance liability.
+.sh 2 "The new vnode/inode/dev_t layout"
+.lp
+In the new layout (Fig. 2) the specinfo structure takes a central
+role. There is only one instance of struct specinfo per
+device (i.e. unique major#
+and minor# combination) and all vnodes referencing this device point
+to this structure directly.
+.(z
+.PS 2.25
+F: box "file" "handle"
+arrow down from F.s
+V: box "vnode"
+arrow right from V.e
+S: box "specinfo"
+arrow down from V.s
+I: box "inode"
+F2: box "file" "handle" at F + (2.5,0)
+arrow down from F2.s
+V2: box "vnode"
+arrow left from V2.w
+arrow down from V2.s
+I2: box "inode"
+arrow down from S.s
+D: box "device" "driver"
+.PE
+.ce 1
+Fig. 2 - The new FreeBSD data structures.
+.)z
+.lp
+In userland, a dev_t is still the logical OR of the major# and
+minor#, but this entity is now called a udev_t in the kernel.
+In the kernel a dev_t is now a pointer to a struct specinfo.
+.lp
+All vnodes referencing a device are linked to a list hanging
+directly off the specinfo structure, removing the need for the
+hash table and consequently simplifying and speeding up a lot
+of code dealing with vnode instantiation, retirement and
+name-caching.
+.lp
+The entry points to the device driver are stored in the specinfo
+structure, removing the need for the devsw[] array and allowing
+device drivers to use separate entrypoints for various minor numbers.
+.lp
+This is very convenient for devices which have a ``control''
+device for management and tuning. The control device almost always
+has entirely separate open/close/ioctl implementations [MD.C].
+.lp
+In addition to this, two data elements are included in the specinfo
+structure but ``owned'' by the device driver. Typically the
+device driver will store a pointer to the softc structure in
+one of these, and unit number or mode information in the other.
+.lp
+This removes the need for drivers to find the softc using array
+indexing based on the minor#, and at the same time has obviated
+the need for the compiled-in ``NFOO'' constants which traditionally
+determined how many softc structures and therefore devices
+the driver could support.\**
+.(f
+\** Not to mention all the drivers which implemented panic(2)
+because they forgot to perform bounds checking on the index before
+using it on their softc arrays.
+.)f
+.lp
+There are some trivial technical issues relating to allocating
+the storage for specinfo early in the boot sequence and how to
+find a specinfo from the udev_t/major#+minor#, but they will
+not be discussed here.
+.sh 2 "Creating and destroying devices"
+.lp
+Ideally, devices should only be created and
+destroyed by the device drivers which know what devices are present.
+This is accomplished with the make_dev() and destroy_dev()
+function calls.
+.lp
+Life is seldom quite that simple. The operating system might be called
+on to act as a NFS server for a diskless workstation, possibly even
+of a different architecture, so we still need to be able to represent
+device nodes with no device driver backing in the filesystems and
+consequently we need to be able to create a specinfo from
+the major#+minor# in these inodes when we encounter them.
+In practice this is quite trivial, but in a few places in the code
+one needs to be aware of the existence
+of both ``named'' and ``anonymous'' specinfo structures.
+.lp
+The make_dev() call creates a specinfo structure and populates
+it with driver entry points, major#, minor#, device node name
+(for instance ``lpt0''), UID, GID and access mode bits. The return
+value is a dev_t (i.e., a pointer to struct specinfo).
+If the device driver determines that the device is no longer
+present, it calls destroy_dev(), giving a dev_t as argument
+and the dev_t will be cleaned and converted to an anonymous dev_t.
+.lp
+Once created with make_dev() a named dev_t exists until destroy_dev()
+is called by the driver. The driver can rely on this and keep state
+in the fields in dev_t which is reserved for driver use.
+.sh 1 "DEVFS"
+.lp
+By now we have all the relevant information about each device node
+collected in struct specinfo but we still have one problem to
+solve before we can add the DEVFS filesystem on top of it.
+.sh 2 "The interrupt problem"
+.lp
+Some device drivers, notably the CAM/SCSI subsystem in FreeBSD
+will discover changes in the device configuration inside an interrupt
+routine.
+.lp
+This imposes some limitations on what can and should be done:
+first one should minimise the amount
+of work done in an interrupt routine for performance reasons;
+second, to avoid deadlocks, vnodes and mountpoints should not be
+accessed from an interrupt routine.
+.lp
+Also, in addition to the locking issue,
+a machine can have many instances of DEVFS mounted:
+for a jail(8) based virtual-machine web-server several hundred instances
+is not unheard of, making it far too expensive to update all of them
+in an interrupt routine.
+.lp
+The solution to this problem is to do all the filesystem work on
+the filesystem side of DEVFS and use atomically manipulated integer indices
+(``inode numbers'') as the barrier between the two sides.
+.lp
+The functions called from the device drivers, make_dev(), destroy_dev()
+&c. only manipulate the DEVFS inode number of the dev_t in
+question and do not even get near any mountpoints or vnodes.
+.lp
+For make_dev() the task is to assign a unique inode number to the
+dev_t and store the dev_t in the DEVFS-global inode-to-dev_t array.
+.(b M
+.vs -3
+\fC\s-2make_dev(...)
+ store argument values in dev_t
+ assign unique inode number to dev_t
+ atomically insert dev_t into inode_array\fP\s+2
+.vs +3
+.)b
+.lp
+For destroy_dev() the task is the opposite: clear the inode number
+in the dev_t and NULL the pointer in the devfs-global inode-to-dev_t
+array.
+.(b M
+.vs -3
+\fC\s-2destroy_dev(...)
+ clear fields in dev_t
+ zero dev_t inode number.
+ atomically clear entry in inode_array\fP\s+2
+.vs +3
+.)b
+.lp
+Both functions conclude by atomically incrementing a global variable
+\fCdevfs_generation\fP to leave an indication to the filesystem
+side that something has changed.
+.lp
+By modifying the global state only with atomic instructions, locks
+have been entirely avoided in this part of the code which means that
+the make_dev() and destroy_dev() functions can be called from practically
+anywhere in the kernel at any time.
+.lp
+On the filesystem side of DEVFS, the only two vnode methods which examine
+or rely on the directory structure, VOP_LOOKUP and VOP_READDIR,
+call the function devfs_populate() to update their mountpoint's view
+of the device hierarchy to match current reality before doing any work.
+.(b M
+.vs -3
+\fC\s-2devfs_readdir(...)
+ devfs_populate(...)
+ ...\fP\s+2
+.vs +3
+.)b
+.lp
+The devfs_populate() function compares the current \fCdevfs_generation\fP
+to the value saved in the mountpoint last time devfs_populate() completed
+and if (actually: while) they differ a linear run is made through the
+devfs-global inode-array and the directory tree of the mountpoint is
+brought up to date.
+.lp
+The actual code is slightly more complicated than shown in the pseudo-code
+here because it has to deal with subdirectories and hidden entries.
+.(b M
+.vs -3
+\fC\s-2devfs_populate(...)
+ while (mount->generation != devfs_generation)
+ for i in all inodes
+			if inode created
+ create directory entry
+ else if inode destroyed
+ remove directory entry
+.vs +3
+.)b
+.lp
+Access to the global DEVFS inode table is again implemented
+with atomic instructions and failsafe retries to avoid the
+need for locking.
+.lp
+From a performance point of view this scheme also means that a particular
+DEVFS mountpoint is not updated until it needs to be, and then always by
+a process belonging to the jail in question thus minimising and
+distributing the CPU load.
+.sh 1 "Device-driver impact"
+.lp
+All these changes have had a significant impact on how device drivers
+interact with the rest of the kernel regarding registration of
+devices.
+.lp
+If we look first at the ``before'' image in Fig. 3, we notice first
+the NFOO define which imposes a firm upper limit on the number of
+devices the kernel can deal with.
+Also notice that the softc structure for all of them is allocated
+at compile time.
+This is because most device drivers (and texts on writing device
+drivers) are from before the general
+kernel malloc facility [Mckusick1988] was introduced into the BSD kernel.
+.lp
+.(b M
+.vs -3
+\fC\s-2
+#ifndef NFOO
+# define NFOO 4
+#endif
+
+struct foo_softc {
+ ...
+} foo_softc[NFOO];
+
+int nfoo = 0;
+
+foo_open(dev, ...)
+{
+ int unit = minor(dev);
+ struct foo_softc *sc;
+
+ if (unit >= NFOO || unit >= nfoo)
+ return (ENXIO);
+
+ sc = &foo_softc[unit]
+
+ ...
+}
+
+foo_attach(...)
+{
+ struct foo_softc *sc;
+ static int once;
+
+ ...
+ if (nfoo >= NFOO) {
+ /* Have hardware, can't handle */
+ return (-1);
+ }
+ sc = &foo_softc[nfoo++];
+ if (!once) {
+ cdevsw_add(&cdevsw);
+ once++;
+ }
+ ...
+}
+\fP\s+2
+Fig. 3 - Device-driver, old style.
+.vs +3
+.)b
+.lp
+Also notice how range checking is needed to make sure that the
+minor# is inside range. This code gets more complex if device-numbering
+is sparse. Code equivalent to that shown in the foo_open() routine
+would also be needed in foo_read(), foo_write(), foo_ioctl() &c.
+.lp
+Finally notice how the attach routine needs to remember to register
+the cdevsw structure (not shown) when the first device is found.
+.lp
+Now, compare this to our ``after'' image in Fig. 4.
+NFOO is totally gone and so is the compile time allocation
+of space for softc structures.
+.lp
+The foo_open (and foo_close, foo_ioctl &c) functions can now
+derive the softc pointer directly from the dev_t they receive
+as an argument.
+.lp
+.(b M
+.vs -3
+\fC\s-2
+struct foo_softc {
+ ....
+};
+
+int nfoo;
+
+foo_open(dev, ...)
+{
+ struct foo_softc *sc = dev->si_drv1;
+
+ ...
+}
+
+foo_attach(...)
+{
+ struct foo_softc *sc;
+
+ ...
+ sc = MALLOC(..., M_ZERO);
+ if (sc == NULL) {
+ /* Have hardware, can't handle */
+ return (-1);
+ }
+ sc->dev = make_dev(&cdevsw, nfoo,
+ UID_ROOT, GID_WHEEL, 0644,
+ "foo%d", nfoo);
+ nfoo++;
+ sc->dev->si_drv1 = sc;
+ ...
+}
+\fP\s+2
+Fig. 4 - Device-driver, new style.
+.vs +3
+.)b
+.lp
+In foo_attach() we can now attach to all the devices we can
+allocate memory for and we register the cdevsw structure per
+dev_t rather than globally.
+.lp
+This last trick is what allows us to discard all bounds checking
+in the foo_open() &c. routines, because they can only be
+called through the cdevsw, and the cdevsw is only attached to
+dev_t's which foo_attach() has created.
+There is no way to end
+up in foo_open() with a dev_t not created by foo_attach().
+.lp
+In the two examples here, the difference is only 10 lines of source
+code, primarily because only one of the worker functions of the
+device driver is shown.
+In real device drivers it is not uncommon to save 50 or more lines
+of source code which typically is about a percent or two of the
+entire driver.
+.sh 1 "Future work"
+.lp
+Apart from some minor issues to be cleaned up, DEVFS is now a reality
+and future work therefore is likely to concentrate on applying the
+facilities and functionality of DEVFS to FreeBSD.
+.sh 2 "devd"
+.lp
+It would be logical to complement DEVFS with a ``device-daemon'' which
+could configure and de-configure devices as they come and go.
+When a disk appears, mount it.
+When a network interface appears, configure it.
+And in some configurable way allow the user to customise the action,
+so that for instance images will automatically be copied off the
+flash-based media from a camera, &c.
+.lp
+In this context it is good to question how we view dynamic devices.
+If for instance a printer is removed in the middle of a print job
+and another printer arrives a moment later, should the system
+automatically continue the print job on this new printer?
+When a disk-like device arrives, should we always mount it? Should
+we have a database of known disk-like devices to tell us where to
+mount it, what permissions to give the mountpoint?
+Some computers come in multiple configurations, for instance laptops
+with and without their docking station. How do we want to present
+this to the users and what behaviour do the users expect?
+.sh 2 "Pathname length limitations"
+.lp
+In order to simplify memory management in the early stages of boot,
+the pathname relative to the mountpoint is presently stored in a
+small fixed size buffer inside struct specinfo.
+It should be possible to use filenames as long as the system otherwise
+permits, so some kind of extension mechanism is called for.
+.lp
+Since it cannot be guaranteed that memory can be allocated in
+all the possible scenarios where make_dev() can be called, it may
+be necessary to mandate that the caller allocates the buffer if
+the content will not fit inside the default buffer size.
+.sh 2 "Initial access parameter selection"
+.lp
+As it is now, device drivers propose the initial mode, owner and group
+for the device nodes, but it would be more flexible if it were possible
+to give the kernel a set of rules, much like packet filtering rules,
+which allow the user to set the wanted policy for new devices.
+Such a mechanism could also be used to filter new devices for mount
+points in jails and to determine other behaviour.
+.lp
+Doing these things from userland results in some awkward race conditions
+and software bloat for embedded systems, so a kernel approach may be more
+suitable.
+.sh 2 "Applications of on-demand device creation"
+.lp
+The facility for on-demand creation of devices has some very interesting
+possibilities.
+.lp
+One planned use is to enable user-controlled labelling
+of disks.
+Today disks have names like /dev/da0, /dev/ad4, but since
+this numbering is topological any change in the hardware configuration
+may rename the disks, causing /etc/fstab and backup procedures
+to get out of sync with the hardware.
+.lp
+The current idea is to store on the media of the disk a user-chosen
+disk name and allow access through this name, so that for instance
+/dev/mydisk0
+would be a symlink to whatever topological name the disk might have
+at any given time.
+.lp
+To simplify this and to avoid a forest of symlinks, it will probably
+be decided to move all the sub-divisions of a disk into one subdirectory
+per disk so just a single symlink can do the job.
+In practice that means that the current /dev/ad0s2f will become
+something like /dev/ad0/s2f and so on.
+Obviously, in the same way, disks could also be accessed by their
+topological address, down to the specific path in a SAN environment.
+.lp
+Another potential use could be for automated offline data media libraries.
+It would be quite trivial to make it possible to access all the media
+in the library using /dev/lib/$LABEL which would be a remarkable
+simplification compared with most current automated retrieval facilities.
+.lp
+Another use could be to access devices by parameter rather than by
+name. One could imagine sending a printjob to /dev/printer/color/A2
+and behind the scenes a search would be made for a device with the
+correct properties and paper-handling facilities.
+.sh 1 "Conclusion"
+.lp
+DEVFS has been successfully implemented in FreeBSD,
+including a powerful, simple and flexible solution supporting
+pseudo-devices and on-demand device node creation.
+.lp
+Contrary to the trend, the implementation added functionality
+with a net decrease in source lines,
+primarily because of the improved API seen from device drivers point of view.
+.lp
+Even if DEVFS is not desired, other 4.4BSD derived UNIX variants
+would probably benefit from adopting the dev_t/specinfo related
+cleanup.
+.sh 1 "Acknowledgements"
+.lp
+I first got started on DEVFS in 1989 because the abysmal performance
+of the Olivetti M250 computer forced me to implement a network-disk-device
+for Minix in order to retain my sanity.
+That initial work led to a
+crude but working DEVFS for Minix, so obviously both Andrew Tanenbaum
+and Olivetti deserve credit for inspiration.
+.lp
+Julian Elischer implemented a DEVFS for FreeBSD around 1994 which never
+quite made it to maturity and subsequently was abandoned.
+.lp
+Bruce Evans deserves special credit not only for his keen eye for detail,
+and his competent criticism but also for his enthusiastic resistance to the
+very concept of DEVFS.
+.lp
+Many thanks to the people who took time to help me stamp out ``Danglish''
+through their reviews and comments: Chris Demetriou, Paul Richards,
+Brian Somers, Nik Clayton, and Hanne Munkholm.
+Any remaining insults to proper use of the English language are my own fault.
+.\" (list & why)
+.sh 1 "References"
+.lp
+[44BSDBook]
+Mckusick, Bostic, Karels & Quarterman:
+``The Design and Implementation of 4.4 BSD Operating System.''
+Addison Wesley, 1996, ISBN 0-201-54979-4.
+.lp
+[Heidemann91a]
+John S. Heidemann:
+``Stackable layers: an architecture for filesystem development.''
+Master's thesis, University of California, Los Angeles, July 1991.
+Available as UCLA technical report CSD-910056.
+.lp
+[Kamp2000]
+Poul-Henning Kamp and Robert N. M. Watson:
+``Confining the Omnipotent root.''
+Proceedings of the SANE 2000 Conference.
+Available in FreeBSD distributions in \fC/usr/share/papers\fP.
+.lp
+[MD.C]
+Poul-Henning Kamp et al:
+FreeBSD memory disk driver:
+\fCsrc/sys/dev/md/md.c\fP
+.lp
+[Mckusick1988]
+Marshall Kirk Mckusick, Mike J. Karels:
+``Design of a General Purpose Memory Allocator for the 4.3BSD UNIX-Kernel''
+Proceedings of the San Francisco USENIX Conference, pp. 295-303, June 1988.
+.lp
+[Mckusick1999]
+Dr. Marshall Kirk Mckusick:
+Private email communication.
+\fI``According to the SCCS logs, the chroot call was added by Bill Joy
+on March 18, 1982 approximately 1.5 years before 4.2BSD was released.
+That was well before we had ftp servers of any sort (ftp did not
+show up in the source tree until January 1983). My best guess as
+to its purpose was to allow Bill to chroot into the /4.2BSD build
+directory and build a system using only the files, include files,
+etc contained in that tree. That was the only use of chroot that
+I remember from the early days.''\fP
+.lp
+[Mckusick2000]
+Dr. Marshall Kirk Mckusick:
+Private communication at BSDcon2000 conference.
+\fI``I have not used block devices since I wrote FFS and that
+was \fPmany\fI years ago.''\fP
+.lp
+[NewBus]
+NewBus is a subsystem which provides most of the glue between
+hardware and device drivers. Despite the importance of this
+there has never been published any good overview documentation
+for it.
+The following article by Alexander Langer in ``Dæmonnews'' is
+the best reference I can come up with:
+\fC\s-2http://www.daemonnews.org/200007/newbus-intro.html\fP\s+2
+.lp
+[Pike2000]
+Rob Pike:
+``Systems Software Research is Irrelevant.''
+\fC\s-2http://www.cs.bell\-labs.com/who/rob/utah2000.pdf\fP\s+2
+.lp
+[Pike90a]
+Rob Pike, Dave Presotto, Ken Thompson and Howard Trickey:
+``Plan 9 from Bell Labs.''
+Proceedings of the Summer 1990 UKUUG Conference.
+.lp
+[Pike92a]
+Rob Pike, Dave Presotto, Ken Thompson, Howard Trickey and Phil Winterbottom:
+``The Use of Name Spaces in Plan 9.''
+Proceedings of the 5th ACM SIGOPS Workshop.
+.lp
+[Raspe1785]
+Rudolf Erich Raspe:
+``Baron Münchhausen's Narrative of his marvellous Travels and Campaigns in Russia.''
+Kearsley, 1785.
+.lp
+[Ritchie74]
+D.M. Ritchie and K. Thompson:
+``The UNIX Time-Sharing System''
+Communications of the ACM, Vol. 17, No. 7, July 1974.
+.lp
+[Ritchie98]
+Dennis Ritchie: private conversation at USENIX Annual Technical Conference
+New Orleans, 1998.
+.lp
+[Thompson78]
+Ken Thompson:
+``UNIX Implementation''
+The Bell System Technical Journal, vol 57, 1978, number 6 (part 2) p. 1931ff.
diff --git a/share/doc/papers/diskperf/Makefile b/share/doc/papers/diskperf/Makefile
new file mode 100644
index 000000000000..0c909ea95845
--- /dev/null
+++ b/share/doc/papers/diskperf/Makefile
@@ -0,0 +1,8 @@
+VOLUME= papers
+DOC= diskperf
+SRCS= abs.ms motivation.ms equip.ms methodology.ms tests.ms \
+ results.ms conclusions.ms appendix.ms
+MACROS= -ms
+USE_TBL=
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/diskperf/abs.ms b/share/doc/papers/diskperf/abs.ms
new file mode 100644
index 000000000000..4c970acd7f7b
--- /dev/null
+++ b/share/doc/papers/diskperf/abs.ms
@@ -0,0 +1,170 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.if n .ND
+.TL
+Performance Effects of Disk Subsystem Choices
+for VAX\(dg Systems Running 4.2BSD UNIX*
+.sp
+Revised July 27, 1983
+.AU
+Bob Kridle
+.AI
+mt Xinu
+2560 9th Street
+Suite #312
+Berkeley, California 94710
+.AU
+Marshall Kirk McKusick\(dd
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, CA 94720
+.AB
+.FS
+\(dgVAX, UNIBUS, and MASSBUS are trademarks of Digital Equipment Corporation.
+.FE
+.FS
+* UNIX is a trademark of Bell Laboratories.
+.FE
+.FS
+\(ddThis work was supported under grants from
+the National Science Foundation under grant MCS80-05144,
+and the Defense Advance Research Projects Agency (DoD) under
+Arpa Order No. 4031 monitored by Naval Electronic System Command under
+Contract No. N00039-82-C-0235.
+.FE
+Measurements were made of the UNIX file system
+throughput for various I/O operations using the most attractive currently
+available Winchester disks and controllers attached to both the
+native busses (SBI/CMI) and the UNIBUS on both VAX 11/780s and VAX 11/750s.
+The tests were designed to highlight the performance of single
+and dual drive subsystems operating in the 4.2BSD
+.I
+fast file system
+.R
+environment.
+Many of the results of the tests were initially counter-intuitive
+and revealed several important aspects of the VAX implementations
+which were surprising to us.
+.PP
+The hardware used included two Fujitsu 2351A
+``Eagle''
+disk drives on each of two foreign-vendor disk controllers
+and two DEC RA-81 disk drives on a DEC UDA-50 disk controller.
+The foreign-vendor controllers were Emulex SC750, SC780
+and Systems Industries 9900 native bus interfaced controllers.
+The DEC UDA-50 controller is a UNIBUS interfaced, heavily buffered
+controller which is the first implementation of a new DEC storage
+system architecture, DSA.
+.PP
+One of the most important results of our testing was the correction
+of several timing parameters in our device handler for devices
+with an RH750/RH780 type interface and having high burst transfer
+rates.
+The correction of these parameters resulted in an increase in
+performance of over twenty percent in some cases.
+In addition, one of the controller manufacturers altered their bus
+arbitration scheme to produce another increase in throughput.
+.AE
+.LP
+.de PT
+.lt \\n(LLu
+.pc %
+.nr PN \\n%
+.tl '\\*(LH'\\*(CH'\\*(RH'
+.lt \\n(.lu
+..
+.af PN i
+.ds LH Performance
+.ds RH Contents
+.bp 1
+.\".if t .ds CF July 27, 1983
+.\".if t .ds LF CSRG TR/8
+.\".if t .ds RF Kridle, et. al.
+.ce
+.B "TABLE OF CONTENTS"
+.LP
+.sp 1
+.nf
+.B "1. Motivation"
+.LP
+.sp .5v
+.nf
+.B "2. Equipment
+2.1. DEC UDA50 disk controller
+2.2. Emulex SC750/SC780 disk controllers
+2.3. Systems Industries 9900 disk controller
+2.4. DEC RA81 disk drives
+2.5. Fujitsu 2351A disk drives
+.LP
+.sp .5v
+.nf
+.B "3. Methodology
+.LP
+.sp .5v
+.nf
+.B "4. Tests
+.LP
+.sp .5v
+.nf
+.B "5. Results
+.LP
+.sp .5v
+.nf
+.B "6. Conclusions
+.LP
+.sp .5v
+.nf
+.B Acknowledgements
+.LP
+.sp .5v
+.nf
+.B References
+.LP
+.sp .5v
+.nf
+.B "Appendix A
+A.1. read_8192
+A.2. write_4096
+A.3. write_8192
+A.4. rewrite_8192
+.ds RH Motivation
+.af PN 1
+.bp 1
+.de _d
+.if t .ta .6i 2.1i 2.6i
+.\" 2.94 went to 2.6, 3.64 to 3.30
+.if n .ta .84i 2.6i 3.30i
+..
+.de _f
+.if t .ta .5i 1.25i 2.5i
+.\" 3.5i went to 3.8i
+.if n .ta .7i 1.75i 3.8i
+..
diff --git a/share/doc/papers/diskperf/appendix.ms b/share/doc/papers/diskperf/appendix.ms
new file mode 100644
index 000000000000..831621486f06
--- /dev/null
+++ b/share/doc/papers/diskperf/appendix.ms
@@ -0,0 +1,96 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" .nr H2 1
+.ds RH Appendix A
+.NH
+\s+2Appendix A\s0
+.NH 2
+read_8192
+.PP
+.DS
+#define BUFSIZ 8192
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+ j = open(argv[1], 0);
+ for (i = 0; i < 1024; i++)
+ read(j, buf, BUFSIZ);
+}
+.DE
+.NH 2
+write_4096
+.PP
+.DS
+#define BUFSIZ 4096
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+ j = creat(argv[1], 0666);
+ for (i = 0; i < 2048; i++)
+ write(j, buf, BUFSIZ);
+}
+.DE
+.NH 2
+write_8192
+.PP
+.DS
+#define BUFSIZ 8192
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+ j = creat(argv[1], 0666);
+ for (i = 0; i < 1024; i++)
+ write(j, buf, BUFSIZ);
+}
+.DE
+.bp
+.NH 2
+rewrite_8192
+.PP
+.DS
+#define BUFSIZ 8192
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+ j = open(argv[1], 2);
+ for (i = 0; i < 1024; i++)
+ write(j, buf, BUFSIZ);
+}
+.DE
diff --git a/share/doc/papers/diskperf/conclusions.ms b/share/doc/papers/diskperf/conclusions.ms
new file mode 100644
index 000000000000..9a55d3f595cf
--- /dev/null
+++ b/share/doc/papers/diskperf/conclusions.ms
@@ -0,0 +1,121 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Conclusions
+.NH
+Conclusions
+.PP
+Peak available throughput is only one criterion
+in most storage system purchasing decisions.
+Most of the VAX UNIX systems we are familiar with
+are not I/O bandwidth constrained.
+Nevertheless, an adequate disk bandwidth is necessary for
+good performance and especially to preserve snappy
+response time.
+All of the disk systems we tested provide more than
+adequate bandwidth for typical VAX UNIX system application.
+Perhaps in some I/O-intensive applications such as
+image processing, more consideration should be given
+to the peak throughput available.
+In most situations, we feel that other factors are more
+important in making a storage choice between the systems we
+tested.
+Cost, reliability, availability, and support are some of these
+factors.
+The maturity of the technology purchased must also be weighed
+against the future value and expandability of newer technologies.
+.PP
+Two important conclusions about storage systems in general
+can be drawn from these tests.
+The first is that buffering can be effective in smoothing
+the effects of lower bus speeds and bus contention.
+Even though the UDA50 is located on the relatively slow
+UNIBUS, its performance is similar to controllers located on
+the faster processor busses.
+However, the SC780 with only one sector of buffering shows that
+little buffering is needed if the underlying bus is fast enough.
+.PP
+Placing more intelligence in the controller seems to hinder UNIX system
+performance more than it helps.
+Our profiling tests have indicated that UNIX spends about
+the same percentage of time in the SC780 driver and the UDA50 driver
+(about 10-14%).
+Normally UNIX uses a disk sort algorithm that separates reads and
+writes into two seek order queues.
+The read queue has priority over the write queue,
+since reads cause processes to block,
+while writes can be done asynchronously.
+This is particularly useful when generating large files,
+as it allows the disk allocator to read
+new disk maps and begin doing new allocations
+while the blocks allocated out of the previous map are written to disk.
+Because the UDA50 handles all block ordering,
+and because it keeps all requests in a single queue,
+there is no way to force the longer seek needed to get the next disk map.
+This dysfunction causes all the writes to be done before the disk map read,
+which idles the disk until a new set of blocks can be allocated.
+.PP
+The additional functionality of the UDA50 controller that allows it
+to transfer simultaneously from two drives at once tends to make
+the two drive transfer tests run much more effectively.
+Tuning for the single drive case works more effectively in the two
+drive case than when controllers that cannot handle simultaneous
+transfers are used.
+.ds RH Acknowledgements
+.nr H2 1
+.sp 1
+.NH
+\s+2Acknowledgements\s0
+.PP
+We thank Paul Massigilia and Bill Grace
+of Digital Equipment Corp for helping us run our
+disk tests on their UDA50/RA81.
+We also thank Rich Notari and Paul Ritkowski
+of Emulex for making their machines available
+to us to run our tests of the SC780/Eagles.
+Dan McKinster, then of Systems Industries,
+arranged to make their equipment available for the tests.
+We appreciate the time provided by Bob Gross, Joe Wolf, and
+Sam Leffler on their machines to refine our benchmarks.
+Finally we thank our sponsors,
+the National Science Foundation under grant MCS80-05144,
+and the Defense Advanced Research Projects Agency (DoD) under
+Arpa Order No. 4031 monitored by Naval Electronic System Command under
+Contract No. N00039-82-C-0235.
+.ds RH References
+.nr H2 1
+.sp 1
+.NH
+\s+2References\s0
+.LP
+.IP [McKusick83] 20
+M. McKusick, W. Joy, S. Leffler, R. Fabry,
+``A Fast File System for UNIX'',
+\fIACM Transactions on Computer Systems 2\fP, 3.
+pp 181-197, August 1984.
+.ds RH Appendix A
+.bp
diff --git a/share/doc/papers/diskperf/equip.ms b/share/doc/papers/diskperf/equip.ms
new file mode 100644
index 000000000000..351b4fe35514
--- /dev/null
+++ b/share/doc/papers/diskperf/equip.ms
@@ -0,0 +1,171 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Equipment
+.NH
+Equipment
+.PP
+Various combinations of the three manufacturers' disk controllers,
+and two pairs of Winchester disk drives were tested on both
+VAX 11/780 and VAX 11/750 CPUs. The Emulex and Systems Industries
+disk controllers were interfaced to Fujitsu 2351A
+``Eagle''
+404 Megabyte disk drives.
+The DEC UDA50 disk controller was interfaced to two DEC RA81
+456 Megabyte Winchester disk drives.
+All three controllers were tested on the VAX 780 although
+only the Emulex and DEC controllers were benchmarked on the VAX 11/750.
+Systems Industries makes a VAX 11/750 CMI interface for
+their controller, but we did not have time to test this device.
+In addition, not all the storage systems were tested for
+two drive throughput.
+Each of the controllers and disk drives used in the benchmarks
+is described briefly below.
+.NH 2
+DEC UDA50 disk controller
+.PP
+This is a new controller design which is part of a larger, long range
+storage architecture referred to as
+``DSA''
+or \fBD\fRigital \fBS\fRtorage \fBA\fRrchitecture.
+An important aspect of DSA is migrating a large part
+of the storage management previously handled in the operating
+system to the storage system. Thus, the UDA50 is a much more
+intelligent controller than previous interfaces like the RH750 or
+RH780.
+The UDA50 handles all error correction.
+It also deals with most of the physical storage parameters.
+Typically, system software requests a logical block or
+sequence of blocks.
+The physical locations of these blocks,
+their head, track, and cylinder indices,
+are determined by the controller.
+The UDA50 also orders disk requests to maximize throughput
+where possible, minimizing total seek and rotational delays.
+Where multiple drives are attached to a single controller,
+the UDA50 can interleave
+simultaneous
+data transfers from multiple drives.
+.PP
+The UDA50 is a UNIBUS implementation of a DSA controller.
+It contains 52 sectors of internal buffering to minimize
+the effects of a slow UNIBUS such as the one on the VAX-11/780.
+This buffering also minimizes the effects of contention with
+other UNIBUS peripherals.
+.NH 2
+Emulex SC750/SC780 disk controllers
+.PP
+These two models of the same controller interface to the CMI bus
+of a VAX 11/750 and the SBI bus of a VAX 11/780, respectively.
+To the operating system, they emulate either an RH750 or
+an RH780.
+The controllers install in the
+MASSBUS
+locations in the CPU cabinets and operate from the
+VAX power supplies.
+They provide an
+``SMD''
+or \fBS\fRtorage \fBM\fRodule \fBD\fRrive
+interface to the disk drives.
+Although a large number of disk drives use this interface, we tested
+the controller exclusively connected to Fujitsu 2351A disks.
+.PP
+The controller was first implemented for the VAX-11/750 as the SC750
+model several years ago. Although the SC780 was introduced more
+recently, both are stable products with no bugs known to us.
+.NH 2
+Systems Industries 9900 disk controller
+.PP
+This controller is an evolution of the S.I. 9400 first introduced
+as a UNIBUS SMD interface.
+The 9900 has been enhanced to include an interface to the VAX 11/780 native
+bus, the SBI.
+It has also been upgraded to operate with higher data rate drives such
+as the Fujitsu 2351As we used in this test.
+The controller is contained in its own rack-mounted drawer with an integral
+power supply.
+The interface to the SMD is a four module set which mounts in a
+CPU cabinet slot normally occupied by an RH780.
+The SBI interface derives power from the VAX CPU cabinet power
+supplies.
+.NH 2
+DEC RA81 disk drives
+.PP
+The RA81 is a rack-mountable 456 Megabyte (formatted) Winchester
+disk drive manufactured by DEC.
+It includes a great deal of technology which is an integral part
+of the DEC \fBDSA\fR scheme.
+The novel technology includes a serial packet based communications
+protocol with the controller over a pair of mini-coaxial cables.
+The physical characteristics of the RA81 are shown in the
+table below:
+.DS
+.TS
+box,center;
+c s
+l l.
+DEC RA81 Disk Drive Characteristics
+_
+Peak Transfer Rate 2.2 Mbytes/sec.
+Rotational Speed 3,600 RPM
+Data Sectors/Track 51
+Logical Cylinders 1,248
+Logical Data Heads 14
+Data Capacity 456 Mbytes
+Minimum Seek Time 6 milliseconds
+Average Seek Time 28 milliseconds
+Maximum Seek Time 52 milliseconds
+.TE
+.DE
+.NH 2
+Fujitsu 2351A disk drives
+.PP
+The Fujitsu 2351A disk drive is a Winchester disk drive
+with an SMD controller interface.
+Fujitsu has developed a very good reputation for
+reliable storage products over the last several years.
+The 2351A has the following physical characteristics:
+.DS
+.TS
+box,center;
+c s
+l l.
+Fujitsu 2351A Disk Drive Characteristics
+_
+Peak Transfer Rate 1.859 Mbytes/sec.
+Rotational Speed 3,961 RPM
+Data Sectors/Track 48
+Cylinders 842
+Data Heads 20
+Data Capacity 404 Mbytes
+Minimum Seek Time 5 milliseconds
+Average Seek Time 18 milliseconds
+Maximum Seek Time 35 milliseconds
+.TE
+.DE
+.ds RH Methodology
+.bp
diff --git a/share/doc/papers/diskperf/methodology.ms b/share/doc/papers/diskperf/methodology.ms
new file mode 100644
index 000000000000..ce6f491ee3a5
--- /dev/null
+++ b/share/doc/papers/diskperf/methodology.ms
@@ -0,0 +1,105 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Methodology
+.NH
+Methodology
+.PP
+Our goal was to evaluate the performance of the target peripherals
+in an environment as much like our 4.2BSD UNIX systems as possible.
+There are two basic approaches to creating this kind of test environment.
+These might be termed the \fIindirect\fR and the \fIdirect\fR approach.
+The approach used by DEC in producing most of the performance data
+on the UDA50/RA81 system under VMS is what we term the indirect
+approach.
+We chose to use the direct approach.
+.PP
+The indirect approach used by DEC involves two steps.
+First, the environment in which performance is to be evaluated
+is parameterized.
+In this case, the disk I/O characteristics of VMS were measured
+as to the distribution of various sizes of accesses and the proportion
+of reads and writes.
+This parameterization of
+typical
+I/O activity was termed a
+``vax mix.''
+The second stage involves simulating this mixture of I/O activities
+with the devices to be tested and noting the total volume of transactions
+processed per unit time by each system.
+.PP
+The problems encountered with this indirect approach often
+have to do with the completeness and correctness of the parameterization
+of the context environment.
+For example, the
+``vax mix''
+model constructed for DEC's tests uses a random distribution of seeks
+to the blocks read or written.
+It is not likely that any real system produces a distribution
+of disk transfer locations which is truly random and does not
+exhibit strong locality characteristics.
+.PP
+The methodology chosen by us is direct
+in the sense that it uses the standard structured file system mechanism present
+in the 4.2BSD UNIX operating system to create the sequence of locations
+and sizes of reads and writes to the benchmarked equipment.
+We simply create, write, and read
+files as they would be by user's activities.
+The disk space allocation and disk caching mechanism built into
+UNIX is used to produce the actual device reads and writes as well
+as to determine their size and location on the disk.
+We measure and compare the rate at which these
+.I
+user files
+.R
+can be written, rewritten, or read.
+.PP
+The advantage of this approach is the implicit accuracy in
+testing in the same environment in which the peripheral
+will be used.
+Although this system does not account for the I/O produced
+by some paging and swapping, in our memory rich environment
+these activities account for a relatively small portion
+of the total disk activity.
+.PP
+A more significant disadvantage to the direct approach
+is the occasional difficulty we have in accounting for our
+measured results.
+The apparently straight-forward activity of reading or writing a logical file
+on disk can produce a complex mixture of disk traffic.
+File I/O is supported by a file management system that
+buffers disk traffic through an internal cache,
+which allows writes to be handled asynchronously.
+Reads must be done synchronously,
+however this restriction is moderated by the use of read-ahead.
+Small changes in the performance of the disk controller
+subsystem can result in large and unexpected
+changes in the file system performance,
+as it may change the characteristics of the memory contention
+experienced by the processor.
+.ds RH Tests
+.bp
diff --git a/share/doc/papers/diskperf/motivation.ms b/share/doc/papers/diskperf/motivation.ms
new file mode 100644
index 000000000000..0bbc11d66e8d
--- /dev/null
+++ b/share/doc/papers/diskperf/motivation.ms
@@ -0,0 +1,87 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Motivation
+.NH
+Motivation
+.PP
+These benchmarks were performed for several reasons.
+Foremost was our desire to obtain guidelines to aid
+in choosing one of the most expensive components of any
+VAX UNIX configuration, the disk storage system.
+The range of choices in this area has increased dramatically
+in the last year.
+DEC has become, with the introduction of the UDA50/RA81 system,
+cost competitive
+in the area of disk storage for the first time.
+Emulex's entry into the VAX 11/780 SBI controller
+field, the SC780, represented an important choice for us to examine, given
+our previous success with their VAX 11/750 SC750 controller and
+their UNIBUS controllers.
+The Fujitsu 2351A
+Winchester disk drive represents the lowest cost-per-byte disk storage
+known to us.
+In addition, Fujitsu's reputation for reliability was appealing.
+The many attractive aspects of these components justified a more
+careful examination of their performance aspects under UNIX.
+.PP
+In addition to the direct motivation of developing an effective
+choice of storage systems, we hoped to gain more insight into
+VAX UNIX file system and I/O performance in general.
+What generic characteristics of I/O subsystems are most
+important?
+How important is the location of the controller on the SBI/CMI versus
+the UNIBUS?
+Is extensive buffering in the controller essential or even important?
+How much can be gained by putting more of the storage system
+management and optimization function in the controller as
+DEC does with the UDA50?
+.PP
+We also wanted to resolve particular speculation about the value of
+storage system optimization by a controller in a UNIX
+environment.
+Is the access optimization as effective as that already provided
+by the existing 4.2BSD UNIX device handlers for traditional disks?
+VMS disk handlers do no seek optimization.
+This gives the UDA50 controller an advantage over other controllers
+under VMS which is not likely to be as important to UNIX.
+Are there penalties associated with greater intelligence in the controller?
+.PP
+A third and last reason for evaluating this equipment is comparable
+to the proverbial mountain climber's answer when asked why he climbs
+a particular mountain,
+``It was there.''
+In our case the equipment
+was there.
+We were lucky enough to assemble all the desired disks and controllers
+and get them installed on a temporarily idle VAX 11/780.
+This got us started collecting data.
+Although many of the tests were later rerun on a variety of other systems,
+this initial test bed was essential for working out the testing bugs
+and getting our feet wet.
+.ds RH Equipment
+.bp
diff --git a/share/doc/papers/diskperf/results.ms b/share/doc/papers/diskperf/results.ms
new file mode 100644
index 000000000000..3bf68b6f8daf
--- /dev/null
+++ b/share/doc/papers/diskperf/results.ms
@@ -0,0 +1,331 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Results
+.NH
+Results
+.PP
+The following tables indicate the results of our
+test runs.
+Note that each table contains results for tests run
+on two varieties of 4.2BSD file systems.
+The first set of results is always for a file system
+with a basic blocking factor of eight Kilobytes and a
+fragment size of 1 Kilobyte. The second sets of measurements
+are for file systems with a four Kilobyte block size and a
+one Kilobyte fragment size.
+The values in parenthesis indicate the percentage of CPU
+time used by the test program.
+In the case of the two disk arm tests,
+the value in parenthesis indicates the sum of the percentage
+of the test programs that were run.
+Entries of ``n. m.'' indicate this value was not measured.
+.DS
+.TS
+box,center;
+c s s s s
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+4.2BSD File Systems Tests - \fBVAX 11/750\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Emulex SC750/Eagle UDA50/RA81
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 490 (69%) 620 (96%) 310 (44%) 520 (65%)
+write_4096 380 (99%) 370 (99%) 370 (97%) 360 (98%)
+write_8192 470 (99%) 470 (99%) 320 (71%) 410 (83%)
+rewrite_8192 650 (99%) 620 (99%) 310 (50%) 450 (70%)
+=
+.T&
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+Logically Sequential Transfers
+from \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Emulex SC750/Eagle UDA50/RA81
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 300 (60%) 400 (84%) 210 (42%) 340 (77%)
+write_4096 320 (98%) 320 (98%) 220 (67%) 290 (99%)
+write_8192 340 (98%) 340 (99%) 220 (65%) 310 (98%)
+rewrite_8192 450 (99%) 450 (98%) 230 (47%) 340 (78%)
+.TE
+.DE
+.PP
+Note that the rate of write operations on the VAX 11/750 are ultimately
+CPU limited in some cases.
+The write rates saturate the CPU at a lower bandwidth than the reads
+because they must do disk allocation in addition to moving the data
+from the user program to the disk.
+The UDA50/RA81 saturates the CPU at a lower transfer rate for a given
+operation than the SC750/Eagle because
+it causes more memory contention with the CPU.
+We do not know if this contention is caused by
+the UNIBUS controller or the UDA50.
+.PP
+The following table reports the results of test runs on a VAX 11/780
+with 4 Megabytes of main memory.
+.DS
+.TS
+box,center;
+c s s s s s s
+c s s s s s s
+c s s s s s s
+l | l s | l s | l s
+l | l s | l s | l s
+l | l l | l l | l l
+l | c c | c c | c c.
+4.2BSD File Systems Tests - \fBVAX 11/780\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Emulex SC780/Eagle UDA50/RA81 Sys. Ind. 9900/Eagle
+
+ 1 Drive 2 Drives 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 560 (70%) 480 (58%) 360 (45%) 540 (72%) 340 (41%) 520 (66%)
+write_4096 440 (98%) 440 (98%) 380 (99%) 480 (96%) 490 (96%) 440 (84%)
+write_8192 490 (98%) 490 (98%) 220 (58%)* 480 (92%) 490 (80%) 430 (72%)
+rewrite_8192 760 (100%) 560 (72%) 220 (50%)* 180 (52%)* 490 (60%) 520 (62%)
+=
+.T&
+c s s s s s s
+c s s s s s s
+l | l s | l s | l s
+l | l s | l s | l s
+l | l l | l l | l l
+l | c c | c c | c c.
+Logically Sequential Transfers
+from an \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Emulex SC780/Eagle UDA50/RA81 Sys. Ind. 9900/Eagle
+
+ 1 Drive 2 Drives 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 490 (77%) 370 (66%) n.m. n.m. 200 (31%) 370 (56%)
+write_4096 380 (98%) 370 (98%) n.m. n.m. 200 (46%) 370 (88%)
+write_8192 380 (99%) 370 (97%) n.m. n.m. 200 (45%) 320 (76%)
+rewrite_8192 490 (87%) 350 (66%) n.m. n.m. 200 (31%) 300 (46%)
+.TE
+* the operation of the hardware was suspect during these tests.
+.DE
+.PP
+The dropoff in reading and writing rates for the two drive SC780/Eagle
+tests are probably due to the file system using insufficient
+rotational delay for these tests.
+We have not fully investigated these times.
+.PP
+The following table compares data rates on VAX 11/750s directly
+with those of VAX 11/780s using the UDA50/RA81 storage system.
+.DS
+.TS
+box,center;
+c s s s s
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+4.2BSD File Systems Tests - \fBDEC UDA50 - 750 vs. 780\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test VAX 11/750 UNIBUS VAX 11/780 UNIBUS
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 310 (44%) 520 (84%) 360 (45%) 540 (72%)
+write_4096 370 (97%) 360 (100%) 380 (99%) 480 (96%)
+write_8192 320 (71%) 410 (96%) 220 (58%)* 480 (92%)
+rewrite_8192 310 (50%) 450 (80%) 220 (50%)* 180 (52%)*
+=
+.T&
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+Logically Sequential Transfers
+from an \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test VAX 11/750 UNIBUS VAX 11/780 UNIBUS
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 210 (42%) 342 (77%) n.m. n.m.
+write_4096 215 (67%) 294 (99%) n.m. n.m.
+write_8192 215 (65%) 305 (98%) n.m. n.m.
+rewrite_8192 227 (47%) 336 (78%) n.m. n.m.
+.TE
+* the operation of the hardware was suspect during these tests.
+.DE
+.PP
+The higher throughput available on VAX 11/780s is due to a number
+of factors.
+The larger main memory size allows a larger file system cache.
+The block allocation routines run faster, raising the upper limit
+on the data rates in writing new files.
+.PP
+The next table makes the same comparison using an Emulex controller
+on both systems.
+.DS
+.TS
+box, center;
+c s s s s
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+4.2BSD File Systems Tests - \fBEmulex - 750 vs. 780\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test VAX 11/750 CMI Bus VAX 11/780 SBI Bus
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 490 (69%) 620 (96%) 560 (70%) 480 (58%)
+write_4096 380 (99%) 370 (99%) 440 (98%) 440 (98%)
+write_8192 470 (99%) 470 (99%) 490 (98%) 490 (98%)
+rewrite_8192 650 (99%) 620 (99%) 760 (100%) 560 (72%)
+=
+.T&
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+Logically Sequential Transfers
+from an \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test VAX 11/750 CMI Bus VAX 11/780 SBI Bus
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 300 (60%) 400 (84%) 490 (77%) 370 (66%)
+write_4096 320 (98%) 320 (98%) 380 (98%) 370 (98%)
+write_8192 340 (98%) 340 (99%) 380 (99%) 370 (97%)
+rewrite_8192 450 (99%) 450 (98%) 490 (87%) 350 (66%)
+.TE
+.DE
+.PP
+The following table illustrates the evolution of our testing
+process as both hardware and software problems affecting
+the performance of the Emulex SC780 were corrected.
+The software change was suggested to us by George Goble
+of Purdue University.
+.PP
+The 4.2BSD handler for RH750/RH780 interfaced disk drives
+contains several constants which determine how
+much time is provided between an interrupt signaling the completion
+of a positioning command and the subsequent start of a data transfer
+operation. These lead times are expressed as sectors of rotational delay.
+If they are too small, an extra complete rotation will often be required
+between a seek and subsequent read or write operation.
+The higher bit rate and rotational speed of the 2351A Fujitsu
+disk drives required
+increasing these constants.
+.PP
+The hardware change involved allowing for slightly longer
+delays in arbitrating for cycles on the SBI bus by
+starting the bus arbitration cycle a little further ahead of
+when the data was ready for transfer.
+Finally we had to increase the rotational delay between consecutive
+blocks in the file because
+the higher bandwidth from the disk generated more memory contention,
+which slowed down the processor.
+.DS
+.TS
+box,center,expand;
+c s s s s s s
+c s s s s s s
+c s s s s s s
+l | l s | l s | l s
+l | l s | l s | l s
+l | l s | l s | l s
+l | c c | c c | c c
+l | c c | c c | c c.
+4.2BSD File Systems Tests - \fBEmulex SC780 Disk Controller Evolution\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Inadequate Search Lead OK Search Lead OK Search Lead
+ Initial SBI Arbitration Init SBI Arb. Improved SBI Arb.
+
+ 1 Drive 2 Drives 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 320 370 440 (60%) n.m. 560 (70%) 480 (58%)
+write_4096 250 270 300 (63%) n.m. 440 (98%) 440 (98%)
+write_8192 250 280 340 (60%) n.m. 490 (98%) 490 (98%)
+rewrite_8192 250 290 380 (48%) n.m. 760 (100%) 560 (72%)
+=
+.T&
+c s s s s s s
+c s s s s s s
+l | l s | l s | l s
+l | l s | l s | l s
+l | l s | l s | l s
+l | c c | c c | c c
+l | c c | c c | c c.
+Logically Sequential Transfers
+from an \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Inadequate Search Lead OK Search Lead OK Search Lead
+ Initial SBI Arbitration Init SBI Arb. Improved SBI Arb.
+
+ 1 Drive 2 Drives 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 200 220 280 n.m. 490 (77%) 370 (66%)
+write_4096 180 190 300 n.m. 380 (98%) 370 (98%)
+write_8192 180 200 320 n.m. 380 (99%) 370 (97%)
+rewrite_8192 190 200 340 n.m. 490 (87%) 350 (66%)
+.TE
+.DE
+.ds RH Conclusions
+.bp
diff --git a/share/doc/papers/diskperf/tests.ms b/share/doc/papers/diskperf/tests.ms
new file mode 100644
index 000000000000..d9f736c3bf41
--- /dev/null
+++ b/share/doc/papers/diskperf/tests.ms
@@ -0,0 +1,102 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Tests
+.NH
+Tests
+.PP
+Our battery of tests consists of four programs,
+read_8192, write_8192, write_4096
+and rewrite_8192 originally written by [McKusick83]
+to evaluate the performance of the new file system in 4.2BSD.
+These programs all follow the same model and are typified by
+read_8192 shown here.
+.DS
+#define BUFSIZ 8192
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+ j = open(argv[1], 0);
+ for (i = 0; i < 1024; i++)
+ read(j, buf, BUFSIZ);
+}
+.DE
+The remaining programs are included in appendix A.
+.PP
+These programs read, write with two different blocking factors,
+and rewrite logical files in a structured file system on the disk
+under test.
+The write programs create new files while the rewrite program
+overwrites an existing file.
+Each of these programs represents an important segment of the
+typical UNIX file system activity with the read program
+representing by far the largest class and the rewrite the smallest.
+.PP
+A blocking factor of 8192 is used by all programs except write_4096.
+This is typical of most 4.2BSD user programs since a standard set of
+I/O support routines is commonly used and these routines buffer
+data in similar block sizes.
+.PP
+For each test run, an empty eight Kilobyte block
+file system was created in the target
+storage system.
+Then each of the four tests was run and timed.
+Each test was run three times;
+the first to clear out any useful data in the cache,
+and the second two to insure that the experiment
+had stabilized and was repeatable.
+Each test operated on eight Megabytes of data to
+insure that the cache did not overly influence the results.
+Another file system was then initialized using a
+basic blocking factor of four Kilobytes and the same tests
+were run again and timed.
+A command script for a run appears as follows:
+.DS
+#!/bin/csh
+set time=2
+echo "8K/1K file system"
+newfs /dev/rhp0g eagle
+mount /dev/hp0g /mnt0
+mkdir /mnt0/foo
+echo "write_8192 /mnt0/foo/tst2"
+rm -f /mnt0/foo/tst2
+write_8192 /mnt0/foo/tst2
+rm -f /mnt0/foo/tst2
+write_8192 /mnt0/foo/tst2
+rm -f /mnt0/foo/tst2
+write_8192 /mnt0/foo/tst2
+echo "read_8192 /mnt0/foo/tst2"
+read_8192 /mnt0/foo/tst2
+read_8192 /mnt0/foo/tst2
+read_8192 /mnt0/foo/tst2
+umount /dev/hp0g
+.DE
+.ds RH Results
+.bp
diff --git a/share/doc/papers/fsinterface/Makefile b/share/doc/papers/fsinterface/Makefile
new file mode 100644
index 000000000000..3c45b04b08d6
--- /dev/null
+++ b/share/doc/papers/fsinterface/Makefile
@@ -0,0 +1,6 @@
+VOLUME= papers
+DOC= fsinterface
+SRCS= fsinterface.ms
+MACROS= -ms
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/fsinterface/abstract.ms b/share/doc/papers/fsinterface/abstract.ms
new file mode 100644
index 000000000000..b85bd6cbc45b
--- /dev/null
+++ b/share/doc/papers/fsinterface/abstract.ms
@@ -0,0 +1,67 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.TL
+Toward a Compatible Filesystem Interface
+.AU
+Michael J. Karels
+Marshall Kirk McKusick
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.LP
+As network or remote filesystems have been implemented for
+.UX ,
+several stylized interfaces between the filesystem implementation
+and the rest of the kernel have been developed.
+Notable among these are Sun Microsystems' virtual filesystem interface
+using vnodes, Digital Equipment's Generic File System architecture,
+and AT&T's File System Switch.
+Each design attempts to isolate filesystem-dependent details
+below the generic interface and to provide a framework within which
+new filesystems may be incorporated.
+However, each of these interfaces is different from
+and incompatible with the others.
+Each of them addresses somewhat different design goals.
+Each was based upon a different starting version of
+.UX ,
+targetted a different set of filesystems with varying characteristics,
+and uses a different set of primitive operations provided by the filesystem.
+The current study compares the various filesystem interfaces.
+Criteria for comparison include generality, completeness, robustness,
+efficiency and esthetics.
+As a result of this comparison, a proposal for a new filesystem interface
+is advanced that includes the best features of the existing implementations.
+The proposal adopts the calling convention for name lookup introduced
+in 4.3BSD.
+A prototype implementation is described.
+This proposal and the rationale underlying its development
+have been presented to major software vendors
+as an early step toward convergence upon a compatible filesystem interface.
diff --git a/share/doc/papers/fsinterface/fsinterface.ms b/share/doc/papers/fsinterface/fsinterface.ms
new file mode 100644
index 000000000000..cb567416334f
--- /dev/null
+++ b/share/doc/papers/fsinterface/fsinterface.ms
@@ -0,0 +1,1169 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.nr UX 0
+.de UX
+.ie \\n(UX \s-1UNIX\s0\\$1
+.el \{\
+\s-1UNIX\s0\\$1\(dg
+.FS
+\(dg \s-1UNIX\s0 is a registered trademark of AT&T.
+.FE
+.nr UX 1
+.\}
+..
+.TL
+Toward a Compatible Filesystem Interface
+.AU
+Michael J. Karels
+Marshall Kirk McKusick
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+.LP
+As network or remote filesystems have been implemented for
+.UX ,
+several stylized interfaces between the filesystem implementation
+and the rest of the kernel have been developed.
+.FS
+This is an update of a paper originally presented
+at the September 1986 conference of the European
+.UX
+Users' Group.
+Last modified April 16, 1991.
+.FE
+Notable among these are Sun Microsystems' Virtual Filesystem interface (VFS)
+using vnodes, Digital Equipment's Generic File System (GFS) architecture,
+and AT&T's File System Switch (FSS).
+Each design attempts to isolate filesystem-dependent details
+below a generic interface and to provide a framework within which
+new filesystems may be incorporated.
+However, each of these interfaces is different from
+and incompatible with the others.
+Each of them addresses somewhat different design goals.
+Each was based on a different starting version of
+.UX ,
+targetted a different set of filesystems with varying characteristics,
+and uses a different set of primitive operations provided by the filesystem.
+The current study compares the various filesystem interfaces.
+Criteria for comparison include generality, completeness, robustness,
+efficiency and esthetics.
+Several of the underlying design issues are examined in detail.
+As a result of this comparison, a proposal for a new filesystem interface
+is advanced that includes the best features of the existing implementations.
+The proposal adopts the calling convention for name lookup introduced
+in 4.3BSD, but is otherwise closely related to Sun's VFS.
+A prototype implementation is now being developed at Berkeley.
+This proposal and the rationale underlying its development
+have been presented to major software vendors
+as an early step toward convergence on a compatible filesystem interface.
+.AE
+.NH
+Introduction
+.PP
+As network communications and workstation environments
+became common elements in
+.UX
+systems, several vendors of
+.UX
+systems have designed and built network file systems
+that allow client processes on one
+.UX
+machine to access files on a server machine.
+Examples include Sun's Network File System, NFS [Sandberg85],
+AT&T's recently-announced Remote File Sharing, RFS [Rifkin86],
+the LOCUS distributed filesystem [Walker85],
+and Masscomp's extended filesystem [Cole85].
+Other remote filesystems have been implemented in research or university groups
+for internal use, notably the network filesystem in the Eighth Edition
+.UX
+system [Weinberger84] and two different filesystems used at Carnegie-Mellon
+University [Satyanarayanan85].
+Numerous other remote file access methods have been devised for use
+within individual
+.UX
+processes,
+many of them by modifications to the C I/O library
+similar to those in the Newcastle Connection [Brownbridge82].
+.PP
+Multiple network filesystems may frequently
+be found in use within a single organization.
+These circumstances make it highly desirable to be able to transport filesystem
+implementations from one system to another.
+Such portability is considerably enhanced by the use of a stylized interface
+with carefully-defined entry points to separate the filesystem from the rest
+of the operating system.
+This interface should be similar to the interface between device drivers
+and the kernel.
+Although varying somewhat among the common versions of
+.UX ,
+the device driver interfaces are sufficiently similar that device drivers
+may be moved from one system to another without major problems.
+A clean, well-defined interface to the filesystem also allows a single
+system to support multiple local filesystem types.
+.PP
+For reasons such as these, several filesystem interfaces have been used
+when integrating new filesystems into the system.
+The best-known of these are Sun Microsystems' Virtual File System interface,
+VFS [Kleiman86], and AT&T's File System Switch, FSS.
+Another interface, known as the Generic File System, GFS,
+has been implemented for the ULTRIX\(dd
+.FS
+\(dd ULTRIX is a trademark of Digital Equipment Corp.
+.FE
+system by Digital [Rodriguez86].
+There are numerous differences among these designs.
+The differences may be understood from the varying philosophies
+and design goals of the groups involved, from the systems under which
+the implementations were done, and from the filesystems originally targetted
+by the designs.
+These differences are summarized in the following sections
+within the limitations of the published specifications.
+.NH
+Design goals
+.PP
+There are several design goals which, in varying degrees,
+have driven the various designs.
+Each attempts to divide the filesystem into a filesystem-type-independent
+layer and individual filesystem implementations.
+The division between these layers occurs at somewhat different places
+in these systems, reflecting different views of the diversity and types
+of the filesystems that may be accommodated.
+Compatibility with existing local filesystems has varying importance;
+at the user-process level, each attempts to be completely transparent
+except for a few filesystem-related system management programs.
+The AT&T interface also makes a major effort to retain familiar internal
+system interfaces, and even to retain object-file-level binary compatibility
+with operating system modules such as device drivers.
+Both Sun and DEC were willing to change internal data structures and interfaces
+so that other operating system modules might require recompilation
+or source-code modification.
+.PP
+AT&T's interface both allows and requires filesystems to support the full
+and exact semantics of their previous filesystem,
+including interruptions of system calls on slow operations.
+System calls that deal with remote files are encapsulated
+with their environment and sent to a server where execution continues.
+The system call may be aborted by either client or server, returning
+control to the client.
+Most system calls that descend into the file-system dependent layer
+of a filesystem other than the standard local filesystem do not return
+to the higher-level kernel calling routines.
+Instead, the filesystem-dependent code completes the requested
+operation and then executes a non-local goto (\fIlongjmp\fP) to exit the
+system call.
+These efforts to avoid modification of main-line kernel code
+indicate a far greater emphasis on internal compatibility than on modularity,
+clean design, or efficiency.
+.PP
+In contrast, the Sun VFS interface makes major modifications to the internal
+interfaces in the kernel, with a very clear separation
+of filesystem-independent and -dependent data structures and operations.
+The semantics of the filesystem are largely retained for local operations,
+although this is achieved at some expense where it does not fit the internal
+structuring well.
+The filesystem implementations are not required to support the same
+semantics as local
+.UX
+filesystems.
+Several historical features of
+.UX
+filesystem behavior are difficult to achieve using the VFS interface,
+including the atomicity of file and link creation and the use of open files
+whose names have been removed.
+.PP
+A major design objective of Sun's network filesystem,
+statelessness,
+permeates the VFS interface.
+No locking may be done in the filesystem-independent layer,
+and locking in the filesystem-dependent layer may occur only during
+a single call into that layer.
+.PP
+A final design goal of most implementors is performance.
+For remote filesystems,
+this goal tends to be in conflict with the goals of complete semantic
+consistency, compatibility and modularity.
+Sun has chosen performance over modularity in some areas,
+but has emphasized clean separation of the layers within the filesystem
+at the expense of performance.
+Although the performance of RFS is yet to be seen,
+AT&T seems to have considered compatibility far more important than modularity
+or performance.
+.NH
+Differences among filesystem interfaces
+.PP
+The existing filesystem interfaces may be characterized
+in several ways.
+Each system is centered around a few data structures or objects,
+along with a set of primitives for performing operations upon these objects.
+In the original
+.UX
+filesystem [Ritchie74],
+the basic object used by the filesystem is the inode, or index node.
+The inode contains all of the information about a file except its name:
+its type, identification, ownership, permissions, timestamps and location.
+Inodes are identified by the filesystem device number and the index within
+the filesystem.
+The major entry points to the filesystem are \fInamei\fP,
+which translates a filesystem pathname into the underlying inode,
+and \fIiget\fP, which locates an inode by number and installs it in the in-core
+inode table.
+\fINamei\fP performs name translation by iterative lookup
+of each component name in its directory to find its inumber,
+then using \fIiget\fP to return the actual inode.
+If the last component has been reached, this inode is returned;
+otherwise, the inode describes the next directory to be searched.
+The inode returned may be used in various ways by the caller;
+it may be examined, the file may be read or written,
+types and access may be checked, and fields may be modified.
+Modified inodes are automatically written back to the filesystem
+on disk when the last reference is released with \fIiput\fP.
+Although the details are considerably different,
+the same general scheme is used in the faster filesystem in 4.2BSD
+.UX
+[McKusick85].
+.PP
+Both the AT&T interface and, to a lesser extent, the DEC interface
+attempt to preserve the inode-oriented interface.
+Each modifies the inode to allow different varieties of the structure
+for different filesystem types by separating the filesystem-dependent
+parts of the inode into a separate structure or one arm of a union.
+Both interfaces allow operations
+equivalent to the \fInamei\fP and \fIiget\fP operations
+of the old filesystem to be performed in the filesystem-independent
+layer, with entry points to the individual filesystem implementations to support
+the type-specific parts of these operations. Implicit in this interface
+is that files may conveniently be named by and located using a single
+index within a filesystem.
+The GFS provides specific entry points to the filesystems
+to change most file properties rather than allowing arbitrary changes
+to be made to the generic part of the inode.
+.PP
+In contrast, the Sun VFS interface replaces the inode as the primary object
+with the vnode.
+The vnode contains no filesystem-dependent fields except the pointer
+to the set of operations implemented by the filesystem.
+Properties of a vnode that might be transient, such as the ownership,
+permissions, size and timestamps, are maintained by the lower layer.
+These properties may be presented in a generic format upon request;
+callers are expected not to hold this information for any length of time,
+as they may not be up-to-date later on.
+The vnode operations do not include a corollary for \fIiget\fP;
+the only external interface for obtaining vnodes for specific files
+is the name lookup operation.
+(Separate procedures are provided outside of this interface
+that obtain a ``file handle'' for a vnode which may be given
+to a client by a server, such that the vnode may be retrieved
+upon later presentation of the file handle.)
+.NH
+Name translation issues
+.PP
+Each of the systems described include a mechanism for performing
+pathname-to-internal-representation translation.
+The style of the name translation function is very different in all
+three systems.
+As described above, the AT&T and DEC systems retain the \fInamei\fP function.
+The two are quite different, however, as the ULTRIX interface uses
+the \fInamei\fP calling convention introduced in 4.3BSD.
+The parameters and context for the name lookup operation
+are collected in a \fInameidata\fP structure which is passed to \fInamei\fP
+for operation.
+Intent to create or delete the named file is declared in advance,
+so that the final directory scan in \fInamei\fP may retain information
+such as the offset in the directory at which the modification will be made.
+Filesystems that use such mechanisms to avoid redundant work
+must therefore lock the directory to be modified so that it may not
+be modified by another process before completion.
+In the System V filesystem, as in previous versions of
+.UX ,
+this information is stored in the per-process \fIuser\fP structure
+by \fInamei\fP for use by a low-level routine called after performing
+the actual creation or deletion of the file itself.
+In 4.3BSD and in the GFS interface, these side effects of \fInamei\fP
+are stored in the \fInameidata\fP structure given as argument to \fInamei\fP,
+which is also presented to the routine implementing file creation or deletion.
+.PP
+The ULTRIX \fInamei\fP routine is responsible for the generic
+parts of the name translation process, such as copying the name into
+an internal buffer, validating it, interpolating
+the contents of symbolic links, and indirecting at mount points.
+As in 4.3BSD, the name is copied into the buffer in a single call,
+according to the location of the name.
+After determining the type of the filesystem at the start of translation
+(the current directory or root directory), it calls the filesystem's
+\fInamei\fP entry with the same structure it received from its caller.
+The filesystem-specific routine translates the name, component by component,
+as long as no mount points are reached.
+It may return after any number of components have been processed.
+\fINamei\fP performs any processing at mount points, then calls
+the correct translation routine for the next filesystem.
+Network filesystems may pass the remaining pathname to a server for translation,
+or they may look up the pathname components one at a time.
+The former strategy would be more efficient,
+but the latter scheme allows mount points within a remote filesystem
+without server knowledge of all client mounts.
+.PP
+The AT&T \fInamei\fP interface is presumably the same as that in previous
+.UX
+systems, accepting the name of a routine to fetch pathname characters
+and an operation (one of: lookup, lookup for creation, or lookup for deletion).
+It translates, component by component, as before.
+If it detects that a mount point crosses to a remote filesystem,
+it passes the remainder of the pathname to the remote server.
+A pathname-oriented request other than open may be completed
+within the \fInamei\fP call,
+avoiding return to the (unmodified) system call handler
+that called \fInamei\fP.
+.PP
+In contrast to the first two systems, Sun's VFS interface has replaced
+\fInamei\fP with \fIlookupname\fP.
+This routine simply calls a new pathname-handling module to allocate
+a pathname buffer and copy in the pathname (copying a character per call),
+then calls \fIlookuppn\fP.
+\fILookuppn\fP performs the iteration over the directories leading
+to the destination file; it copies each pathname component to a local buffer,
+then calls the filesystem \fIlookup\fP entry to locate the vnode
+for that file in the current directory.
+Per-filesystem \fIlookup\fP routines may translate only one component
+per call.
+For creation and deletion of new files, the lookup operation is unmodified;
+the lookup of the final component only serves to check for the existence
+of the file.
+The subsequent creation or deletion call, if any, must repeat the final
+name translation and associated directory scan.
+For new file creation in particular, this is rather inefficient,
+as file creation requires two complete scans of the directory.
+.PP
+Several of the important performance improvements in 4.3BSD
+were related to the name translation process [McKusick85][Leffler84].
+The following changes were made:
+.IP 1. 4
+A system-wide cache of recent translations is maintained.
+The cache is separate from the inode cache, so that multiple names
+for a file may be present in the cache.
+The cache does not hold ``hard'' references to the inodes,
+so that the normal reference pattern is not disturbed.
+.IP 2.
+A per-process cache is kept of the directory and offset
+at which the last successful name lookup was done.
+This allows sequential lookups of all the entries in a directory to be done
+in linear time.
+.IP 3.
+The entire pathname is copied into a kernel buffer in a single operation,
+rather than using two subroutine calls per character.
+.IP 4.
+A pool of pathname buffers is held by \fInamei\fP, avoiding allocation
+overhead.
+.LP
+All of these performance improvements from 4.3BSD are well worth using
+within a more generalized filesystem framework.
+The generalization of the structure may otherwise make an already-expensive
+function even more costly.
+Most of these improvements are present in the GFS system, as it derives
+from the beta-test version of 4.3BSD.
+The Sun system uses a name-translation cache generally like that in 4.3BSD.
+The name cache is a filesystem-independent facility provided for the use
+of the filesystem-specific lookup routines.
+The Sun cache, like that first used at Berkeley but unlike that in 4.3,
+holds a ``hard'' reference to the vnode (increments the reference count).
+The ``soft'' reference scheme in 4.3BSD cannot be used with the current
+NFS implementation, as NFS allocates vnodes dynamically and frees them
+when the reference count returns to zero rather than caching them.
+As a result, fewer names may be held in the cache
+than (local filesystem) vnodes, and the cache distorts the normal reference
+patterns otherwise seen by the LRU cache.
+As the name cache references overflow the local filesystem inode table,
+the name cache must be purged to make room in the inode table.
+Also, to determine whether a vnode is in use (for example,
+before mounting upon it), the cache must be flushed to free any
+cache reference.
+These problems should be corrected
+by the use of the soft cache reference scheme.
+.PP
+A final observation on the efficiency of name translation in the current
+Sun VFS architecture is that the number of subroutine calls used
+by a multi-component name lookup is dramatically larger
+than in the other systems.
+The name lookup scheme in GFS suffers from this problem much less,
+at no expense in violation of layering.
+.PP
+A final problem to be considered is synchronization and consistency.
+As the filesystem operations are more stylized and broken into separate
+entry points for parts of operations, it is more difficult to guarantee
+consistency throughout an operation and/or to synchronize with other
+processes using the same filesystem objects.
+The Sun interface suffers most severely from this,
+as it forbids the filesystems from locking objects across calls
+to the filesystem.
+It is possible that a file may be created between the time that a lookup
+is performed and a subsequent creation is requested.
+Perhaps more strangely, after a lookup fails to find the target
+of a creation attempt, the actual creation might find that the target
+now exists and is a symbolic link.
+The call will either fail unexpectedly, as the target is of the wrong type,
+or the generic creation routine will have to note the error
+and restart the operation from the lookup.
+This problem will always exist in a stateless filesystem,
+but the VFS interface forces all filesystems to share the problem.
+This restriction against locking between calls also
+forces duplication of work during file creation and deletion.
+This is considered unacceptable.
+.NH
+Support facilities and other interactions
+.PP
+Several support facilities are used by the current
+.UX
+filesystem and require generalization for use by other filesystem types.
+For filesystem implementations to be portable,
+it is desirable that these modified support facilities
+should also have a uniform interface and
+behave in a consistent manner in target systems.
+A prominent example is the filesystem buffer cache.
+The buffer cache in a standard (System V or 4.3BSD)
+.UX
+system contains physical disk blocks with no reference to the files containing
+them.
+This works well for the local filesystem, but has obvious problems
+for remote filesystems.
+Sun has modified the buffer cache routines to describe buffers by vnode
+rather than by device.
+For remote files, the vnode used is that of the file, and the block
+numbers are virtual data blocks.
+For local filesystems, a vnode for the block device is used for cache reference,
+and the block numbers are filesystem physical blocks.
+Use of per-file cache description does not easily accommodate
+caching of indirect blocks, inode blocks, superblocks or cylinder group blocks.
+However, the vnode describing the block device for the cache
+is one created internally,
+rather than the vnode for the device looked up when mounting,
+and it is located by searching a private list of vnodes
+rather than by holding it in the mount structure.
+Although the Sun modification makes it possible to use the buffer
+cache for data blocks of remote files, a better generalization
+of the buffer cache is needed.
+.PP
+The RFS filesystem used by AT&T does not currently cache data blocks
+on client systems, thus the buffer cache is probably unmodified.
+The form of the buffer cache in ULTRIX is unknown to us.
+.PP
+Another subsystem that has a large interaction with the filesystem
+is the virtual memory system.
+The virtual memory system must read data from the filesystem
+to satisfy fill-on-demand page faults.
+For efficiency, this read call is arranged to place the data directly
+into the physical pages assigned to the process (a ``raw'' read) to avoid
+copying the data.
+Although the read operation normally bypasses the filesystem buffer cache,
+consistency must be maintained by checking the buffer cache and copying
+or flushing modified data not yet stored on disk.
+The 4.2BSD virtual memory system, like that of Sun and ULTRIX,
+maintains its own cache of reusable text pages.
+This creates additional complications.
+As the virtual memory systems are redesigned, these problems should be
+resolved by reading through the buffer cache, then mapping the cached
+data into the user address space.
+If the buffer cache or the process pages are changed while the other reference
+remains, the data would have to be copied (``copy-on-write'').
+.PP
+In the meantime, the current virtual memory systems must be used
+with the new filesystem framework.
+Both the Sun and AT&T filesystem interfaces
+provide entry points to the filesystem for optimization of the virtual
+memory system by performing logical-to-physical block number translation
+when setting up a fill-on-demand image for a process.
+The VFS provides a vnode operation analogous to the \fIbmap\fP function of the
+.UX
+filesystem.
+Given a vnode and logical block number, it returns a vnode and block number
+which may be read to obtain the data.
+If the filesystem is local, it returns the private vnode for the block device
+and the physical block number.
+As the \fIbmap\fP operations are all performed at one time, during process
+startup, any indirect blocks for the file will remain in the cache
+after they are once read.
+In addition, the interface provides a \fIstrategy\fP entry that may be used
+for ``raw'' reads from a filesystem device,
+used to read data blocks into an address space without copying.
+This entry uses a buffer header (\fIbuf\fP structure)
+to describe the I/O operation
+instead of a \fIuio\fP structure.
+The buffer-style interface is the same as that used by disk drivers internally.
+This difference allows the current \fIuio\fP primitives to be avoided,
+as they copy all data to/from the current user process address space.
+Instead, for local filesystems these operations could be done internally
+with the standard raw disk read routines,
+which use a \fIuio\fP interface.
+When loading from a remote filesystem,
+the data will be received in a network buffer.
+If network buffers are suitably aligned,
+the data may be mapped into the process address space by a page swap
+without copying.
+In either case, it should be possible to use the standard filesystem
+read entry from the virtual memory system.
+.PP
+Other issues that must be considered in devising a portable
+filesystem implementation include kernel memory allocation,
+the implicit use of user-structure global context,
+which may create problems with reentrancy,
+the style of the system call interface,
+and the conventions for synchronization
+(sleep/wakeup, handling of interrupted system calls, semaphores).
+.NH
+The Berkeley Proposal
+.PP
+The Sun VFS interface has been most widely used of the three described here.
+It is also the most general of the three, in that filesystem-specific
+data and operations are best separated from the generic layer.
+Although it has several disadvantages which were described above,
+most of them may be corrected with minor changes to the interface
+(and, in a few areas, philosophical changes).
+The DEC GFS has other advantages, in particular the use of the 4.3BSD
+\fInamei\fP interface and optimizations.
+It allows single or multiple components of a pathname
+to be translated in a single call to the specific filesystem
+and thus accommodates filesystems with either preference.
+The FSS is least well understood, as there is little public information
+about the interface.
+However, the design goals are the least consistent with those of the Berkeley
+research groups.
+Accordingly, a new filesystem interface has been devised to avoid
+some of the problems in the other systems.
+The proposed interface derives directly from Sun's VFS,
+but, like GFS, uses a 4.3BSD-style name lookup interface.
+Additional context information has been moved from the \fIuser\fP structure
+to the \fInameidata\fP structure so that name translation may be independent
+of the global context of a user process.
+This is especially desired in any system where kernel-mode servers
+operate as light-weight or interrupt-level processes,
+or where a server may store or cache context for several clients.
+This calling interface has the additional advantage
+that the call parameters need not all be pushed onto the stack for each call
+through the filesystem interface,
+and they may be accessed using short offsets from a base pointer
+(unlike global variables in the \fIuser\fP structure).
+.PP
+The proposed filesystem interface is described very tersely here.
+For the most part, data structures and procedures are analogous
+to those used by VFS, and only the changes will be treated here.
+See [Kleiman86] for complete descriptions of the vfs and vnode operations
+in Sun's interface.
+.PP
+The central data structure for name translation is the \fInameidata\fP
+structure.
+The same structure is used to pass parameters to \fInamei\fP,
+to pass these same parameters to filesystem-specific lookup routines,
+to communicate completion status from the lookup routines back to \fInamei\fP,
+and to return completion status to the calling routine.
+For creation or deletion requests, the parameters to the filesystem operation
+to complete the request are also passed in this same structure.
+The form of the \fInameidata\fP structure is:
+.br
+.ne 2i
+.ID
+.nf
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+/*
+ * Encapsulation of namei parameters.
+ * One of these is located in the u. area to
+ * minimize space allocated on the kernel stack
+ * and to retain per-process context.
+ */
+struct nameidata {
+ /* arguments to namei and related context: */
+ caddr_t ni_dirp; /* pathname pointer */
+ enum uio_seg ni_seg; /* location of pathname */
+ short ni_nameiop; /* see below */
+ struct vnode *ni_cdir; /* current directory */
+ struct vnode *ni_rdir; /* root directory, if not normal root */
+ struct ucred *ni_cred; /* credentials */
+
+ /* shared between namei, lookup routines and commit routines: */
+ caddr_t ni_pnbuf; /* pathname buffer */
+ char *ni_ptr; /* current location in pathname */
+ int ni_pathlen; /* remaining chars in path */
+ short ni_more; /* more left to translate in pathname */
+ short ni_loopcnt; /* count of symlinks encountered */
+
+ /* results: */
+ struct vnode *ni_vp; /* vnode of result */
+ struct vnode *ni_dvp; /* vnode of intermediate directory */
+
+/* BEGIN UFS SPECIFIC */
+ struct diroffcache { /* last successful directory search */
+ struct vnode *nc_prevdir; /* terminal directory */
+ long nc_id; /* directory's unique id */
+ off_t nc_prevoffset; /* where last entry found */
+ } ni_nc;
+/* END UFS SPECIFIC */
+};
+.DE
+.DS
+.ta \w'#define\0\0'u +\w'WANTPARENT\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+/*
+ * namei operations and modifiers
+ */
+#define LOOKUP 0 /* perform name lookup only */
+#define CREATE 1 /* setup for file creation */
+#define DELETE 2 /* setup for file deletion */
+#define WANTPARENT 0x10 /* return parent directory vnode also */
+#define NOCACHE 0x20 /* name must not be left in cache */
+#define FOLLOW 0x40 /* follow symbolic links */
+#define NOFOLLOW 0x0 /* don't follow symbolic links (pseudo) */
+.DE
+As in current systems other than Sun's VFS, \fInamei\fP is called
+with an operation request, one of LOOKUP, CREATE or DELETE.
+For a LOOKUP, the operation is exactly like the lookup in VFS.
+CREATE and DELETE allow the filesystem to ensure consistency
+by locking the parent inode (private to the filesystem),
+and (for the local filesystem) to avoid duplicate directory scans
+by storing the new directory entry and its offset in the directory
+in the \fIndirinfo\fP structure.
+This is intended to be opaque to the filesystem-independent levels.
+Not all lookups for creation or deletion are actually followed
+by the intended operation; permission may be denied, the filesystem
+may be read-only, etc.
+Therefore, an entry point to the filesystem is provided
+to abort a creation or deletion operation
+and allow release of any locked internal data.
+After a \fInamei\fP with a CREATE or DELETE flag, the pathname pointer
+is set to point to the last filename component.
+Filesystems that choose to implement creation or deletion entirely
+within the subsequent call to a create or delete entry
+are thus free to do so.
+.PP
+The \fInameidata\fP is used to store context used during name translation.
+The current and root directories for the translation are stored here.
+For the local filesystem, the per-process directory offset cache
+is also kept here.
+A file server could leave the directory offset cache empty,
+could use a single cache for all clients,
+or could hold caches for several recent clients.
+.PP
+Several other data structures are used in the filesystem operations.
+One is the \fIucred\fP structure which describes a client's credentials
+to the filesystem.
+This is modified slightly from the Sun structure;
+the ``accounting'' group ID has been merged into the groups array.
+The actual number of groups in the array is given explicitly
+to avoid use of a reserved group ID as a terminator.
+Also, typedefs introduced in 4.3BSD for user and group ID's have been used.
+The \fIucred\fP structure is thus:
+.DS
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+/*
+ * Credentials.
+ */
+struct ucred {
+ u_short cr_ref; /* reference count */
+ uid_t cr_uid; /* effective user id */
+ short cr_ngroups; /* number of groups */
+ gid_t cr_groups[NGROUPS]; /* groups */
+ /*
+ * The following either should not be here,
+ * or should be treated as opaque.
+ */
+ uid_t cr_ruid; /* real user id */
+ gid_t cr_svgid; /* saved set-group id */
+};
+.DE
+.PP
+A final structure used by the filesystem interface is the \fIuio\fP
+structure mentioned earlier.
+This structure describes the source or destination of an I/O
+operation, with provision for scatter/gather I/O.
+It is used in the read and write entries to the filesystem.
+The \fIuio\fP structure presented here is modified from the one
+used in 4.2BSD to specify the location of each vector of the operation
+(user or kernel space)
+and to allow an alternate function to be used to implement the data movement.
+The alternate function might perform page remapping rather than a copy,
+for example.
+.DS
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+/*
+ * Description of an I/O operation which potentially
+ * involves scatter-gather, with individual sections
+ * described by iovec, below. uio_resid is initially
+ * set to the total size of the operation, and is
+ * decremented as the operation proceeds. uio_offset
+ * is incremented by the amount of each operation.
+ * uio_iov is incremented and uio_iovcnt is decremented
+ * after each vector is processed.
+ */
+struct uio {
+ struct iovec *uio_iov;
+ int uio_iovcnt;
+ off_t uio_offset;
+ int uio_resid;
+ enum uio_rw uio_rw;
+};
+
+enum uio_rw { UIO_READ, UIO_WRITE };
+.DE
+.DS
+.ta .5i +\w'caddr_t\0\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+/*
+ * Description of a contiguous section of an I/O operation.
+ * If iov_op is non-null, it is called to implement the copy
+ * operation, possibly by remapping, with the call
+ * (*iov_op)(from, to, count);
+ * where from and to are caddr_t and count is int.
+ * Otherwise, the copy is done in the normal way,
+ * treating base as a user or kernel virtual address
+ * according to iov_segflg.
+ */
+struct iovec {
+ caddr_t iov_base;
+ int iov_len;
+ enum uio_seg iov_segflg;
+ int (*iov_op)();
+};
+.DE
+.DS
+.ta .5i +\w'UIO_USERSPACE\0\0\0\0\0'u
+/*
+ * Segment flag values.
+ */
+enum uio_seg {
+ UIO_USERSPACE, /* from user data space */
+ UIO_SYSSPACE, /* from system space */
+};
+.DE
+.NH
+File and filesystem operations
+.PP
+With the introduction of the data structures used by the filesystem
+operations, the complete list of filesystem entry points may be listed.
+As noted, they derive mostly from the Sun VFS interface.
+Lines marked with \fB+\fP are additions to the Sun definitions;
+lines marked with \fB!\fP are modified from VFS.
+.PP
+The structure describing the externally-visible features of a mounted
+filesystem, \fIvfs\fP, is:
+.DS
+.ta .5i +\w'struct vfsops\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+/*
+ * Structure per mounted file system.
+ * Each mounted file system has an array of
+ * operations and an instance record.
+ * The file systems are put on a doubly linked list.
+ */
+struct vfs {
+ struct vfs *vfs_next; /* next vfs in vfs list */
+\fB+\fP struct vfs *vfs_prev; /* prev vfs in vfs list */
+ struct vfsops *vfs_op; /* operations on vfs */
+ struct vnode *vfs_vnodecovered; /* vnode we mounted on */
+ int vfs_flag; /* flags */
+\fB!\fP int vfs_fsize; /* fundamental block size */
+\fB+\fP int vfs_bsize; /* optimal transfer size */
+\fB!\fP uid_t vfs_exroot; /* exported fs uid 0 mapping */
+ short vfs_exflags; /* exported fs flags */
+ caddr_t vfs_data; /* private data */
+};
+.DE
+.DS
+.ta \w'\fB+\fP 'u +\w'#define\0\0'u +\w'VFS_EXPORTED\0\0'u +\w'0x40\0\0\0\0\0'u
+ /*
+ * vfs flags.
+ * VFS_MLOCK lock the vfs so that name lookup cannot proceed past the vfs.
+ * This keeps the subtree stable during mounts and unmounts.
+ */
+ #define VFS_RDONLY 0x01 /* read only vfs */
+\fB+\fP #define VFS_NOEXEC 0x02 /* can't exec from filesystem */
+ #define VFS_MLOCK 0x04 /* lock vfs so that subtree is stable */
+ #define VFS_MWAIT 0x08 /* someone is waiting for lock */
+ #define VFS_NOSUID 0x10 /* don't honor setuid bits on vfs */
+ #define VFS_EXPORTED 0x20 /* file system is exported (NFS) */
+
+ /*
+ * exported vfs flags.
+ */
+ #define EX_RDONLY 0x01 /* exported read only */
+.DE
+.LP
+The operations supported by the filesystem-specific layer
+on an individual filesystem are:
+.DS
+.ta .5i +\w'struct vfsops\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+/*
+ * Operations supported on virtual file system.
+ */
+struct vfsops {
+\fB!\fP int (*vfs_mount)( /* vfs, path, data, datalen */ );
+\fB!\fP int (*vfs_unmount)( /* vfs, forcibly */ );
+\fB+\fP int (*vfs_mountroot)();
+ int (*vfs_root)( /* vfs, vpp */ );
+\fB!\fP int (*vfs_statfs)( /* vfs, vp, sbp */ );
+\fB!\fP int (*vfs_sync)( /* vfs, waitfor */ );
+\fB+\fP int (*vfs_fhtovp)( /* vfs, fhp, vpp */ );
+\fB+\fP int (*vfs_vptofh)( /* vp, fhp */ );
+};
+.DE
+.LP
+The \fIvfs_statfs\fP entry returns a structure of the form:
+.DS
+.ta .5i +\w'struct vfsops\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+/*
+ * file system statistics
+ */
+struct statfs {
+\fB!\fP short f_type; /* type of filesystem */
+\fB+\fP short f_flags; /* copy of vfs (mount) flags */
+\fB!\fP long f_fsize; /* fundamental file system block size */
+\fB+\fP long f_bsize; /* optimal transfer block size */
+ long f_blocks; /* total data blocks in file system */
+ long f_bfree; /* free blocks in fs */
+ long f_bavail; /* free blocks avail to non-superuser */
+ long f_files; /* total file nodes in file system */
+ long f_ffree; /* free file nodes in fs */
+ fsid_t f_fsid; /* file system id */
+\fB+\fP char *f_mntonname; /* directory on which mounted */
+\fB+\fP char *f_mntfromname; /* mounted filesystem */
+ long f_spare[7]; /* spare for later */
+};
+
+typedef long fsid_t[2]; /* file system id type */
+.DE
+.LP
+The modifications to Sun's interface at this level are minor.
+Additional arguments are present for the \fIvfs_mount\fP and \fIvfs_umount\fP
+entries.
+\fIvfs_statfs\fP accepts a vnode as well as filesystem identifier,
+as the information may not be uniform throughout a filesystem.
+For example,
+if a client may mount a file tree that spans multiple physical
+filesystems on a server, different sections may have different amounts
+of free space.
+(NFS does not allow remotely-mounted file trees to span physical filesystems
+on the server.)
+The final additions are the entries that support file handles.
+\fIvfs_vptofh\fP is provided for the use of file servers,
+which need to obtain an opaque
+file handle to represent the current vnode for transmission to clients.
+This file handle may later be used to relocate the vnode using \fIvfs_fhtovp\fP
+without requiring the vnode to remain in memory.
+.PP
+Finally, the external form of a filesystem object, the \fIvnode\fP, is:
+.DS
+.ta .5i +\w'struct vnodeops\0\0'u +\w'*v_vfsmountedhere;\0\0\0'u
+/*
+ * vnode types. VNON means no type.
+ */
+enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK };
+
+struct vnode {
+ u_short v_flag; /* vnode flags (see below) */
+ u_short v_count; /* reference count */
+ u_short v_shlockc; /* count of shared locks */
+ u_short v_exlockc; /* count of exclusive locks */
+ struct vfs *v_vfsmountedhere; /* ptr to vfs mounted here */
+ struct vfs *v_vfsp; /* ptr to vfs we are in */
+ struct vnodeops *v_op; /* vnode operations */
+\fB+\fP struct text *v_text; /* text/mapped region */
+ enum vtype v_type; /* vnode type */
+ caddr_t v_data; /* private data for fs */
+};
+.DE
+.DS
+.ta \w'#define\0\0'u +\w'NOFOLLOW\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+/*
+ * vnode flags.
+ */
+#define VROOT 0x01 /* root of its file system */
+#define VTEXT 0x02 /* vnode is a pure text prototype */
+#define VEXLOCK 0x10 /* exclusive lock */
+#define VSHLOCK 0x20 /* shared lock */
+#define VLWAIT 0x40 /* proc is waiting on shared or excl. lock */
+.DE
+.LP
+The operations supported by the filesystems on individual \fIvnode\fP\^s
+are:
+.DS
+.ta .5i +\w'int\0\0\0\0\0'u +\w'(*vn_getattr)(\0\0\0\0\0'u
+/*
+ * Operations on vnodes.
+ */
+struct vnodeops {
+\fB!\fP int (*vn_lookup)( /* ndp */ );
+\fB!\fP int (*vn_create)( /* ndp, vap, fflags */ );
+\fB+\fP int (*vn_mknod)( /* ndp, vap, fflags */ );
+\fB!\fP int (*vn_open)( /* vp, fflags, cred */ );
+ int (*vn_close)( /* vp, fflags, cred */ );
+ int (*vn_access)( /* vp, fflags, cred */ );
+ int (*vn_getattr)( /* vp, vap, cred */ );
+ int (*vn_setattr)( /* vp, vap, cred */ );
+
+\fB+\fP int (*vn_read)( /* vp, uiop, offp, ioflag, cred */ );
+\fB+\fP int (*vn_write)( /* vp, uiop, offp, ioflag, cred */ );
+\fB!\fP int (*vn_ioctl)( /* vp, com, data, fflag, cred */ );
+ int (*vn_select)( /* vp, which, cred */ );
+\fB+\fP int (*vn_mmap)( /* vp, ..., cred */ );
+ int (*vn_fsync)( /* vp, cred */ );
+\fB+\fP int (*vn_seek)( /* vp, offp, off, whence */ );
+
+\fB!\fP int (*vn_remove)( /* ndp */ );
+\fB!\fP int (*vn_link)( /* vp, ndp */ );
+\fB!\fP int (*vn_rename)( /* src ndp, target ndp */ );
+\fB!\fP int (*vn_mkdir)( /* ndp, vap */ );
+\fB!\fP int (*vn_rmdir)( /* ndp */ );
+\fB!\fP int (*vn_symlink)( /* ndp, vap, nm */ );
+ int (*vn_readdir)( /* vp, uiop, offp, ioflag, cred */ );
+ int (*vn_readlink)( /* vp, uiop, ioflag, cred */ );
+
+\fB+\fP int (*vn_abortop)( /* ndp */ );
+\fB+\fP int (*vn_lock)( /* vp */ );
+\fB+\fP int (*vn_unlock)( /* vp */ );
+\fB!\fP int (*vn_inactive)( /* vp */ );
+};
+.DE
+.DS
+.ta \w'#define\0\0'u +\w'NOFOLLOW\0\0'u +\w'0x40\0\0\0\0\0'u
+/*
+ * flags for ioflag
+ */
+#define IO_UNIT 0x01 /* do io as atomic unit for VOP_RDWR */
+#define IO_APPEND 0x02 /* append write for VOP_RDWR */
+#define IO_SYNC 0x04 /* sync io for VOP_RDWR */
+.DE
+.LP
+The argument types listed in the comments following each operation are:
+.sp
+.IP ndp 10
+A pointer to a \fInameidata\fP structure.
+.IP vap
+A pointer to a \fIvattr\fP structure (vnode attributes; see below).
+.IP fflags
+File open flags, possibly including O_APPEND, O_CREAT, O_TRUNC and O_EXCL.
+.IP vp
+A pointer to a \fIvnode\fP previously obtained with \fIvn_lookup\fP.
+.IP cred
+A pointer to a \fIucred\fP credentials structure.
+.IP uiop
+A pointer to a \fIuio\fP structure.
+.IP ioflag
+Any of the IO flags defined above.
+.IP com
+An \fIioctl\fP command, with type \fIunsigned long\fP.
+.IP data
+A pointer to a character buffer used to pass data to or from an \fIioctl\fP.
+.IP which
+One of FREAD, FWRITE or 0 (select for exceptional conditions).
+.IP off
+A file offset of type \fIoff_t\fP.
+.IP offp
+A pointer to file offset of type \fIoff_t\fP.
+.IP whence
+One of L_SET, L_INCR, or L_XTND.
+.IP fhp
+A pointer to a file handle buffer.
+.sp
+.PP
+Several changes have been made to Sun's set of vnode operations.
+Most obviously, the \fIvn_lookup\fP receives a \fInameidata\fP structure
+containing its arguments and context as described.
+The same structure is also passed to one of the creation or deletion
+entries if the lookup operation is for CREATE or DELETE to complete
+an operation, or to the \fIvn_abortop\fP entry if no operation
+is undertaken.
+For filesystems that perform no locking between lookup for creation
+or deletion and the call to implement that action,
+the final pathname component may be left untranslated by the lookup
+routine.
+In any case, the pathname pointer points at the final name component,
+and the \fInameidata\fP contains a reference to the vnode of the parent
+directory.
+The interface is thus flexible enough to accommodate filesystems
+that are fully stateful or fully stateless, while avoiding redundant
+operations whenever possible.
+One operation remains problematical, the \fIvn_rename\fP call.
+It is tempting to look up the source of the rename for deletion
+and the target for creation.
+However, filesystems that lock directories during such lookups must avoid
+deadlock if the two paths cross.
+For that reason, the source is translated for LOOKUP only,
+with the WANTPARENT flag set;
+the target is then translated with an operation of CREATE.
+.PP
+In addition to the changes concerned with the \fInameidata\fP interface,
+several other changes were made in the vnode operations.
+The \fIvn_rdrw\fP entry was split into \fIvn_read\fP and \fIvn_write\fP;
+frequently, the read/write entry amounts to a routine that checks
+the direction flag, then calls either a read routine or a write routine.
+The two entries may be identical for any given filesystem;
+the direction flag is contained in the \fIuio\fP given as an argument.
+.PP
+All of the read and write operations use a \fIuio\fP to describe
+the file offset and buffer locations.
+All of these fields must be updated before return.
+In particular, the \fIvn_readdir\fP entry uses this
+to return a new file offset token for its current location.
+.PP
+Several new operations have been added.
+The first, \fIvn_seek\fP, is a concession to record-oriented files
+such as directories.
+It allows the filesystem to verify that a seek leaves a file at a sensible
+offset, or to return a new offset token relative to an earlier one.
+For most filesystems and files, this operation amounts to performing
+simple arithmetic.
+Another new entry point is \fIvn_mmap\fP, for use in mapping device memory
+into a user process address space.
+Its semantics are not yet decided.
+The final additions are the \fIvn_lock\fP and \fIvn_unlock\fP entries.
+These are used to request that the underlying file be locked against
+changes for short periods of time if the filesystem implementation allows it.
+They are used to maintain consistency
+during internal operations such as \fIexec\fP,
+and may not be used to construct atomic operations from other filesystem
+operations.
+.PP
+The attributes of a vnode are not stored in the vnode,
+as they might change with time and may need to be read from a remote
+source.
+Attributes have the form:
+.DS
+.ta .5i +\w'struct vnodeops\0\0'u +\w'*v_vfsmountedhere;\0\0\0'u
+/*
+ * Vnode attributes. A field value of -1
+ * represents a field whose value is unavailable
+ * (getattr) or which is not to be changed (setattr).
+ */
+struct vattr {
+ enum vtype va_type; /* vnode type (for create) */
+ u_short va_mode; /* files access mode and type */
+\fB!\fP uid_t va_uid; /* owner user id */
+\fB!\fP gid_t va_gid; /* owner group id */
+ long va_fsid; /* file system id (dev for now) */
+\fB!\fP long va_fileid; /* file id */
+ short va_nlink; /* number of references to file */
+ u_long va_size; /* file size in bytes (quad?) */
+\fB+\fP u_long va_size1; /* reserved if not quad */
+ long va_blocksize; /* blocksize preferred for i/o */
+ struct timeval va_atime; /* time of last access */
+ struct timeval va_mtime; /* time of last modification */
+ struct timeval va_ctime; /* time file changed */
+ dev_t va_rdev; /* device the file represents */
+ u_long va_bytes; /* bytes of disk space held by file */
+\fB+\fP u_long va_bytes1; /* reserved if va_bytes not a quad */
+};
+.DE
+.NH
+Conclusions
+.PP
+The Sun VFS filesystem interface is the most widely used generic
+filesystem interface.
+Of the interfaces examined, it creates the cleanest separation
+between the filesystem-independent and -dependent layers and data structures.
+It has several flaws, but it is felt that certain changes in the interface
+can ameliorate most of them.
+The interface proposed here includes those changes.
+The proposed interface is now being implemented by the Computer Systems
+Research Group at Berkeley.
+If the design succeeds in improving the flexibility and performance
+of the filesystem layering, it will be advanced as a model interface.
+.NH
+Acknowledgements
+.PP
+The filesystem interface described here is derived from Sun's VFS interface.
+It also includes features similar to those of DEC's GFS interface.
+We are indebted to members of the Sun and DEC system groups
+for long discussions of the issues involved.
+.br
+.ne 2i
+.NH
+References
+
+.IP Brownbridge82 \w'Satyanarayanan85\0\0'u
+Brownbridge, D.R., L.F. Marshall, B. Randell,
+``The Newcastle Connection, or UNIXes of the World Unite!,''
+\fISoftware\- Practice and Experience\fP, Vol. 12, pp. 1147-1162, 1982.
+
+.IP Cole85
+Cole, C.T., P.B. Flinn, A.B. Atlas,
+``An Implementation of an Extended File System for UNIX,''
+\fIUsenix Conference Proceedings\fP,
+pp. 131-150, June, 1985.
+
+.IP Kleiman86
+Kleiman, S.R.,
+``Vnodes: An Architecture for Multiple File System Types in Sun UNIX,''
+\fIUsenix Conference Proceedings\fP,
+pp. 238-247, June, 1986.
+
+.IP Leffler84
+Leffler, S., M.K. McKusick, M. Karels,
+``Measuring and Improving the Performance of 4.2BSD,''
+\fIUsenix Conference Proceedings\fP, pp. 237-252, June, 1984.
+
+.IP McKusick84
+McKusick, M.K., W.N. Joy, S.J. Leffler, R.S. Fabry,
+``A Fast File System for UNIX,'' \fITransactions on Computer Systems\fP,
+Vol. 2, pp. 181-197,
+ACM, August, 1984.
+
+.IP McKusick85
+McKusick, M.K., M. Karels, S. Leffler,
+``Performance Improvements and Functional Enhancements in 4.3BSD,''
+\fIUsenix Conference Proceedings\fP, pp. 519-531, June, 1985.
+
+.IP Rifkin86
+Rifkin, A.P., M.P. Forbes, R.L. Hamilton, M. Sabrio, S. Shah, and K. Yueh,
+``RFS Architectural Overview,'' \fIUsenix Conference Proceedings\fP,
+pp. 248-259, June, 1986.
+
+.IP Ritchie74
+Ritchie, D.M. and K. Thompson, ``The Unix Time-Sharing System,''
+\fICommunications of the ACM\fP, Vol. 17, pp. 365-375, July, 1974.
+
+.IP Rodriguez86
+Rodriguez, R., M. Koehler, R. Hyde,
+``The Generic File System,'' \fIUsenix Conference Proceedings\fP,
+pp. 260-269, June, 1986.
+
+.IP Sandberg85
+Sandberg, R., D. Goldberg, S. Kleiman, D. Walsh, B. Lyon,
+``Design and Implementation of the Sun Network Filesystem,''
+\fIUsenix Conference Proceedings\fP,
+pp. 119-130, June, 1985.
+
+.IP Satyanarayanan85
+Satyanarayanan, M., \fIet al.\fP,
+``The ITC Distributed File System: Principles and Design,''
+\fIProc. 10th Symposium on Operating Systems Principles\fP, pp. 35-50,
+ACM, December, 1985.
+
+.IP Walker85
+Walker, B.J. and S.H. Kiser, ``The LOCUS Distributed Filesystem,''
+\fIThe LOCUS Distributed System Architecture\fP,
+G.J. Popek and B.J. Walker, ed., The MIT Press, Cambridge, MA, 1985.
+
+.IP Weinberger84
+Weinberger, P.J., ``The Version 8 Network File System,''
+\fIUsenix Conference presentation\fP,
+June, 1984.
diff --git a/share/doc/papers/fsinterface/slides.t b/share/doc/papers/fsinterface/slides.t
new file mode 100644
index 000000000000..4c32930cc122
--- /dev/null
+++ b/share/doc/papers/fsinterface/slides.t
@@ -0,0 +1,312 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.so macros
+.nf
+.LL
+Encapsulation of namei parameters
+.NP 0
+.ta .5i +\w'caddr_t\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+struct nameidata {
+ /* arguments and context: */
+ caddr_t ni_dirp;
+ enum uio_seg ni_seg;
+ short ni_nameiop;
+ struct vnode *ni_cdir;
+ struct vnode *ni_rdir;
+ struct ucred *ni_cred;
+.sp .2
+ /* shared with lookup and commit: */
+ caddr_t ni_pnbuf;
+ char *ni_ptr;
+ int ni_pathlen;
+ short ni_more;
+ short ni_loopcnt;
+.sp .2
+ /* results: */
+ struct vnode *ni_vp;
+ struct vnode *ni_dvp;
+.sp .2
+/* BEGIN UFS SPECIFIC */
+ struct diroffcache {
+ struct vnode *nc_prevdir;
+ long nc_id;
+ off_t nc_prevoffset;
+ } ni_nc;
+/* END UFS SPECIFIC */
+};
+.bp
+
+
+.LL
+Namei operations and modifiers
+
+.NP 0
+.ta \w'#define\0\0'u +\w'WANTPARENT\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+#define LOOKUP 0 /* name lookup only */
+#define CREATE 1 /* setup for creation */
+#define DELETE 2 /* setup for deletion */
+#define WANTPARENT 0x10 /* return parent vnode also */
+#define NOCACHE 0x20 /* remove name from cache */
+#define FOLLOW 0x40 /* follow symbolic links */
+.bp
+
+.LL
+Namei operations and modifiers
+
+.NP 0
+.ta \w'#define\0\0'u +\w'WANTPARENT\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+#define LOOKUP 0
+#define CREATE 1
+#define DELETE 2
+#define WANTPARENT 0x10
+#define NOCACHE 0x20
+#define FOLLOW 0x40
+.bp
+
+
+.LL
+Credentials
+
+.NP 0
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+struct ucred {
+ u_short cr_ref;
+ uid_t cr_uid;
+ short cr_ngroups;
+ gid_t cr_groups[NGROUPS];
+ /*
+ * The following either should not be here,
+ * or should be treated as opaque.
+ */
+ uid_t cr_ruid;
+ gid_t cr_svgid;
+};
+.bp
+.LL
+Scatter-gather I/O
+.NP 0
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+struct uio {
+ struct iovec *uio_iov;
+ int uio_iovcnt;
+ off_t uio_offset;
+ int uio_resid;
+ enum uio_rw uio_rw;
+};
+
+enum uio_rw { UIO_READ, UIO_WRITE };
+
+
+
+.ta .5i +\w'caddr_t\0\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+struct iovec {
+ caddr_t iov_base;
+ int iov_len;
+ enum uio_seg iov_segflg;
+ int (*iov_op)();
+};
+.bp
+.LL
+Per-filesystem information
+.NP 0
+.ta .25i +\w'struct vfsops\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+struct vfs {
+ struct vfs *vfs_next;
+\fB+\fP struct vfs *vfs_prev;
+ struct vfsops *vfs_op;
+ struct vnode *vfs_vnodecovered;
+ int vfs_flag;
+\fB!\fP int vfs_fsize;
+\fB+\fP int vfs_bsize;
+\fB!\fP uid_t vfs_exroot;
+ short vfs_exflags;
+ caddr_t vfs_data;
+};
+
+.NP 0
+.ta \w'\fB+\fP 'u +\w'#define\0\0'u +\w'VFS_EXPORTED\0\0'u +\w'0x40\0\0\0\0\0'u
+ /* vfs flags: */
+ #define VFS_RDONLY 0x01
+\fB+\fP #define VFS_NOEXEC 0x02
+ #define VFS_MLOCK 0x04
+ #define VFS_MWAIT 0x08
+ #define VFS_NOSUID 0x10
+ #define VFS_EXPORTED 0x20
+
+ /* exported vfs flags: */
+ #define EX_RDONLY 0x01
+.bp
+
+
+.LL
+Operations supported on virtual file system.
+
+.NP 0
+.ta .25i +\w'int\0\0'u +\w'*vfs_mountroot();\0'u
+struct vfsops {
+\fB!\fP int (*vfs_mount)(vfs, path, data, len);
+\fB!\fP int (*vfs_unmount)(vfs, forcibly);
+\fB+\fP int (*vfs_mountroot)();
+ int (*vfs_root)(vfs, vpp);
+ int (*vfs_statfs)(vfs, sbp);
+\fB!\fP int (*vfs_sync)(vfs, waitfor);
+\fB+\fP int (*vfs_fhtovp)(vfs, fhp, vpp);
+\fB+\fP int (*vfs_vptofh)(vp, fhp);
+};
+.bp
+
+
+.LL
+Dynamic file system information
+
+.NP 0
+.ta .5i +\w'struct\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+struct statfs {
+\fB!\fP short f_type;
+\fB+\fP short f_flags;
+\fB!\fP long f_fsize;
+\fB+\fP long f_bsize;
+ long f_blocks;
+ long f_bfree;
+ long f_bavail;
+ long f_files;
+ long f_ffree;
+ fsid_t f_fsid;
+\fB+\fP char *f_mntonname;
+\fB+\fP char *f_mntfromname;
+ long f_spare[7];
+};
+
+typedef long fsid_t[2];
+.bp
+.LL
+Filesystem objects (vnodes)
+.NP 0
+.ta .25i +\w'struct vnodeops\0\0'u +\w'*v_vfsmountedhere;\0\0\0'u
+enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK };
+
+struct vnode {
+ u_short v_flag;
+ u_short v_count;
+ u_short v_shlockc;
+ u_short v_exlockc;
+ struct vfs *v_vfsmountedhere;
+ struct vfs *v_vfsp;
+ struct vnodeops *v_op;
+\fB+\fP struct text *v_text;
+ enum vtype v_type;
+ caddr_t v_data;
+};
+.ta \w'#define\0\0'u +\w'NOFOLLOW\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+
+/* vnode flags */
+#define VROOT 0x01
+#define VTEXT 0x02
+#define VEXLOCK 0x10
+#define VSHLOCK 0x20
+#define VLWAIT 0x40
+.bp
+.LL
+Operations on vnodes
+
+.NP 0
+.ta .25i +\w'int\0\0'u +\w'(*vn_getattr)(\0\0\0\0\0'u
+struct vnodeops {
+\fB!\fP int (*vn_lookup)(ndp);
+\fB!\fP int (*vn_create)(ndp, vap, fflags);
+\fB+\fP int (*vn_mknod)(ndp, vap, fflags);
+\fB!\fP int (*vn_open)(vp, fflags, cred);
+ int (*vn_close)(vp, fflags, cred);
+ int (*vn_access)(vp, fflags, cred);
+ int (*vn_getattr)(vp, vap, cred);
+ int (*vn_setattr)(vp, vap, cred);
+.sp .5
+\fB+\fP int (*vn_read)(vp, uiop,
+ offp, ioflag, cred);
+\fB+\fP int (*vn_write)(vp, uiop,
+ offp, ioflag, cred);
+\fB!\fP int (*vn_ioctl)(vp, com,
+ data, fflag, cred);
+ int (*vn_select)(vp, which, cred);
+\fB+\fP int (*vn_mmap)(vp, ..., cred);
+ int (*vn_fsync)(vp, cred);
+\fB+\fP int (*vn_seek)(vp, offp, off,
+ whence);
+.bp
+.LL
+Operations on vnodes (cont)
+
+.NP 0
+.ta .25i +\w'int\0\0'u +\w'(*vn_getattr)(\0\0\0\0\0'u
+
+\fB!\fP int (*vn_remove)(ndp);
+\fB!\fP int (*vn_link)(vp, ndp);
+\fB!\fP int (*vn_rename)(sndp, tndp);
+\fB!\fP int (*vn_mkdir)(ndp, vap);
+\fB!\fP int (*vn_rmdir)(ndp);
+\fB!\fP int (*vn_symlink)(ndp, vap, nm);
+\fB!\fP int (*vn_readdir)(vp, uiop,
+ offp, ioflag, cred);
+\fB!\fP int (*vn_readlink)(vp, uiop,
+ offp, ioflag, cred);
+.sp .5
+\fB+\fP int (*vn_abortop)(ndp);
+\fB!\fP int (*vn_inactive)(vp);
+};
+
+.NP 0
+.ta \w'#define\0\0'u +\w'NOFOLLOW\0\0'u +\w'0x40\0\0\0\0\0'u
+/* flags for ioflag */
+#define IO_UNIT 0x01
+#define IO_APPEND 0x02
+#define IO_SYNC 0x04
+.bp
+
+.LL
+Vnode attributes
+
+.NP 0
+.ta .5i +\w'struct timeval\0\0'u +\w'*v_vfsmountedhere;\0\0\0'u
+struct vattr {
+ enum vtype va_type;
+ u_short va_mode;
+\fB!\fP uid_t va_uid;
+\fB!\fP gid_t va_gid;
+ long va_fsid;
+\fB!\fP long va_fileid;
+ short va_nlink;
+ u_long va_size;
+\fB+\fP u_long va_size1;
+ long va_blocksize;
+ struct timeval va_atime;
+ struct timeval va_mtime;
+ struct timeval va_ctime;
+ dev_t va_rdev;
+\fB!\fP u_long va_bytes;
+\fB+\fP u_long va_bytes1;
+};
diff --git a/share/doc/papers/jail/Makefile b/share/doc/papers/jail/Makefile
new file mode 100644
index 000000000000..d25059929908
--- /dev/null
+++ b/share/doc/papers/jail/Makefile
@@ -0,0 +1,12 @@
+VOLUME= papers
+DOC= jail
+SRCS= paper.ms-patched
+EXTRA= implementation.ms mgt.ms future.ms jail01.eps
+MACROS= -ms
+USE_SOELIM=
+CLEANFILES= paper.ms-patched
+
+paper.ms-patched: paper.ms
+ sed "s;jail01\.eps;${.CURDIR}/&;" ${.ALLSRC} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/jail/future.ms b/share/doc/papers/jail/future.ms
new file mode 100644
index 000000000000..d738c4650849
--- /dev/null
+++ b/share/doc/papers/jail/future.ms
@@ -0,0 +1,102 @@
+.\"
+.NH
+Future Directions
+.PP
+The jail facility has already been deployed in numerous capacities and
+a few opportunities for improvement have manifested themselves.
+.NH 2
+Improved Virtualisation
+.PP
+As it stands, the jail code provides a strict subset of system resources
+to the jail environment, based on access to processes, files, network
+resources, and privileged services.
+Virtualisation, or making the jail environments appear to be fully
+functional FreeBSD systems, allows maximum application support and the
+ability to offer a wide range of services within a jail environment.
+However, there are a number of limitations on the degree of virtualisation
+in the current code, and removing these limitations will enhance the
+ability to offer services in a jail environment.
+Two areas that deserve greater attention are the virtualisation of
+network resources, and management of scheduling resources.
+.PP
+Currently, a single IP address may be allocated to each jail, and all
+communication from the jail is limited to that IP address.
+In particular, these addresses are IPv4 addresses.
+There has been substantial interest in improving interface virtualisation,
+allowing one or more addresses to be assigned to an interface, and
+removing the requirement that the address be an IPv4 address, allowing
+the use of IPv6.
+Also, access to raw sockets is currently prohibited, as the current
+implementation of raw sockets allows access to raw IP packets associated
+with all interfaces.
+Limiting the scope of the raw socket would allow its safe use within
+a jail, re-enabling support for ping, and other network debugging and
+evaluation tools.
+.PP
+Another area of great interest to the current consumers of the jail code
+is the ability to limit the impact of one jail on the CPU resources
+available for other jails.
+Specifically, this would require that the jail of a process play a role in
+its scheduling parameters.
+Prior work in the area of lottery scheduling, currently available as
+patches on FreeBSD 2.2.x, might be leveraged to allow some degree of
+partitioning between jail environments \s-2[LOTTERY1] [LOTTERY2]\s+2.
+However, as the current scheduling mechanism is targeted at time
+sharing, and FreeBSD does not currently support real time preemption
+of processes in kernel, complete partitioning is not possible within the
+current framework.
+.NH 2
+Improved Management
+.PP
+Management of jail environments is currently somewhat ad hoc--creating
+and starting jails is a well-documented procedure, but day-to-day
+management of jails, as well as special case procedures such as shutdown,
+are not well analysed and documented.
+The current kernel process management infrastructure does not have the
+ability to manage pools of processes in a jail-centric way.
+For example, it is possible to, within a jail, deliver a signal to all
+processes in a jail, but it is not possible to atomically target all
+processes within a jail from outside of the jail.
+If the jail code is to effectively limit the behaviour of a jail, the
+ability to shut it down cleanly is paramount.
+Similarly, shutting down a jail cleanly from within is also not well
+defined, the traditional shutdown utilities having been written with
+a host environment in mind.
+This suggests a number of improvements, both in the kernel and in the
+user-land utility set.
+.PP
+First, the ability to address kernel-centric management mechanisms at
+jails is important.
+One way in which this might be done is to assign a unique jail id, not
+unlike a process id or process group id, at jail creation time.
+A new jailkill() syscall would permit the direction of signals to
+specific jailids, allowing for the effective termination of all processes
+in the jail.
+A unique jailid could also supplant the hostname as the unique
+identifier for a jail, allowing the hostname to be changed by the
+processes in the jail without interfering with jail management.
+.PP
+More carefully defining the user-land semantics of a jail during startup
+and shutdown is also important.
+The traditional FreeBSD environment makes use of an init process to
+bring the system up during the boot process, and to assist in shutdown.
+A similar technique might be used for jail, in effect a jailinit,
+formulated to handle the clean startup and shutdown, including calling
+out to jail-local /etc/rc.shutdown, and other useful shutdown functions.
+A jailinit would also present a central location for delivering
+management requests to within a jail from the host environment, allowing
+the host environment to request the shutdown of the jail cleanly, before
+resorting to terminating processes, in the same style as the host
+environment shutting down before killing all processes and halting the
+kernel.
+.PP
+Improvements in the host environment would also assist in improving
+jail management, possibly including automated runtime jail management tools,
+tools to more easily construct the per-jail file system area, and
+include jail shutdown as part of normal system shutdown.
+.PP
+These improvements in the jail framework would improve both raw
+functionality and usability from a management perspective.
+The jail code has raised significant interest in the FreeBSD community,
+and it is hoped that this type of improved functionality will be
+available in upcoming releases of FreeBSD.
diff --git a/share/doc/papers/jail/implementation.ms b/share/doc/papers/jail/implementation.ms
new file mode 100644
index 000000000000..73416216da6d
--- /dev/null
+++ b/share/doc/papers/jail/implementation.ms
@@ -0,0 +1,124 @@
+.\"
+.NH
+Implementation of jail in the FreeBSD kernel.
+.NH 2
+The jail(2) system call, allocation, refcounting and deallocation of
+\fCstruct prison\fP.
+.PP
+The jail(2) system call is implemented as a non-optional system call
+in FreeBSD. Other system calls are controlled by compile time options
+in the kernel configuration file, but due to the minute footprint of
+the jail implementation, it was decided to make it a standard
+facility in FreeBSD.
+.PP
+The implementation of the system call is straightforward: a data structure
+is allocated and populated with the arguments provided. The data structure
+is attached to the current process' \fCstruct proc\fP, its reference count
+set to one and a call to the
+chroot(2) syscall implementation completes the task.
+.PP
+Hooks in the code implementing process creation and destruction maintain
+the reference count on the data structure and free it when the last reference
+is lost.
+Any new process created by a process in a jail will inherit a reference
+to the jail, which effectively puts the new process in the same jail.
+.PP
+There is no way to modify the contents of the data structure describing
+the jail after its creation, and no way to attach a process to an existing
+jail if it was not created from the inside of that jail.
+.NH 2
+Fortification of the chroot(2) facility for filesystem name scoping.
+.PP
+A number of ways to escape the confines of a chroot(2)-created subscope
+of the filesystem view have been identified over the years.
+chroot(2) was never intended to be a security mechanism as such, but even
+then the ftp daemon largely depended on the security provided by
+chroot(2) to provide the ``anonymous ftp'' access method.
+.PP
+Three classes of escape routes existed: recursive chroot(2) escapes,
+``..'' based escapes and fchdir(2) based escapes.
+All of these exploited the fact that chroot(2) didn't try sufficiently
+hard to enforce the new root directory.
+.PP
+New code was added to detect and thwart these escapes, amongst
+other things by tracking the directory of the first level of chroot(2)
+experienced by a process and refusing backwards traversal across
+this directory, as well as additional code to refuse chroot(2) if
+file-descriptors were open referencing directories.
+.NH 2
+Restriction of process visibility and interaction.
+.PP
+A macro was already available in the kernel to determine if one process
+could affect another process. This macro did the rather complex checking
+of uid and gid values. It was felt that the complexity of the macro was
+approaching the lower edge of IOCCC entrance criteria, and it was therefore
+converted to a proper function named \fCp_trespass(p1, p2)\fP which does
+all the previous checks and additionally checks the jail aspect of the access.
+The check is implemented such that access fails if the origin process is jailed
+but the target process is not in the same jail.
+.PP
+Process visibility is provided through two mechanisms in FreeBSD,
+the \fCprocfs\fP file system and a sub-tree of the \fCsysctl\fP tree.
+Both of these were modified to report only the processes in the same
+jail to a jailed process.
+.NH 2
+Restriction to one IP number.
+.PP
+Restricting TCP and UDP access to just one IP number was done almost
+entirely in the code which manages ``protocol control blocks''.
+When a jailed process binds to a socket, the IP number provided by
+the process will not be used, instead the pre-configured IP number of
+the jail is used.
+.PP
+BSD based TCP/IP network stacks sport a special interface, the loop-back
+interface, which has the ``magic'' IP number 127.0.0.1.
+This is often used by processes to contact servers on the local machine,
+and consequently special handling for jails was needed.
+To handle this case it was necessary to also intercept and modify the
+behaviour of connection establishment, and when the 127.0.0.1 address
+was seen from a jailed process, substitute the jail's configured IP number.
+.PP
+Finally the APIs through which the network configuration and connection
+state may be queried were modified to report only information relevant
+to the configured IP number of a jailed process.
+.NH 2
+Adding jail awareness to selected device drivers.
+.PP
+A couple of device drivers needed to be taught about jails, the ``pty''
+driver is one of them. The pty driver provides ``virtual terminals'' to
+services like telnet, ssh, rlogin and X11 terminal window programs.
+Therefore jails need access to the pty driver, and code had to be added
+to enforce that a particular virtual terminal were not accessed from more
+than one jail at the same time.
+.NH 2
+General restriction of super-users powers for jailed super-users.
+.PP
+This item proved to be the simplest but most tedious to implement.
+Tedious because a manual review of all places where the kernel allowed
+the super user special powers was called for,
+simple because very few places were required to let a jailed root through.
+Of the approximately 260 checks in the FreeBSD 4.0 kernel, only
+about 35 will let a jailed root through.
+.PP
+Since the default is for jailed roots to not receive privilege, new
+code or drivers in the FreeBSD kernel are automatically jail-aware: they
+will refuse jailed roots privilege.
+The other part of this protection comes from the fact that a jailed
+root cannot create new device nodes with the mknod(2) system call, so
+unless the machine administrator creates device nodes for a particular
+device inside the jail's filesystem tree, the driver in effect does
+not exist in the jail.
+.PP
+As a side-effect of this work the suser(9) API was cleaned up and
+extended to cater for not only the jail facility, but also to make room
+for future partitioning facilities.
+.NH 2
+Implementation statistics
+.PP
+The change of the suser(9) API modified approx 350 source lines
+distributed over approx. 100 source files. The vast majority of
+these changes were generated automatically with a script.
+.PP
+The implementation of the jail facility added approx 200 lines of
+code in total, distributed over approx. 50 files, and about 200 lines
+in two new kernel files.
diff --git a/share/doc/papers/jail/jail01.eps b/share/doc/papers/jail/jail01.eps
new file mode 100644
index 000000000000..ffcfa30386f1
--- /dev/null
+++ b/share/doc/papers/jail/jail01.eps
@@ -0,0 +1,234 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: jail01.eps
+%%Creator: fig2dev Version 3.2 Patchlevel 1
+%%CreationDate: Fri Mar 24 20:37:59 2000
+%%For: $FreeBSD$
+%%Orientation: Portrait
+%%BoundingBox: 0 0 425 250
+%%Pages: 0
+%%BeginSetup
+%%EndSetup
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+-117.0 298.0 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+%%EndProlog
+
+$F2psBegin
+10 setmiterlimit
+n -1000 5962 m -1000 -1000 l 10022 -1000 l 10022 5962 l cp clip
+ 0.06000 0.06000 sc
+/Courier-BoldOblique ff 180.00 scf sf
+7725 3600 m
+gs 1 -1 sc (10.0.0.2) dup sw pop neg 0 rm col0 sh gr
+% Polyline
+15.000 slw
+n 9000 3300 m 9000 4275 l gs col0 s gr
+% Polyline
+2 slc
+n 7875 3225 m 7800 3225 l gs col0 s gr
+% Polyline
+0 slc
+n 7875 4125 m 7800 4125 l gs col0 s gr
+% Polyline
+n 7875 3225 m 7875 4425 l gs col0 s gr
+% Polyline
+n 7875 3825 m 7800 3825 l gs col0 s gr
+% Polyline
+n 7875 3525 m 7800 3525 l gs col0 s gr
+% Polyline
+n 8175 3825 m 7875 3825 l gs col0 s gr
+% Polyline
+2 slc
+n 7875 4425 m 7800 4425 l gs col0 s gr
+/Courier-Bold ff 180.00 scf sf
+8700 3900 m
+gs 1 -1 sc (fxp0) dup sw pop neg 0 rm col0 sh gr
+% Polyline
+0 slc
+7.500 slw
+n 2925 1425 m 3075 1425 l gs col0 s gr
+% Polyline
+15.000 slw
+n 2475 1350 m 2472 1347 l 2465 1342 l 2453 1334 l 2438 1323 l 2420 1311 l
+ 2401 1299 l 2383 1289 l 2366 1281 l 2351 1275 l 2338 1274 l
+ 2325 1275 l 2314 1279 l 2303 1285 l 2291 1293 l 2278 1303 l
+ 2264 1314 l 2250 1326 l 2236 1339 l 2222 1353 l 2209 1366 l
+ 2198 1379 l 2188 1391 l 2181 1403 l 2177 1414 l 2175 1425 l
+ 2177 1436 l 2181 1447 l 2188 1459 l 2198 1471 l 2209 1484 l
+ 2222 1497 l 2236 1511 l 2250 1524 l 2264 1536 l 2278 1547 l
+ 2291 1557 l 2303 1565 l 2314 1571 l 2325 1575 l 2338 1576 l
+ 2351 1575 l 2366 1569 l 2383 1561 l 2401 1551 l 2420 1539 l
+ 2438 1527 l 2453 1516 l 2465 1508 l 2472 1503 l 2475 1500 l gs col0 s gr
+/Courier-Bold ff 180.00 scf sf
+2550 1500 m
+gs 1 -1 sc (lo0) col0 sh gr
+/Courier-BoldOblique ff 180.00 scf sf
+3075 1500 m
+gs 1 -1 sc (127.0.0.1) col0 sh gr
+% Polyline
+7.500 slw
+n 2100 3525 m 2250 3525 l gs col0 s gr
+% Polyline
+n 2550 2100 m 2250 2400 l 2250 4500 l 2550 4800 l gs col0 s gr
+/Courier-Bold ff 180.00 scf sf
+1950 3600 m
+gs 1 -1 sc (/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+2550 3900 m
+gs 1 -1 sc (jail_1/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+2550 4200 m
+gs 1 -1 sc (jail_2/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+2550 4500 m
+gs 1 -1 sc (jail_3/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+2550 2400 m
+gs 1 -1 sc (dev/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+2550 2700 m
+gs 1 -1 sc (etc/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+2550 3000 m
+gs 1 -1 sc (usr/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+2550 3300 m
+gs 1 -1 sc (var/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+2550 3600 m
+gs 1 -1 sc (home/) col0 sh gr
+% Polyline
+n 3375 3825 m 3900 3825 l 4950 1800 l 5100 1800 l gs col0 s gr
+% Polyline
+n 3375 4125 m 3900 4125 l 4950 3900 l 5100 3900 l gs col0 s gr
+% Polyline
+n 5400 900 m 5100 1200 l 5100 2400 l 5400 2700 l gs col0 s gr
+% Polyline
+n 5400 3000 m 5100 3300 l 5100 4500 l 5400 4800 l gs col0 s gr
+% Polyline
+n 4650 825 m 4650 2775 l 6675 2775 l 6675 3375 l 7950 3375 l 7950 825 l
+ cp gs col0 s gr
+% Polyline
+n 4650 2775 m 4650 4950 l 6300 4950 l 6300 3675 l 7950 3675 l 7950 3375 l
+ 6675 3375 l 6675 2775 l cp gs col0 s gr
+/Courier-Bold ff 180.00 scf sf
+5400 1200 m
+gs 1 -1 sc (dev/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 1500 m
+gs 1 -1 sc (etc/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 1800 m
+gs 1 -1 sc (usr/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 2100 m
+gs 1 -1 sc (var/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 2400 m
+gs 1 -1 sc (home/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 3300 m
+gs 1 -1 sc (dev/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 3600 m
+gs 1 -1 sc (etc/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 3900 m
+gs 1 -1 sc (usr/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 4200 m
+gs 1 -1 sc (var/) col0 sh gr
+/Courier-Bold ff 180.00 scf sf
+5400 4500 m
+gs 1 -1 sc (home/) col0 sh gr
+/Courier-BoldOblique ff 180.00 scf sf
+7725 3300 m
+gs 1 -1 sc (10.0.0.1) dup sw pop neg 0 rm col0 sh gr
+/Courier-BoldOblique ff 180.00 scf sf
+7725 4500 m
+gs 1 -1 sc (10.0.0.5) dup sw pop neg 0 rm col0 sh gr
+/Courier-BoldOblique ff 180.00 scf sf
+7725 4200 m
+gs 1 -1 sc (10.0.0.4) dup sw pop neg 0 rm col0 sh gr
+/Courier-BoldOblique ff 180.00 scf sf
+7725 3900 m
+gs 1 -1 sc (10.0.0.3) dup sw pop neg 0 rm col0 sh gr
+% Polyline
+15.000 slw
+n 9000 3825 m 8775 3825 l gs col0 s gr
+$F2psEnd
+rs
diff --git a/share/doc/papers/jail/jail01.fig b/share/doc/papers/jail/jail01.fig
new file mode 100644
index 000000000000..d1cfd5c177cc
--- /dev/null
+++ b/share/doc/papers/jail/jail01.fig
@@ -0,0 +1,85 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+A4
+100.00
+Single
+-2
+1200 2
+6 7725 3150 9075 4500
+6 8700 3225 9075 4350
+2 1 0 2 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 9000 3825 8775 3825
+2 1 0 2 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 9000 3300 9000 4275
+-6
+2 1 0 2 0 7 100 0 -1 0.000 0 2 -1 0 0 2
+ 7875 3225 7800 3225
+2 1 0 2 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 7875 4125 7800 4125
+2 1 0 2 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 7875 3225 7875 4425
+2 1 0 2 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 7875 3825 7800 3825
+2 1 0 2 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 7875 3525 7800 3525
+2 1 0 2 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 8175 3825 7875 3825
+2 1 0 2 0 7 100 0 -1 0.000 0 2 -1 0 0 2
+ 7875 4425 7800 4425
+4 2 0 100 0 14 12 0.0000 4 180 420 8700 3900 fxp0\001
+-6
+6 2100 1200 4050 1650
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2925 1425 3075 1425
+3 2 0 2 0 7 100 0 -1 0.000 0 0 0 5
+ 2475 1350 2325 1275 2175 1425 2325 1575 2475 1500
+ 0.000 -1.000 -1.000 -1.000 0.000
+4 0 0 100 0 14 12 0.0000 4 135 315 2550 1500 lo0\001
+4 0 0 100 0 15 12 0.0000 4 135 945 3075 1500 127.0.0.1\001
+-6
+6 1950 2100 3300 4800
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2100 3525 2250 3525
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 4
+ 2550 2100 2250 2400 2250 4500 2550 4800
+4 0 0 100 0 14 12 0.0000 4 150 105 1950 3600 /\001
+4 0 0 100 0 14 12 0.0000 4 180 735 2550 3900 jail_1/\001
+4 0 0 100 0 14 12 0.0000 4 180 735 2550 4200 jail_2/\001
+4 0 0 100 0 14 12 0.0000 4 180 735 2550 4500 jail_3/\001
+4 0 0 100 0 14 12 0.0000 4 165 420 2550 2400 dev/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 2550 2700 etc/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 2550 3000 usr/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 2550 3300 var/\001
+4 0 0 100 0 14 12 0.0000 4 165 525 2550 3600 home/\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 4
+ 3375 3825 3900 3825 4950 1800 5100 1800
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 4
+ 3375 4125 3900 4125 4950 3900 5100 3900
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 4
+ 5400 900 5100 1200 5100 2400 5400 2700
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 4
+ 5400 3000 5100 3300 5100 4500 5400 4800
+2 3 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 7
+ 4650 825 4650 2775 6675 2775 6675 3375 7950 3375 7950 825
+ 4650 825
+2 3 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 9
+ 4650 2775 4650 4950 6300 4950 6300 3675 7950 3675 7950 3375
+ 6675 3375 6675 2775 4650 2775
+4 0 0 100 0 14 12 0.0000 4 165 420 5400 1200 dev/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 5400 1500 etc/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 5400 1800 usr/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 5400 2100 var/\001
+4 0 0 100 0 14 12 0.0000 4 165 525 5400 2400 home/\001
+4 0 0 100 0 14 12 0.0000 4 165 420 5400 3300 dev/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 5400 3600 etc/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 5400 3900 usr/\001
+4 0 0 100 0 14 12 0.0000 4 150 420 5400 4200 var/\001
+4 0 0 100 0 14 12 0.0000 4 165 525 5400 4500 home/\001
+4 2 0 100 0 15 12 0.0000 4 135 840 7725 3300 10.0.0.1\001
+4 2 0 100 0 15 12 0.0000 4 135 840 7725 4500 10.0.0.5\001
+4 2 0 100 0 15 12 0.0000 4 135 840 7725 4200 10.0.0.4\001
+4 2 0 100 0 15 12 0.0000 4 135 840 7725 3900 10.0.0.3\001
+4 2 0 100 0 15 12 0.0000 4 135 840 7725 3600 10.0.0.2\001
diff --git a/share/doc/papers/jail/mgt.ms b/share/doc/papers/jail/mgt.ms
new file mode 100644
index 000000000000..5b81af7d1375
--- /dev/null
+++ b/share/doc/papers/jail/mgt.ms
@@ -0,0 +1,214 @@
+.\"
+.NH
+Managing Jails and the Jail File System Environment
+.NH 2
+Creating a Jail Environment
+.PP
+While the jail(2) call could be used in a number of ways, the expected
+configuration creates a complete FreeBSD installation for each jail.
+This includes copies of all relevant system binaries, data files, and its
+own \fC/etc\fP directory.
+Such a configuration maximises the independence of various jails,
+and reduces the chances of interference between jails,
+especially when it is desirable to provide root access within a jail to
+a less trusted user.
+.PP
+On a box making use of the jail facility, we refer to two types of
+environment: the host environment, and the jail environment.
+The host environment is the real operating system environment, which is
+used to configure interfaces, and start up the jails.
+There are then one or more jail environments, effectively virtual
+FreeBSD machines.
+When configuring Jail for use, it is necessary to configure both the
+host and jail environments to prevent overlap.
+.PP
+As jailed virtual machines are generally bound to an IP address configured
+using the normal IP alias mechanism, those jail IP addresses are also
+accessible to host environment applications to use.
+If the accessibility of some host applications in the jail environment is
+not desirable, it is necessary to configure those applications to only
+listen on appropriate addresses.
+.PP
+In most of the production environments where jail is currently in use,
+one IP address is allocated to the host environment, and then a number
+are allocated to jail boxes, with each jail box receiving a unique IP.
+In this situation, it is sufficient to configure the networking applications
+on the host to listen only on the host IP.
+Generally, this consists of specifying the appropriate IP address to be
+used by inetd and SSH, and disabling applications that are not capable
+of limiting their address scope, such as sendmail, the port mapper, and
+syslogd.
+Other third party applications that have been installed on the host must also be
+configured in this manner, or users connecting to the jailbox will
+discover the host environment service, unless the jailbox has
+specifically bound a service to that port.
+In some situations, this can actually be the desirable behaviour.
+.PP
+The jail environments must also be custom-configured.
+This consists of building and installing a miniature version of the
+FreeBSD file system tree off of a subdirectory in the host environment,
+usually \fC/usr/jail\fP, or \fC/data/jail\fP, with a subdirectory per jail.
+Appropriate instructions for generating this tree are included in the
+jail(8) man page, but generally this process may be automated using the
+FreeBSD build environment.
+.PP
+One notable difference from the default FreeBSD install is that only
+a limited set of device nodes should be created.
+.PP
+To improve storage efficiency, a fair number of the binaries in the system tree
+may be deleted, as they are not relevant in a jail environment.
+This includes the kernel, boot loader, and related files, as well as
+hardware and network configuration tools.
+.PP
+After the creation of the jail tree, the easiest way to configure it is
+to start up the jail in single-user mode.
+The sysinstall admin tool may be used to help with the task, although
+it is not installed by default as part of the system tree.
+These tools should be run in the jail environment, or they will affect
+the host environment's configuration.
+.DS
+.ft C
+.ps -2
+# mkdir /data/jail/192.168.11.100/stand
+# cp /stand/sysinstall /data/jail/192.168.11.100/stand
+# jail /data/jail/192.168.11.100 testhostname 192.168.11.100 \e
+ /bin/sh
+.ps +2
+.R
+.DE
+.PP
+After running the jail command, the shell is now within the jail environment,
+and all further commands
+will be limited to the scope of the jail until the shell exits.
+If the network alias has not yet been configured, then the jail will be
+unable to access the network.
+.PP
+The startup configuration of the jail environment may be configured so
+as to quell warnings from services that cannot run in the jail.
+Also, any per-system configuration required for a normal FreeBSD system
+is also required for each jailbox.
+Typically, this includes:
+.IP "" 5n
+\(bu Create empty /etc/fstab
+.IP
+\(bu Disable portmapper
+.IP
+\(bu Run newaliases
+.IP
+\(bu Disabling interface configuration
+.IP
+\(bu Configure the resolver
+.IP
+\(bu Set root password
+.IP
+\(bu Set timezone
+.IP
+\(bu Add any local accounts
+.IP
+\(bu Install any packages
+.NH 2
+Starting Jails
+.PP
+Jails are typically started by executing their /etc/rc script in much
+the same manner a shell was started in the previous section.
+Before starting the jail, any relevant networking configuration
+should also be performed.
+Typically, this involves adding an additional IP address to the
+appropriate network interface, setting network properties for the
+IP address using IP filtering, forwarding, and bandwidth shaping,
+and mounting a process file system for the jail, if the ability to
+debug processes from within the jail is desired.
+.DS
+.ft C
+.ps -2
+# ifconfig ed0 inet add 192.168.11.100 netmask 255.255.255.255
+# mount -t procfs proc /data/jail/192.168.11.100/proc
+# jail /data/jail/192.168.11.100 testhostname 192.168.11.100 \e
+ /bin/sh /etc/rc
+.ps +2
+.ft P
+.DE
+.PP
+A few warnings are generated for sysctl's that are not permitted
+to be set within the jail, but the end result is a set of processes
+in an isolated process environment, bound to a single IP address.
+Normal procedures for accessing a FreeBSD machine apply: telneting in
+through the network reveals a telnet prompt, login, and shell.
+.DS
+.ft C
+.ps -2
+% ps ax
+ PID TT STAT TIME COMMAND
+ 228 ?? SsJ 0:18.73 syslogd
+ 247 ?? IsJ 0:00.05 inetd -wW
+ 249 ?? IsJ 0:28.43 cron
+ 252 ?? SsJ 0:30.46 sendmail: accepting connections on port 25
+ 291 ?? IsJ 0:38.53 /usr/local/sbin/sshd
+93694 ?? SJ 0:01.01 sshd: rwatson@ttyp0 (sshd)
+93695 p0 SsJ 0:00.06 -csh (csh)
+93700 p0 R+J 0:00.00 ps ax
+.ps +2
+.ft P
+.DE
+.PP
+It is immediately obvious that the environment is within a jailbox: there
+is no init process, no kernel daemons, and a J flag is present beside all
+processes indicating the presence of a jail.
+.PP
+As with any FreeBSD system, accounts may be created and deleted,
+mail is delivered, logs are generated, packages may be added, and the
+system may be hacked into if configured incorrectly, or running a buggy
+version of a piece of software.
+However, all of this happens strictly within the scope of the jail.
+.NH 2
+Jail Management
+.PP
+Jail management is an interesting prospect, as there are two perspectives
+from which a jail environment may be administered: from within the jail,
+and from the host environment.
+From within the jail, as described above, the process is remarkably similar
+to any regular FreeBSD install, although certain actions are prohibited,
+such as mounting file systems, modifying system kernel properties, etc.
+The only area that really differs is that of shutting
+the system down: the processes within the jail may deliver signals
+between them, allowing all processes to be killed, but bringing the
+system back up requires intervention from outside of the jailbox.
+.PP
+From outside of the jail, there are a range of capabilities, as well
+as limitations.
+The jail environment is, in effect, a subset of the host environment:
+the jail file system appears as part of the host file system, and may
+be directly modified by processes in the host environment.
+Processes within the jail appear in the process listing of the host,
+and may likewise be signalled or debugged.
+The host process file system makes the hostname of the jail environment
+accessible in /proc/procnum/status, allowing utilities in the host
+environment to manage processes based on jailname.
+However, the default configuration allows privileged processes within
+jails to set the hostname of the jail, which makes the status file less
+useful from a management perspective if the contents of the jail are
+malicious.
+To prevent a jail from changing its hostname, the
+"security.jail.set_hostname_allowed" sysctl may be set to 0 prior to
+starting any jails.
+.PP
+One aspect immediately observable in an environment with multiple jails
+is that uids and gids are local to each jail environment: the uid associated
+with a process in one jail may be for a different user than in another
+jail.
+This collision of identifiers is only visible in the host environment,
+as normally processes from one jail are never visible in an environment
+with another scope for user/uid and group/gid mapping.
+Managers in the host environment should understand these scoping issues,
+or confusion and unintended consequences may result.
+.PP
+Jailed processes are subject to the normal restrictions present for
+any processes, including resource limits, and limits placed by the network
+code, including firewall rules.
+By specifying firewall rules for the IP address bound to a jail, it is
+possible to place connectivity and bandwidth limitations on individual
+jails, restricting services that may be consumed or offered.
+.PP
+Management of jails is an area that will see further improvement in
+future versions of FreeBSD. Some of these potential improvements are
+discussed later in this paper.
diff --git a/share/doc/papers/jail/paper.ms b/share/doc/papers/jail/paper.ms
new file mode 100644
index 000000000000..fca37fcac7b7
--- /dev/null
+++ b/share/doc/papers/jail/paper.ms
@@ -0,0 +1,436 @@
+.\"
+.if n .ftr C R
+.ig TL
+.ds CH "
+.nr PI 2n
+.nr PS 12
+.nr LL 15c
+.nr PO 3c
+.nr FM 3.5c
+.po 3c
+.TL
+Jails: Confining the omnipotent root.
+.FS
+This paper was presented at the 2nd International System Administration and Networking Conference "SANE 2000" May 22-25, 2000 in Maastricht, The Netherlands and is published in the proceedings.
+.FE
+.AU
+Poul-Henning Kamp <phk@FreeBSD.org>
+.AU
+Robert N. M. Watson <rwatson@FreeBSD.org>
+.AI
+The FreeBSD Project
+.FS
+This work was sponsored by \fChttp://www.servetheweb.com/\fP and
+donated to the FreeBSD Project for inclusion in the FreeBSD
+OS. FreeBSD 4.0-RELEASE was the first release including this
+code.
+Follow-on work was sponsored by Safeport Network Services,
+\fChttp://www.safeport.com/\fP
+.FE
+.AB
+The traditional UNIX security model is simple but inexpressive.
+Adding fine-grained access control improves the expressiveness,
+but often dramatically increases both the cost of system management
+and implementation complexity.
+In environments with a more complex management model, with delegation
+of some management functions to parties under varying degrees of trust,
+the base UNIX model and most natural
+extensions are inappropriate at best.
+Where multiple mutually un-trusting parties are introduced,
+``inappropriate'' rapidly transitions to ``nightmarish'', especially
+with regards to data integrity and privacy protection.
+.PP
+The FreeBSD ``Jail'' facility provides the ability to partition
+the operating system environment, while maintaining the simplicity
+of the UNIX ``root'' model.
+In Jail, users with privilege find that the scope of their requests
+is limited to the jail, allowing system administrators to delegate
+management capabilities for each virtual machine
+environment.
+Creating virtual machines in this manner has many potential uses; the
+most popular thus far has been for providing virtual machine services
+in Internet Service Provider environments.
+.AE
+.NH
+Introduction
+.PP
+The UNIX access control mechanism is designed for an environment with two
+types of users: those with, and without administrative privilege.
+Within this framework, every attempt is made to provide an open
+system, allowing easy sharing of files and inter-process communication.
+As a member of the UNIX family, FreeBSD inherits these
+security properties.
+Users of FreeBSD in non-traditional UNIX environments must balance
+their need for strong application support, high network performance
+and functionality, and low total cost of ownership with the need
+for alternative security models that are difficult or impossible to
+implement with the UNIX security mechanisms.
+.PP
+One such consideration is the desire to delegate some (but not all)
+administrative functions to untrusted or less trusted parties, and
+simultaneously impose system-wide mandatory policies on process
+interaction and sharing.
+Attempting to create such an environment in the current-day FreeBSD
+security environment is both difficult and costly: in many cases,
+the burden of implementing these policies falls on user
+applications, which means an increase in the size and complexity
+of the code base, in turn translating to higher development
+and maintenance cost, as well as less overall flexibility.
+.PP
+This abstract risk becomes more clear when applied to a practical,
+real-world example:
+many web service providers turn to the FreeBSD
+operating system to host customer web sites, as it provides a
+high-performance, network-centric server environment.
+However, these providers have a number of concerns on their plate, both in
+terms of protecting the integrity and confidentiality of their own
+files and services from their customers, as well as protecting the files
+and services of one customer from (accidental or
+intentional) access by any other customer.
+At the same time, a provider would like to provide
+substantial autonomy to customers, allowing them to install and
+maintain their own software, and to manage their own services,
+such as web servers and other content-related daemon programs.
+.PP
+This problem space points strongly in the direction of a partitioning
+solution, in which customer processes and storage are isolated from those of
+other customers, both in terms of accidental disclosure of data or process
+information, but also in terms of the ability to modify files or processes
+outside of a compartment.
+Delegation of management functions within the system must
+be possible, but not at the cost of system-wide requirements, including
+integrity and privacy protection between partitions.
+.PP
+However, UNIX-style access control makes it notoriously difficult to
+compartmentalise functionality.
+While mechanisms such as chroot(2) provide a modest
+level of compartmentalisation, it is well known
+that these mechanisms have serious shortcomings, both in terms of the
+scope of their functionality, and effectiveness at what they provide \s-2[CHROOT]\s+2.
+.PP
+In the case of the chroot(2) call, a process's visibility of
+the file system name-space is limited to a single subtree.
+However, the compartmentalisation does not extend to the process
+or networking spaces and therefore both observation of and interference
+with processes outside their compartment is possible.
+.PP
+To this end, we describe the new FreeBSD ``Jail'' facility, which
+provides a strong partitioning solution, leveraging existing
+mechanisms, such as chroot(2), to what effectively amounts to a
+virtual machine environment. Processes in a jail are provided
+full access to the files that they may manipulate, processes they
+may influence, and network services they can make use of, and neither
+access nor visibility of files, processes or network services outside
+their partition.
+.PP
+Unlike other fine-grained security solutions, Jail does not
+substantially increase the policy management requirements for the
+system administrator, as each Jail is a virtual FreeBSD environment
+permitting local policy to be independently managed, with much the
+same properties as the main system itself, making Jail easy to use
+for the administrator, and far more compatible with applications.
+.NH
+Traditional UNIX Security, or, ``God, root, what difference?" \s-2[UF]\s+2.
+.PP
+The traditional UNIX access model assigns numeric uids to each user of the
+system. In turn, each process ``owned'' by a user will be tagged with that
+user's uid in an unforgeable manner. The uids serve two purposes: first,
+they determine how discretionary access control mechanisms will be applied, and
+second, they are used to determine whether special privileges are accorded.
+.PP
+In the case of discretionary access controls, the primary object protected is
+a file. The uid (and related gids indicating group membership) are mapped to
+a set of rights for each object, courtesy the UNIX file mode, in effect acting
+as a limited form of access control list. Jail is, in general, not concerned
+with modifying the semantics of discretionary access control mechanisms,
+although there are important implications from a management perspective.
+.PP
+For the purposes of determining whether special privileges are accorded to a
+process, the check is simple: ``is the numeric uid equal to 0 ?''.
+If so, the
+process is acting with ``super-user privileges'', and all access checks are
+granted, in effect allowing the process the ability to do whatever it wants
+to \**.
+.FS
+\&... no matter how patently stupid it may be.
+.FE
+.PP
+For the purposes of human convenience, uid 0 is canonically allocated
+to the ``root'' user \s-2[ROOT]\s+2.
+For the purposes of jail, this behaviour is extremely relevant: many of
+these privileged operations can be used to manage system hardware and
+configuration, file system name-space, and special network operations.
+.PP
+Many limitations to this model are immediately clear: the root user is a
+single, concentrated source of privilege that is exposed to many pieces of
+software, and as such an immediate target for attacks. In the event of a
+compromise of the root capability set, the attacker has complete control over
+the system. Even without an attacker, the risks of a single administrative
+account are serious: delegating a narrow scope of capability to an
+inexperienced administrator is difficult, as the granularity of delegation is
+that of all system management abilities. These features make the omnipotent
+root account a sharp, efficient and extremely dangerous tool.
+.PP
+The BSD family of operating systems have implemented the ``securelevel''
+mechanism which allows the administrator to block certain configuration
+and management functions from being performed by root,
+until the system is restarted and brought up into single-user mode.
+While this does provide some amount of protection in the case of a root
+compromise of the machine, it does nothing to address the need for
+delegation of certain root abilities.
+.NH
+Other Solutions to the Root Problem
+.PP
+Many operating systems attempt to address these limitations by providing
+fine-grained access controls for system resources \s-2[BIBA]\s+2.
+These efforts vary in
+degrees of success, but almost all suffer from at least three serious
+limitations:
+.PP
+First, increasing the granularity of security controls increases the
+complexity of the administration process, in turn increasing both the
+opportunity for incorrect configuration, as well as the demand on
+administrator time and resources. In many cases, the increased complexity
+results in significant frustration for the administrator, which may result
+in two
+disastrous types of policy: ``all doors open as it's too much trouble'', and
+``trust that the system is secure, when in fact it isn't''.
+.PP
+The extent of the trouble is best illustrated by the fact that an entire
+niche industry has emerged providing tools to manage fine grained security
+controls \s-2[UAS]\s+2.
+.PP
+Second, usefully segregating capabilities and assigning them to running code
+and users is very difficult. Many privileged operations in UNIX seem
+independent, but are in fact closely related, and the handing out of one
+privilege may, in effect, be transitive to the many others. For example, in
+some trusted operating systems, a system capability may be assigned to a
+running process to allow it to read any file, for the purposes of backup.
+However, this capability is, in effect, equivalent to the ability to switch to
+any other account, as the ability to access any file provides access to system
+keying material, which in turn provides the ability to authenticate as any
+user. Similarly, many operating systems attempt to segregate management
+capabilities from auditing capabilities. In a number of these operating
+systems, however, ``management capabilities'' permit the administrator to
+assign ``auditing capabilities'' to itself, or another account, circumventing
+the segregation of capability.
+.PP
+Finally, introducing new security features often involves introducing new
+security management APIs. When fine-grained capabilities are introduced to
+replace the setuid mechanism in UNIX-like operating systems, applications that
+previously did an ``appropriateness check'' to see if they were running as
+root before executing must now be changed to know that they need not run as
+root. In the case of applications running with privilege and executing other
+programs, there is now a new set of privileges that must be voluntarily given
+up before executing another program. These changes can introduce significant
+incompatibility for existing applications, and make life more difficult for
+application developers who may not be aware of differing security semantics on
+different systems \s-2[POSIX1e]\s+2.
+.NH
+The Jail Partitioning Solution
+.PP
+Jail neatly side-steps the majority of these problems through partitioning.
+Rather
+than introduce additional fine-grained access control mechanism, we partition
+a FreeBSD environment (processes, file system, network resources) into a
+management environment, and optionally subset Jail environments. In doing so,
+we simultaneously maintain the existing UNIX security model, allowing
+multiple users and a privileged root user in each jail, while
+limiting the scope of root's activities to his jail.
+Consequently the administrator of a
+FreeBSD machine can partition the machine into separate jails, and provide
+access to the super-user account in each of these without losing control of
+the over-all environment.
+.PP
+A process in a partition is referred to as ``in jail''. When a FreeBSD
+system is booted up after a fresh install, no processes will be in jail.
+When
+a process is placed in a jail, it, and any descendants of the process created
+after the jail creation, will be in that jail. A process may be in only one
+jail, and after creation, it can not leave the jail.
+Jails are created when a
+privileged process calls the jail(2) syscall, with a description of the jail as an
+argument to the call. Each call to jail(2) creates a new jail; the only way
+for a new process to enter the jail is by inheriting access to the jail from
+another process already in that jail.
+Processes may never
+leave the jail they created, or were created in.
+.KF
+.if t .PSPIC jail01.eps 4i
+.ce 1
+Fig. 1 \(em Schematic diagram of machine with two configured jails
+.sp
+.KE
+.PP
+Membership in a jail involves a number of restrictions: access to the file
+name-space is restricted in the style of chroot(2), the ability to bind network
+resources is limited to a specific IP address, the ability to manipulate
+system resources and perform privileged operations is sharply curtailed, and
+the ability to interact with other processes is limited to only processes
+inside the same jail.
+.PP
+Jail takes advantage of the existing chroot(2) behaviour to limit access to the
+file system name-space for jailed processes. When a jail is created, it is
+bound to a particular file system root.
+Processes are unable to manipulate files that they cannot address,
+and as such the integrity and confidentiality of files outside of the jail
+file system root are protected. Traditional mechanisms for breaking out of
+chroot(2) have been blocked.
+In the expected and documented configuration, each jail is provided
+with its exclusive file system root, and standard FreeBSD directory layout,
+but this is not mandated by the implementation.
+.PP
+Each jail is bound to a single IP address: processes within the jail may not
+make use of any other IP address for outgoing or incoming connections; this
+includes the ability to restrict what network services a particular jail may
+offer. As FreeBSD distinguishes attempts to bind all IP addresses from
+attempts to bind a particular address, bind requests for all IP addresses are
+redirected to the individual Jail address. Some network functionality
+associated with privileged calls is wholesale disabled due to the nature of the
+functionality offered, in particular facilities which would allow ``spoofing''
+of IP numbers or disruptive traffic to be generated have been disabled.
+.PP
+Processes running without root privileges will notice few, if any differences
+between a jailed environment or un-jailed environment. Processes running with
+root privileges will find that many restrictions apply to the privileged calls
+they may make. Some calls will now return an access error \(em for example, an
+attempt to create a device node will now fail. Others will have a more
+limited scope than normal \(em attempts to bind a reserved port number on all
+available addresses will result in binding only the address associated with
+the jail. Other calls will succeed as normal: root may read a file owned by
+any uid, as long as it is accessible through the jail file system name-space.
+.PP
+Processes within the jail will find that they are unable to interact or
+even verify the existence of
+processes outside the jail \(em processes within the jail are
+prevented from delivering signals to processes outside the jail, as well as
+connecting to those processes with debuggers, or even seeing them in the
+sysctl or process file system monitoring mechanisms. Jail does not prevent,
+nor is it intended to prevent, the use of covert channels or communications
+mechanisms via accepted interfaces \(em for example, two processes may communicate
+via sockets over the IP network interface. Nor does it attempt to provide
+scheduling services based on the partition; however, it does prevent calls
+that interfere with normal process operation.
+.PP
+As a result of these attempts to retain the standard FreeBSD API and
+framework, almost all applications will run unaffected. Standard system
+services such as Telnet, FTP, and SSH all behave normally, as do most third
+party applications, including the popular Apache web server.
+.NH
+Jail Implementation
+.PP
+Processes running with root privileges in the jail find that there are serious
+restrictions on what it is capable of doing \(em in particular, activities that
+would extend outside of the jail:
+.IP "" 5n
+\(bu Modifying the running kernel by direct access and loading kernel
+modules is prohibited.
+.IP
+\(bu Modifying any of the network configuration, interfaces, addresses, and
+routing table is prohibited.
+.IP
+\(bu Mounting and unmounting file systems is prohibited.
+.IP
+\(bu Creating device nodes is prohibited.
+.IP
+\(bu Accessing raw, divert, or routing sockets is prohibited.
+.IP
+\(bu Modifying kernel runtime parameters, such as most sysctl settings, is
+prohibited.
+.IP
+\(bu Changing securelevel-related file flags is prohibited.
+.IP
+\(bu Accessing network resources not associated with the jail is prohibited.
+.PP
+Other privileged activities are permitted as long as they are limited to the
+scope of the jail:
+.IP "" 5n
+\(bu Signalling any process within the jail is permitted.
+.IP
+\(bu Changing the ownership and mode of any file within the jail is permitted, as
+long as the file flags permit this.
+.IP
+\(bu Deleting any file within the jail is permitted, as long as the file flags
+permit this.
+.IP
+\(bu Binding reserved TCP and UDP port numbers on the jail's IP address is
+permitted. (Attempts to bind TCP and UDP ports using INADDR_ANY will be
+redirected to the jail's IP address.)
+.IP
+\(bu Functions which operate on the uid/gid space are all permitted since they
+act as labels for filesystem objects or processes
+which are partitioned off by other mechanisms.
+.PP
+These restrictions on root access limit the scope of root processes, enabling
+most applications to run un-hindered, but preventing calls that might allow an
+application to reach beyond the jail and influence other processes or
+system-wide configuration.
+.PP
+.so implementation.ms
+.so mgt.ms
+.so future.ms
+.NH
+Conclusion
+.PP
+The jail facility provides FreeBSD with a conceptually simple security
+partitioning mechanism, allowing the delegation of administrative rights
+within virtual machine partitions.
+.PP
+The implementation relies on
+restricting access within the jail environment to a well-defined subset
+of the overall host environment. This includes limiting interaction
+between processes, and to files, network resources, and privileged
+operations. Administrative overhead is reduced through avoiding
+fine-grained access control mechanisms, and maintaining a consistent
+administrative interface across partitions and the host environment.
+.PP
+The jail facility has already seen widespread deployment in particular as
+a vehicle for delivering "virtual private server" services.
+.PP
+The jail code is included in the base system as part of FreeBSD 4.0-RELEASE,
+and fully documented in the jail(2) and jail(8) man-pages.
+.bp
+.SH
+Notes & References
+.IP \s-2[BIBA]\s+2 .5i
+K. J. Biba, Integrity Considerations for Secure
+Computer Systems, USAF Electronic Systems Division, 1977
+.IP \s-2[CHROOT]\s+2 .5i
+Dr. Marshall Kirk Mckusick, private communication:
+``According to the SCCS logs, the chroot call was added by Bill Joy
+on March 18, 1982 approximately 1.5 years before 4.2BSD was released.
+That was well before we had ftp servers of any sort (ftp did not
+show up in the source tree until January 1983). My best guess as
+to its purpose was to allow Bill to chroot into the /4.2BSD build
+directory and build a system using only the files, include files,
+etc contained in that tree. That was the only use of chroot that
+I remember from the early days.''
+.IP \s-2[LOTTERY1]\s+2 .5i
+David Petrou and John Milford. Proportional-Share Scheduling:
+Implementation and Evaluation in a Widely-Deployed Operating System,
+December 1997.
+.nf
+\s-2\fChttp://www.cs.cmu.edu/~dpetrou/papers/freebsd_lottery_writeup98.ps\fP\s+2
+\s-2\fChttp://www.cs.cmu.edu/~dpetrou/code/freebsd_lottery_code.tar.gz\fP\s+2
+.IP \s-2[LOTTERY2]\s+2 .5i
+Carl A. Waldspurger and William E. Weihl. Lottery Scheduling: Flexible Proportional-Share Resource Management, Proceedings of the First Symposium on Operating Systems Design and Implementation (OSDI '94), pages 1-11, Monterey, California, November 1994.
+.nf
+\s-2\fChttp://www.research.digital.com/SRC/personal/caw/papers.html\fP\s+2
+.IP \s-2[POSIX1e]\s+2 .5i
+Draft Standard for Information Technology \(em
+Portable Operating System Interface (POSIX) \(em
+Part 1: System Application Program Interface (API) \(em Amendment:
+Protection, Audit and Control Interfaces [C Language]
+IEEE Std 1003.1e Draft 17 Editor Casey Schaufler
+.IP \s-2[ROOT]\s+2 .5i
+Historically other names have been used at times, Zilog for instance
+called the super-user account ``zeus''.
+.IP \s-2[UAS]\s+2 .5i
+One such niche product is the ``UAS'' system to maintain and audit
+RACF configurations on MVS systems.
+.nf
+\s-2\fChttp://www.entactinfo.com/products/uas/\fP\s+2
+.IP \s-2[UF]\s+2 .5i
+Quote from the User-Friendly cartoon by Illiad.
+.nf
+\s-2\fChttp://www.userfriendly.org/cartoons/archives/98nov/19981111.html\fP\s+2
diff --git a/share/doc/papers/kernmalloc/Makefile b/share/doc/papers/kernmalloc/Makefile
new file mode 100644
index 000000000000..f353016251b5
--- /dev/null
+++ b/share/doc/papers/kernmalloc/Makefile
@@ -0,0 +1,11 @@
+VOLUME= papers
+DOC= kernmalloc
+SRCS= kernmalloc.t appendix.ms
+EXTRA= alloc.fig usage.tbl
+MACROS= -ms
+USE_EQN=
+USE_PIC=
+USE_SOELIM=
+USE_TBL=
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/kernmalloc/alloc.fig b/share/doc/papers/kernmalloc/alloc.fig
new file mode 100644
index 000000000000..be313de4e673
--- /dev/null
+++ b/share/doc/papers/kernmalloc/alloc.fig
@@ -0,0 +1,109 @@
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.PS
+scale=100
+define m0 |
+[ box invis ht 16 wid 32 with .sw at 0,0
+line from 4,12 to 4,4
+line from 8,12 to 8,4
+line from 12,12 to 12,4
+line from 16,12 to 16,4
+line from 20,12 to 20,4
+line from 24,12 to 24,4
+line from 28,12 to 28,4
+line from 0,16 to 0,0
+line from 0,8 to 32,8
+] |
+
+define m1 |
+[ box invis ht 16 wid 32 with .sw at 0,0
+line from 8,12 to 8,4
+line from 16,12 to 16,4
+line from 24,12 to 24,4
+line from 0,8 to 32,8
+line from 0,16 to 0,0
+] |
+
+define m2 |
+[ box invis ht 16 wid 32 with .sw at 0,0
+line from 0,8 to 32,8
+line from 0,16 to 0,0
+] |
+
+define m3 |
+[ box invis ht 16 wid 31 with .sw at 0,0
+line from 15,12 to 15,4
+line from 0,8 to 31,8
+line from 0,16 to 0,0
+] |
+
+box invis ht 212 wid 580 with .sw at 0,0
+"\f1\s10\&kernel memory pages\f1\s0" at 168,204
+"\f1\s10\&Legend:\f1\s0" at 36,144
+"\f1\s10\&cont \- continuation of previous page\f1\s0" at 28,112 ljust
+"\f1\s10\&free \- unused page\f1\s0" at 28,128 ljust
+"\f1\s10\&Usage:\f1\s0" at 34,87
+"\f1\s10\&memsize(addr)\f1\s0" at 36,71 ljust
+"\f1\s10\&char *addr;\f1\s0" at 66,56 ljust
+"\f1\s10\&{\f1\s0" at 36,43 ljust
+"\f1\s10\&return(kmemsizes[(addr \- kmembase) \- \s-1PAGESIZE\s+1]);\f1" at 66,29 ljust
+"\f1\s10\&}\f1\s0" at 36,8 ljust
+line from 548,192 to 548,176
+line from 548,184 to 580,184 dotted
+"\f1\s10\&1024,\f1\s0" at 116,168
+"\f1\s10\&256,\f1\s0" at 148,168
+"\f1\s10\&512,\f1\s0" at 180,168
+"\f1\s10\&3072,\f1\s0" at 212,168
+"\f1\s10\&cont,\f1\s0" at 276,168
+"\f1\s10\&cont,\f1\s0" at 244,168
+"\f1\s10\&128,\f1\s0" at 308,168
+"\f1\s10\&128,\f1\s0" at 340,168
+"\f1\s10\&free,\f1\s0" at 372,168
+"\f1\s10\&cont,\f1\s0" at 404,168
+"\f1\s10\&128,\f1\s0" at 436,168
+"\f1\s10\&1024,\f1\s0" at 468,168
+"\f1\s10\&free,\f1\s0" at 500,168
+"\f1\s10\&cont,\f1\s0" at 532,168
+"\f1\s10\&cont,\f1\s0" at 564,168
+m2 with .nw at 100,192
+m1 with .nw at 132,192
+m3 with .nw at 164,192
+m2 with .nw at 196,192
+m2 with .nw at 228,192
+m2 with .nw at 260,192
+m0 with .nw at 292,192
+m0 with .nw at 324,192
+m2 with .nw at 356,192
+m2 with .nw at 388,192
+m0 with .nw at 420,192
+m2 with .nw at 452,192
+m2 with .nw at 484,192
+m2 with .nw at 516,192
+"\f1\s10\&kmemsizes[] = {\f1\s0" at 100,168 rjust
+"\f1\s10\&char *kmembase\f1\s0" at 97,184 rjust
+.PE
diff --git a/share/doc/papers/kernmalloc/appendix.ms b/share/doc/papers/kernmalloc/appendix.ms
new file mode 100644
index 000000000000..1c1f3dc092b0
--- /dev/null
+++ b/share/doc/papers/kernmalloc/appendix.ms
@@ -0,0 +1,268 @@
+.am vS
+..
+.am vE
+..
+'ss 23
+'ds _ \d\(mi\u
+'ps 9z
+'vs 10p
+'ds - \(mi
+'ds / \\h'\\w' 'u-\\w'/'u'/
+'ds /* \\h'\\w' 'u-\\w'/'u'/*
+'bd B 3
+'bd S B 3
+'nr cm 0
+'nf
+'de vH
+'ev 2
+'ft 1
+'sp .35i
+'tl '\s14\f3\\*(=F\fP\s0'\\*(=H'\f3\s14\\*(=F\fP\s0'
+'sp .25i
+'ft 1
+\f2\s12\h'\\n(.lu-\w'\\*(=f'u'\\*(=f\fP\s0\h'|0u'
+.sp .05i
+'ev
+'ds =G \\*(=F
+..
+'de vF
+'ev 2
+'sp .35i
+'ie o 'tl '\f2\\*(=M''Page % of \\*(=G\fP'
+'el 'tl '\f2Page % of \\*(=G''\\*(=M\fP'
+'bp
+'ev
+'ft 1
+'if \\n(cm=1 'ft 2
+..
+'de ()
+'pn 1
+..
+'de +C
+'nr cm 1
+'ft 2
+'ds +K
+'ds -K
+..
+'de -C
+'nr cm 0
+'ft 1
+'ds +K \f3
+'ds -K \fP
+..
+'+C
+'-C
+'am +C
+'ne 3
+..
+'de FN
+\f2\s14\h'\\n(.lu-\w'\\$1'u'\\$1\fP\s0\h'|0u'\c
+.if r x .if \\nx .if d =F .tm \\$1 \\*(=F \\n%
+'ds =f \&...\\$1
+..
+'de FC
+.if r x .if \\nx .if d =F .tm \\$1 \\*(=F \\n%
+'ds =f \&...\\$1
+..
+'de -F
+'rm =f
+..
+'ft 1
+'lg 0
+'-F
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.bp
+.H 1 "Appendix A - Implementation Details"
+.LP
+.nf
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Constants for setting the parameters of the kernel memory allocator\&.
+ \fI*\fP
+ \fI*\fP 2 \fI*\fP\fI*\fP MINBUCKET is the smallest unit of memory that will be
+ \fI*\fP allocated\&. It must be at least large enough to hold a pointer\&.
+ \fI*\fP
+ \fI*\fP Units of memory less or equal to MAXALLOCSAVE will permanently
+ \fI*\fP allocate physical memory; requests for these size pieces of memory
+ \fI*\fP are quite fast\&. Allocations greater than MAXALLOCSAVE must
+ \fI*\fP always allocate and free physical memory; requests for these size
+ \fI*\fP allocations should be done infrequently as they will be slow\&.
+ \fI*\fP Constraints: CLBYTES <= MAXALLOCSAVE <= 2 \fI*\fP\fI*\fP (MINBUCKET + 14)
+ \fI*\fP and MAXALLOCSIZE must be a power of two\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+K#define\*(-K MINBUCKET\h'|31n'4\h'|51n'\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+ 4 => min allocation of 16 bytes \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+'FN MAXALLOCSAVE
+\*(+K#define\*(-K MAXALLOCSAVE\h'|31n'(2 \fI*\fP CLBYTES)
+
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Maximum amount of kernel dynamic memory\&.
+ \fI*\fP Constraints: must be a multiple of the pagesize\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+'FN MAXKMEM
+\*(+K#define\*(-K MAXKMEM\h'|31n'(1024 \fI*\fP PAGESIZE)
+
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Arena for all kernel dynamic memory allocation\&.
+ \fI*\fP This arena is known to start on a page boundary\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+Kextern\*(-K \*(+Kchar\*(-K kmembase[MAXKMEM];
+
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Array of descriptors that describe the contents of each page
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+Kstruct\*(-K kmemsizes \*(+K{\*(-K
+\h'|11n'\*(+Kshort\*(-K\h'|21n'ks\*_indx;\h'|41n'\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+ bucket index, size of small allocations \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\h'|11n'u\*_short\h'|21n'ks\*_pagecnt;\h'|41n'\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+ for large allocations, pages allocated \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+K}\*(-K\c\c
+'-F
+ kmemsizes[MAXKMEM \fI\h'\w' 'u-\w'/'u'/\fP PAGESIZE];
+'FC MAXALLOCSAVE
+
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Set of buckets for each size of memory block that is retained
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+Kstruct\*(-K kmembuckets \*(+K{\*(-K
+\h'|11n'caddr\*_t kb\*_next;\h'|41n'\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+ list of free blocks \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+K}\*(-K\c\c
+'-F
+ bucket[MINBUCKET + 16];
+.bp
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Macro to convert a size to a bucket index\&. If the size is constant,
+ \fI*\fP this macro reduces to a compile time constant\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+'FN MINALLOCSIZE
+\*(+K#define\*(-K MINALLOCSIZE\h'|31n'(1 << MINBUCKET)
+'FN BUCKETINDX
+\*(+K#define\*(-K BUCKETINDX(size) \e
+\h'|11n'(size) <= (MINALLOCSIZE \fI*\fP 128) \e
+\h'|21n'? (size) <= (MINALLOCSIZE \fI*\fP 8) \e
+\h'|31n'? (size) <= (MINALLOCSIZE \fI*\fP 2) \e
+\h'|41n'? (size) <= (MINALLOCSIZE \fI*\fP 1) \e
+\h'|51n'? (MINBUCKET + 0) \e
+\h'|51n': (MINBUCKET + 1) \e
+\h'|41n': (size) <= (MINALLOCSIZE \fI*\fP 4) \e
+\h'|51n'? (MINBUCKET + 2) \e
+\h'|51n': (MINBUCKET + 3) \e
+\h'|31n': (size) <= (MINALLOCSIZE\fI*\fP 32) \e
+\h'|41n'? (size) <= (MINALLOCSIZE \fI*\fP 16) \e
+\h'|51n'? (MINBUCKET + 4) \e
+\h'|51n': (MINBUCKET + 5) \e
+\h'|41n': (size) <= (MINALLOCSIZE \fI*\fP 64) \e
+\h'|51n'? (MINBUCKET + 6) \e
+\h'|51n': (MINBUCKET + 7) \e
+\h'|21n': (size) <= (MINALLOCSIZE \fI*\fP 2048) \e
+\h'|31n'\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+ etc \&.\&.\&. \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Macro versions for the usual cases of malloc\fI\h'\w' 'u-\w'/'u'/\fPfree
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+'FN MALLOC
+\*(+K#define\*(-K MALLOC(space, cast, size, flags) \*(+K{\*(-K \e
+\h'|11n'\*(+Kregister\*(-K \*(+Kstruct\*(-K kmembuckets \fI*\fPkbp = &bucket[BUCKETINDX(size)]; \e
+\h'|11n'\*(+Klong\*(-K s = splimp(); \e
+\h'|11n'\*(+Kif\*(-K (kbp\*->kb\*_next == NULL) \*(+K{\*(-K \e
+\h'|21n'(space) = (cast)malloc(size, flags); \e
+\h'|11n'\*(+K}\*(-K \*(+Kelse\*(-K \*(+K{\*(-K \e
+\h'|21n'(space) = (cast)kbp\*->kb\*_next; \e
+\h'|21n'kbp\*->kb\*_next = \fI*\fP(caddr\*_t \fI*\fP)(space); \e
+\h'|11n'\*(+K}\*(-K \e
+\h'|11n'splx(s); \e
+\*(+K}\*(-K\c\c
+'-F
+
+'FC BUCKETINDX
+
+'FN FREE
+\*(+K#define\*(-K FREE(addr) \*(+K{\*(-K \e
+\h'|11n'\*(+Kregister\*(-K \*(+Kstruct\*(-K kmembuckets \fI*\fPkbp; \e
+\h'|11n'\*(+Kregister\*(-K \*(+Kstruct\*(-K kmemsizes \fI*\fPksp = \e
+\h'|21n'&kmemsizes[((addr) \*- kmembase) \fI\h'\w' 'u-\w'/'u'/\fP PAGESIZE]; \e
+\h'|11n'\*(+Klong\*(-K s = splimp(); \e
+\h'|11n'\*(+Kif\*(-K (1 << ksp\*->ks\*_indx > MAXALLOCSAVE) \*(+K{\*(-K \e
+\h'|21n'free(addr); \e
+\h'|11n'\*(+K}\*(-K \*(+Kelse\*(-K \*(+K{\*(-K \e
+\h'|21n'kbp = &bucket[ksp\*->ks\*_indx]; \e
+\h'|21n'\fI*\fP(caddr\*_t \fI*\fP)(addr) = kbp\*->kb\*_next; \e
+\h'|21n'kbp\*->kb\*_next = (caddr\*_t)(addr); \e
+\h'|11n'\*(+K}\*(-K \e
+\h'|11n'splx(s); \e
+\*(+K}\*(-K\c\c
+'-F
+
+'FC BUCKETINDX
+.vE
diff --git a/share/doc/papers/kernmalloc/appendix.t b/share/doc/papers/kernmalloc/appendix.t
new file mode 100644
index 000000000000..b0248856938b
--- /dev/null
+++ b/share/doc/papers/kernmalloc/appendix.t
@@ -0,0 +1,131 @@
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.bp
+.H 1 "Appendix A - Implementation Details"
+.LP
+.nf
+.vS
+/*
+ * Constants for setting the parameters of the kernel memory allocator.
+ *
+ * 2 ** MINBUCKET is the smallest unit of memory that will be
+ * allocated. It must be at least large enough to hold a pointer.
+ *
+ * Units of memory less or equal to MAXALLOCSAVE will permanently
+ * allocate physical memory; requests for these size pieces of memory
+ * are quite fast. Allocations greater than MAXALLOCSAVE must
+ * always allocate and free physical memory; requests for these size
+ * allocations should be done infrequently as they will be slow.
+ * Constraints: CLBYTES <= MAXALLOCSAVE <= 2 ** (MINBUCKET + 14)
+ * and MAXALLOCSIZE must be a power of two.
+ */
+#define MINBUCKET 4 /* 4 => min allocation of 16 bytes */
+#define MAXALLOCSAVE (2 * CLBYTES)
+
+/*
+ * Maximum amount of kernel dynamic memory.
+ * Constraints: must be a multiple of the pagesize.
+ */
+#define MAXKMEM (1024 * PAGESIZE)
+
+/*
+ * Arena for all kernel dynamic memory allocation.
+ * This arena is known to start on a page boundary.
+ */
+extern char kmembase[MAXKMEM];
+
+/*
+ * Array of descriptors that describe the contents of each page
+ */
+struct kmemsizes {
+ short ks_indx; /* bucket index, size of small allocations */
+ u_short ks_pagecnt; /* for large allocations, pages allocated */
+} kmemsizes[MAXKMEM / PAGESIZE];
+
+/*
+ * Set of buckets for each size of memory block that is retained
+ */
+struct kmembuckets {
+ caddr_t kb_next; /* list of free blocks */
+} bucket[MINBUCKET + 16];
+.bp
+/*
+ * Macro to convert a size to a bucket index. If the size is constant,
+ * this macro reduces to a compile time constant.
+ */
+#define MINALLOCSIZE (1 << MINBUCKET)
+#define BUCKETINDX(size) \
+ (size) <= (MINALLOCSIZE * 128) \
+ ? (size) <= (MINALLOCSIZE * 8) \
+ ? (size) <= (MINALLOCSIZE * 2) \
+ ? (size) <= (MINALLOCSIZE * 1) \
+ ? (MINBUCKET + 0) \
+ : (MINBUCKET + 1) \
+ : (size) <= (MINALLOCSIZE * 4) \
+ ? (MINBUCKET + 2) \
+ : (MINBUCKET + 3) \
+ : (size) <= (MINALLOCSIZE* 32) \
+ ? (size) <= (MINALLOCSIZE * 16) \
+ ? (MINBUCKET + 4) \
+ : (MINBUCKET + 5) \
+ : (size) <= (MINALLOCSIZE * 64) \
+ ? (MINBUCKET + 6) \
+ : (MINBUCKET + 7) \
+ : (size) <= (MINALLOCSIZE * 2048) \
+ /* etc ... */
+
+/*
+ * Macro versions for the usual cases of malloc/free
+ */
+#define MALLOC(space, cast, size, flags) { \
+ register struct kmembuckets *kbp = &bucket[BUCKETINDX(size)]; \
+ long s = splimp(); \
+ if (kbp->kb_next == NULL) { \
+ (space) = (cast)malloc(size, flags); \
+ } else { \
+ (space) = (cast)kbp->kb_next; \
+ kbp->kb_next = *(caddr_t *)(space); \
+ } \
+ splx(s); \
+}
+
+#define FREE(addr) { \
+ register struct kmembuckets *kbp; \
+ register struct kmemsizes *ksp = \
+ &kmemsizes[((addr) - kmembase) / PAGESIZE]; \
+ long s = splimp(); \
+ if (1 << ksp->ks_indx > MAXALLOCSAVE) { \
+ free(addr); \
+ } else { \
+ kbp = &bucket[ksp->ks_indx]; \
+ *(caddr_t *)(addr) = kbp->kb_next; \
+ kbp->kb_next = (caddr_t)(addr); \
+ } \
+ splx(s); \
+}
+.vE
diff --git a/share/doc/papers/kernmalloc/kernmalloc.t b/share/doc/papers/kernmalloc/kernmalloc.t
new file mode 100644
index 000000000000..26cb605b6958
--- /dev/null
+++ b/share/doc/papers/kernmalloc/kernmalloc.t
@@ -0,0 +1,646 @@
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" reference a system routine name
+.de RN
+\fI\\$1\fP\^(\h'1m/24u')\\$2
+..
+.\" reference a header name
+.de H
+.NH \\$1
+\\$2
+..
+.\" begin figure
+.\" .FI "title"
+.nr Fn 0 1
+.de FI
+.ds Lb Figure \\n+(Fn
+.ds Lt \\$1
+.KF
+.DS B
+.nf
+..
+.\"
+.\" end figure
+.de Fe
+.DE
+.ce
+\\*(Lb. \\*(Lt
+.sp
+.KE
+..
+.EQ
+delim $$
+.EN
+.ds CH "
+.pn 295
+.sp
+.rs
+.ps -1
+.sp -1
+.fi
+Reprinted from:
+\fIProceedings of the San Francisco USENIX Conference\fP,
+pp. 295-303, June 1988.
+.ps
+.\".sp |\n(HMu
+.rm CM
+.nr PO 1.25i
+.TL
+Design of a General Purpose Memory Allocator for the 4.3BSD UNIX\(dg Kernel
+.ds LF Summer USENIX '88
+.ds CF "%
+.ds RF San Francisco, June 20-24
+.EH 'Design of a General Purpose Memory ...''McKusick, Karels'
+.OH 'McKusick, Karels''Design of a General Purpose Memory ...'
+.FS
+\(dgUNIX is a registered trademark of AT&T in the US and other countries.
+.FE
+.AU
+Marshall Kirk McKusick
+.AU
+Michael J. Karels
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+The 4.3BSD UNIX kernel uses many memory allocation mechanisms,
+each designed for the particular needs of the utilizing subsystem.
+This paper describes a general purpose dynamic memory allocator
+that can be used by all of the kernel subsystems.
+The design of this allocator takes advantage of known memory usage
+patterns in the UNIX kernel and a hybrid strategy that is time-efficient
+for small allocations and space-efficient for large allocations.
+This allocator replaces the multiple memory allocation interfaces
+with a single easy-to-program interface,
+results in more efficient use of global memory by eliminating
+partitioned and specialized memory pools,
+and is quick enough that no performance loss is observed
+relative to the current implementations.
+The paper concludes with a discussion of our experience in using
+the new memory allocator,
+and directions for future work.
+.AE
+.LP
+.H 1 "Kernel Memory Allocation in 4.3BSD
+.PP
+The 4.3BSD kernel has at least ten different memory allocators.
+Some of them handle large blocks,
+some of them handle small chained data structures,
+and others include information to describe I/O operations.
+Often the allocations are for small pieces of memory that are only
+needed for the duration of a single system call.
+In a user process such short-term
+memory would be allocated on the run-time stack.
+Because the kernel has a limited run-time stack,
+it is not feasible to allocate even moderate blocks of memory on it.
+Consequently, such memory must be allocated through a more dynamic mechanism.
+For example,
+when the system must translate a pathname,
+it must allocate a one kilobyte buffer to hold the name.
+Other blocks of memory must be more persistent than a single system call
+and really have to be allocated from dynamic memory.
+Examples include protocol control blocks that remain throughout
+the duration of the network connection.
+.PP
+Demands for dynamic memory allocation in the kernel have increased
+as more services have been added.
+Each time a new type of memory allocation has been required,
+a specialized memory allocation scheme has been written to handle it.
+Often the new memory allocation scheme has been built on top
+of an older allocator.
+For example, the block device subsystem provides a crude form of
+memory allocation through the allocation of empty buffers [Thompson78].
+The allocation is slow because of the implied semantics of
+finding the oldest buffer, pushing its contents to disk if they are dirty,
+and moving physical memory into or out of the buffer to create
+the requested size.
+To reduce the overhead, a ``new'' memory allocator was built in 4.3BSD
+for name translation that allocates a pool of empty buffers.
+It keeps them on a free list so they can
+be quickly allocated and freed [McKusick85].
+.PP
+This memory allocation method has several drawbacks.
+First, the new allocator can only handle a limited range of sizes.
+Second, it depletes the buffer pool, as it steals memory intended
+to buffer disk blocks to other purposes.
+Finally, it creates yet another interface of
+which the programmer must be aware.
+.PP
+A generalized memory allocator is needed to reduce the complexity
+of writing code inside the kernel.
+Rather than providing many semi-specialized ways of allocating memory,
+the kernel should provide a single general purpose allocator.
+With only a single interface,
+programmers do not need to figure
+out the most appropriate way to allocate memory.
+If a good general purpose allocator is available,
+it helps avoid the syndrome of creating yet another special
+purpose allocator.
+.PP
+To ease the task of understanding how to use it,
+the memory allocator should have an interface similar to the interface
+of the well-known memory allocator provided for
+applications programmers through the C library routines
+.RN malloc
+and
+.RN free .
+Like the C library interface,
+the allocation routine should take a parameter specifying the
+size of memory that is needed.
+The range of sizes for memory requests should not be constrained.
+The free routine should take a pointer to the storage being freed,
+and should not require additional information such as the size
+of the piece of memory being freed.
+.H 1 "Criteria for a Kernel Memory Allocator
+.PP
+The design specification for a kernel memory allocator is similar to,
+but not identical to,
+the design criteria for a user level memory allocator.
+The first criterion for a memory allocator is that it make good use
+of the physical memory.
+Good use of memory is measured by the amount of memory needed to hold
+a set of allocations at any point in time.
+Percentage utilization is expressed as:
+.ie t \{\
+.EQ
+utilization~=~requested over required
+.EN
+.\}
+.el \{\
+.sp
+.ce
+\fIutilization\fP=\fIrequested\fP/\fIrequired\fP
+.sp
+.\}
+Here, ``requested'' is the sum of the memory that has been requested
+and not yet freed.
+``Required'' is the amount of memory that has been
+allocated for the pool from which the requests are filled.
+An allocator requires more memory than requested because of fragmentation
+and a need to have a ready supply of free memory for future requests.
+A perfect memory allocator would have a utilization of 100%.
+In practice,
+having a 50% utilization is considered good [Korn85].
+.PP
+Good memory utilization in the kernel is more important than
+in user processes.
+Because user processes run in virtual memory,
+unused parts of their address space can be paged out.
+Thus pages in the process address space
+that are part of the ``required'' pool that are not
+being ``requested'' need not tie up physical memory.
+Because the kernel is not paged,
+all pages in the ``required'' pool are held by the kernel and
+cannot be used for other purposes.
+To keep the kernel utilization percentage as high as possible,
+it is desirable to release unused memory in the ``required'' pool
+rather than to hold it as is typically done with user processes.
+Because the kernel can directly manipulate its own page maps,
+releasing unused memory is fast;
+a user process must do a system call to release memory.
+.PP
+The most important criterion for a memory allocator is that it be fast.
+Because memory allocation is done frequently,
+a slow memory allocator will degrade the system performance.
+Speed of allocation is more critical when executing in the
+kernel than in user code,
+because the kernel must allocate many data structures that user
+processes can allocate cheaply on their run-time stack.
+In addition, the kernel represents the platform on which all user
+processes run,
+and if it is slow, it will degrade the performance of every process
+that is running.
+.PP
+Another problem with a slow memory allocator is that programmers
+of frequently-used kernel interfaces will feel that they
+cannot afford to use it as their primary memory allocator.
+Instead they will build their own memory allocator on top of the
+original by maintaining their own pool of memory blocks.
+Multiple allocators reduce the efficiency with which memory is used.
+The kernel ends up with many different free lists of memory
+instead of a single free list from which all allocation can be drawn.
+For example,
+consider the case of two subsystems that need memory.
+If they have their own free lists,
+the amount of memory tied up in the two lists will be the
+sum of the greatest amount of memory that each of
+the two subsystems has ever used.
+If they share a free list,
+the amount of memory tied up in the free list may be as low as the
+greatest amount of memory that either subsystem used.
+As the number of subsystems grows,
+the savings from having a single free list grow.
+.H 1 "Existing User-level Implementations
+.PP
+There are many different algorithms and
+implementations of user-level memory allocators.
+A survey of those available on UNIX systems appeared in [Korn85].
+Nearly all of the memory allocators tested made good use of memory,
+though most of them were too slow for use in the kernel.
+The fastest memory allocator in the survey by nearly a factor of two
+was the memory allocator provided on 4.2BSD originally
+written by Chris Kingsley at California Institute of Technology.
+Unfortunately,
+the 4.2BSD memory allocator also wasted twice as much memory
+as its nearest competitor in the survey.
+.PP
+The 4.2BSD user-level memory allocator works by maintaining a set of lists
+that are ordered by increasing powers of two.
+Each list contains a set of memory blocks of its corresponding size.
+To fulfill a memory request,
+the size of the request is rounded up to the next power of two.
+A piece of memory is then removed from the list corresponding
+to the specified power of two and returned to the requester.
+Thus, a request for a block of memory of size 53 returns
+a block from the 64-sized list.
+A typical memory allocation requires a roundup calculation
+followed by a linked list removal.
+Only if the list is empty is a real memory allocation done.
+The free operation is also fast;
+the block of memory is put back onto the list from which it came.
+The correct list is identified by a size indicator stored
+immediately preceding the memory block.
+.H 1 "Considerations Unique to a Kernel Allocator
+.PP
+There are several special conditions that arise when writing a
+memory allocator for the kernel that do not apply to a user process
+memory allocator.
+First, the maximum memory allocation can be determined at
+the time that the machine is booted.
+This number is never more than the amount of physical memory on the machine,
+and is typically much less since a machine with all its
+memory dedicated to the operating system is uninteresting to use.
+Thus, the kernel can statically allocate a set of data structures
+to manage its dynamically allocated memory.
+These data structures never need to be
+expanded to accommodate memory requests;
+yet, if properly designed, they need not be large.
+For a user process, the maximum amount of memory that may be allocated
+is a function of the maximum size of its virtual memory.
+Although it could allocate static data structures to manage
+its entire virtual memory,
+even if they were efficiently encoded they would potentially be huge.
+The other alternative is to allocate data structures as they are needed.
+However, that adds extra complications such as new
+failure modes if it cannot allocate space for additional
+structures and additional mechanisms to link them all together.
+.PP
+Another special condition of the kernel memory allocator is that it
+can control its own address space.
+Unlike user processes that can only grow and shrink their heap at one end,
+the kernel can keep an arena of kernel addresses and allocate
+pieces from that arena which it then populates with physical memory.
+The effect is much the same as a user process that has parts of
+its address space paged out when they are not in use,
+except that the kernel can explicitly control the set of pages
+allocated to its address space.
+The result is that the ``working set'' of pages in use by the
+kernel exactly corresponds to the set of pages that it is really using.
+.FI "One day memory usage on a Berkeley time-sharing machine"
+.so usage.tbl
+.Fe
+.PP
+A final special condition that applies to the kernel is that
+all of the different uses of dynamic memory are known in advance.
+Each one of these uses of dynamic memory can be assigned a type.
+For each type of dynamic memory that is allocated,
+the kernel can provide allocation limits.
+One reason given for having separate allocators is that
+no single allocator could starve the rest of the kernel of all
+its available memory and thus a single runaway
+client could not paralyze the system.
+By putting limits on each type of memory,
+the single general purpose memory allocator can provide the same
+protection against memory starvation.\(dg
+.FS
+\(dgOne might seriously ask the question what good it is if ``only''
+one subsystem within the kernel hangs if it is something like the
+network on a diskless workstation.
+.FE
+.PP
+\*(Lb shows the memory usage of the kernel over a one day period
+on a general timesharing machine at Berkeley.
+The ``In Use'', ``Free'', and ``Mem Use'' fields are instantaneous values;
+the ``Requests'' field is the number of allocations since system startup;
+the ``High Use'' field is the maximum value of
+the ``Mem Use'' field since system startup.
+The figure demonstrates that most
+allocations are for small objects.
+Large allocations occur infrequently,
+and are typically for long-lived objects
+such as buffers to hold the superblock for
+a mounted file system.
+Thus, a memory allocator only needs to be
+fast for small pieces of memory.
+.H 1 "Implementation of the Kernel Memory Allocator
+.PP
+In reviewing the available memory allocators,
+none of their strategies could be used without some modification.
+The kernel memory allocator that we ended up with is a hybrid
+of the fast memory allocator found in the 4.2BSD C library
+and a slower but more-memory-efficient first-fit allocator.
+.PP
+Small allocations are done using the 4.2BSD power-of-two list strategy;
+the typical allocation requires only a computation of
+the list to use and the removal of an element if it is available,
+so it is quite fast.
+Macros are provided to avoid the cost of a subroutine call.
+Only if the request cannot be fulfilled from a list is a call
+made to the allocator itself.
+To ensure that the allocator is always called for large requests,
+the lists corresponding to large allocations are always empty.
+Appendix A shows the data structures and implementation of the macros.
+.PP
+Similarly, freeing a block of memory can be done with a macro.
+The macro computes the list on which to place the request
+and puts it there.
+The free routine is called only if the block of memory is
+considered to be a large allocation.
+Including the cost of blocking out interrupts,
+the allocation and freeing macros generate respectively
+only nine and sixteen (simple) VAX instructions.
+.PP
+Because of the inefficiency of power-of-two allocation strategies
+for large allocations,
+a different strategy is used for allocations larger than two kilobytes.
+The selection of two kilobytes is derived from our statistics on
+the utilization of memory within the kernel,
+that showed that 95 to 98% of allocations are of size one kilobyte or less.
+A frequent caller of the memory allocator
+(the name translation function)
+always requests a one kilobyte block.
+Additionally the allocation method for large blocks is based on allocating
+pieces of memory in multiples of pages.
+Consequently the actual allocation size for requests of size
+$2~times~pagesize$ or less are identical.\(dg
+.FS
+\(dgTo understand why this number is $size 8 {2~times~pagesize}$ one
+observes that the power-of-two algorithm yields sizes of 1, 2, 4, 8, \&...
+pages while the large block algorithm that allocates in multiples
+of pages yields sizes of 1, 2, 3, 4, \&... pages.
+Thus for allocations of sizes between one and two pages
+both algorithms use two pages;
+it is not until allocations of sizes between two and three pages
+that a difference emerges where the power-of-two algorithm will use
+four pages while the large block algorithm will use three pages.
+.FE
+In 4.3BSD on the VAX, the (software) page size is one kilobyte,
+so two kilobytes is the smallest logical cutoff.
+.PP
+Large allocations are first rounded up to be a multiple of the page size.
+The allocator then uses a first-fit algorithm to find space in the
+kernel address arena set aside for dynamic allocations.
+Thus a request for a five kilobyte piece of memory will use exactly
+five pages of memory rather than eight kilobytes as with
+the power-of-two allocation strategy.
+When a large piece of memory is freed,
+the memory pages are returned to the free memory pool,
+and the address space is returned to the kernel address arena
+where it is coalesced with adjacent free pieces.
+.PP
+Another technique to improve both the efficiency of memory utilization
+and the speed of allocation
+is to cluster same-sized small allocations on a page.
+When a list for a power-of-two allocation is empty,
+a new page is allocated and divided into pieces of the needed size.
+This strategy speeds future allocations as several pieces of memory
+become available as a result of the call into the allocator.
+.PP
+.FI "Calculation of allocation size"
+.so alloc.fig
+.Fe
+Because the size is not specified when a block of memory is freed,
+the allocator must keep track of the sizes of the pieces it has handed out.
+The 4.2BSD user-level allocator stores the size of each block
+in a header just before the allocation.
+However, this strategy doubles the memory requirement for allocations that
+require a power-of-two-sized block.
+Therefore,
+instead of storing the size of each piece of memory with the piece itself,
+the size information is associated with the memory page.
+\*(Lb shows how the kernel determines
+the size of a piece of memory that is being freed,
+by calculating the page in which it resides,
+and looking up the size associated with that page.
+Eliminating the cost of the overhead per piece improved utilization
+far more than expected.
+The reason is that many allocations in the kernel are for blocks of
+memory whose size is exactly a power of two.
+These requests would be nearly doubled if the user-level strategy were used.
+Now they can be accommodated with no wasted memory.
+.PP
+The allocator can be called both from the top half of the kernel,
+which is willing to wait for memory to become available,
+and from the interrupt routines in the bottom half of the kernel
+that cannot wait for memory to become available.
+Clients indicate their willingness (and ability) to wait with a flag
+to the allocation routine.
+For clients that are willing to wait,
+the allocator guarantees that their request will succeed.
+Thus, these clients need not check the return value from the allocator.
+If memory is unavailable and the client cannot wait,
+the allocator returns a null pointer.
+These clients must be prepared to cope with this
+(hopefully infrequent) condition
+(usually by giving up and hoping to do better later).
+.H 1 "Results of the Implementation
+.PP
+The new memory allocator was written about a year ago.
+Conversion from the old memory allocators to the new allocator
+has been going on ever since.
+Many of the special purpose allocators have been eliminated.
+This list includes
+.RN calloc ,
+.RN wmemall ,
+and
+.RN zmemall .
+Many of the special purpose memory allocators built on
+top of other allocators have also been eliminated.
+For example, the allocator that was built on top of the buffer pool allocator
+.RN geteblk
+to allocate pathname buffers in
+.RN namei
+has been eliminated.
+Because the typical allocation is so fast,
+we have found that none of the special purpose pools are needed.
+Indeed, the allocation is about the same as the previous cost of
+allocating buffers from the network pool (\fImbuf\fP\^s).
+Consequently applications that used to allocate network
+buffers for their own uses have been switched over to using
+the general purpose allocator without increasing their running time.
+.PP
+Quantifying the performance of the allocator is difficult because
+it is hard to measure the amount of time spent allocating
+and freeing memory in the kernel.
+The usual approach is to compile a kernel for profiling
+and then compare the running time of the routines that
+implemented the old abstraction versus those that implement the new one.
+The old routines are difficult to quantify because
+individual routines were used for more than one purpose.
+For example, the
+.RN geteblk
+routine was used both to allocate one kilobyte memory blocks
+and for its intended purpose of providing buffers to the filesystem.
+Differentiating these uses is often difficult.
+To get a measure of the cost of memory allocation before
+putting in our new allocator,
+we summed up the running time of all the routines whose
+exclusive task was memory allocation.
+To this total we added the fraction
+of the running time of the multi-purpose routines that could
+clearly be identified as memory allocation usage.
+This number showed that approximately three percent of
+the time spent in the kernel could be accounted to memory allocation.
+.PP
+The new allocator is difficult to measure
+because the usual case of the memory allocator is implemented as a macro.
+Thus, its running time is a small fraction of the running time of the
+numerous routines in the kernel that use it.
+To get a bound on the cost,
+we changed the macro always to call the memory allocation routine.
+Running in this mode, the memory allocator accounted for six percent
+of the time spent in the kernel.
+Factoring out the cost of the statistics collection and the
+subroutine call overhead for the cases that could
+normally be handled by the macro,
+we estimate that the allocator would account for
+at most four percent of time in the kernel.
+These measurements show that the new allocator does not introduce
+significant new run-time costs.
+.PP
+The other major success has been in keeping the size information
+on a per-page basis.
+This technique allows the most frequently requested sizes to be
+allocated without waste.
+It also reduces the amount of bookkeeping information associated
+with the allocator to four kilobytes of information
+per megabyte of memory under management (with a one kilobyte page size).
+.H 1 "Future Work
+.PP
+Our next project is to convert many of the static
+kernel tables to be dynamically allocated.
+Static tables include the process table, the file table,
+and the mount table.
+Making these tables dynamic will have two benefits.
+First, it will reduce the amount of memory
+that must be statically allocated at boot time.
+Second, it will eliminate the arbitrary upper limit imposed
+by the current static sizing
+(although a limit will be retained to constrain runaway clients).
+Other researchers have already shown the memory savings
+achieved by this conversion [Rodriguez88].
+.PP
+Under the current implementation,
+memory is never moved from one size list to another.
+With the 4.2BSD memory allocator this causes problems,
+particularly for large allocations where a process may use
+a quarter megabyte piece of memory once,
+which is then never available for any other size request.
+In our hybrid scheme,
+memory can be shuffled between large requests so that large blocks
+of memory are never stranded as they are with the 4.2BSD allocator.
+However, pages allocated to small requests are allocated once
+to a particular size and never changed thereafter.
+If a burst of requests came in for a particular size,
+that size would acquire a large amount of memory
+that would then not be available for other future requests.
+.PP
+In practice, we do not find that the free lists become too large.
+However, we have been investigating ways to handle such problems
+if they occur in the future.
+Our current investigations involve a routine
+that can run as part of the idle loop that would sort the elements
+on each of the free lists into order of increasing address.
+Since any given page has only one size of elements allocated from it,
+the effect of the sorting would be to sort the list into distinct pages.
+When all the pieces of a page became free,
+the page itself could be released back to the free pool so that
+it could be allocated to another purpose.
+Although there is no guarantee that all the pieces of a page would ever
+be freed,
+most allocations are short-lived, lasting only for the duration of
+an open file descriptor, an open network connection, or a system call.
+As new allocations would be made from the page sorted to
+the front of the list,
+return of elements from pages at the back would eventually
+allow pages later in the list to be freed.
+.PP
+Two of the traditional UNIX
+memory allocators remain in the current system.
+The terminal subsystem uses \fIclist\fP\^s (character lists).
+That part of the system is expected to undergo major revision within
+the next year or so, and it will probably be changed to use
+\fImbuf\fP\^s as it is merged into the network system.
+The other major allocator that remains is
+.RN getblk ,
+the routine that manages the filesystem buffer pool memory
+and associated control information.
+Only the filesystem uses
+.RN getblk
+in the current system;
+it manages the constant-sized buffer pool.
+We plan to merge the filesystem buffer cache into the virtual memory system's
+page cache in the future.
+This change will allow the size of the buffer pool to be changed
+according to memory load,
+but will require a policy for balancing memory needs
+with filesystem cache performance.
+.H 1 "Acknowledgments
+.PP
+In the spirit of community support,
+we have made various versions of our allocator available to our test sites.
+They have been busily burning it in and giving
+us feedback on their experiences.
+We acknowledge their invaluable input.
+The feedback from the Usenix program committee on the initial draft of
+our paper suggested numerous important improvements.
+.H 1 "References
+.LP
+.IP Korn85 \w'Rodriguez88\0\0'u
+David Korn, Kiem-Phong Vo,
+``In Search of a Better Malloc''
+\fIProceedings of the Portland Usenix Conference\fP,
+pp 489-506, June 1985.
+.IP McKusick85
+M. McKusick, M. Karels, S. Leffler,
+``Performance Improvements and Functional Enhancements in 4.3BSD''
+\fIProceedings of the Portland Usenix Conference\fP,
+pp 519-531, June 1985.
+.IP Rodriguez88
+Robert Rodriguez, Matt Koehler, Larry Palmer, Ricky Palmer,
+``A Dynamic UNIX Operating System''
+\fIProceedings of the San Francisco Usenix Conference\fP,
+June 1988.
+.IP Thompson78
+Ken Thompson,
+``UNIX Implementation''
+\fIBell System Technical Journal\fP, volume 57, number 6,
+pp 1931-1946, 1978.
diff --git a/share/doc/papers/kernmalloc/spell.ok b/share/doc/papers/kernmalloc/spell.ok
new file mode 100644
index 000000000000..10c3ab7d8ed4
--- /dev/null
+++ b/share/doc/papers/kernmalloc/spell.ok
@@ -0,0 +1,57 @@
+BUCKETINDX
+CLBYTES
+CM
+Karels
+Kiem
+Koehler
+Korn
+Korn85
+MAXALLOCSAVE
+MAXALLOCSIZE
+MAXKMEM
+MINALLOCSIZE
+MINBUCKET
+Matt
+McKusick
+McKusick85
+Mem
+Phong
+Ricky
+Rodriguez88
+S.Leffler
+Thompson78
+ULTRIX
+Usenix
+VAX
+Vo
+arptbl
+caddr
+devbuf
+extern
+fragtbl
+freelist
+geteblk
+indx
+ioctlops
+kb
+kbp
+kmembase
+kmembuckets
+kmemsizes
+ks
+ksp
+mbuf
+mbufs
+namei
+pagecnt
+pathname
+pcb
+pp
+routetbl
+runtime
+splimp
+splx
+superblk
+temp
+wmemall
+zmemall
diff --git a/share/doc/papers/kernmalloc/usage.tbl b/share/doc/papers/kernmalloc/usage.tbl
new file mode 100644
index 000000000000..ccbc5d2afa6b
--- /dev/null
+++ b/share/doc/papers/kernmalloc/usage.tbl
@@ -0,0 +1,69 @@
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.TS
+box;
+c s s s
+c c c c
+n n n n.
+Memory statistics by bucket size
+=
+Size In Use Free Requests
+_
+128 329 39 3129219
+256 0 0 0
+512 4 0 16
+1024 17 5 648771
+2048 13 0 13
+2049\-4096 0 0 157
+4097\-8192 2 0 103
+8193\-16384 0 0 0
+16385\-32768 1 0 1
+.TE
+.DE
+.DS B
+.TS
+box;
+c s s s s
+c c c c c
+c n n n n.
+Memory statistics by type
+=
+Type In Use Mem Use High Use Requests
+_
+mbuf 6 1K 17K 3099066
+devbuf 13 53K 53K 13
+socket 37 5K 6K 1275
+pcb 55 7K 8K 1512
+routetbl 229 29K 29K 2424
+fragtbl 0 0K 1K 404
+zombie 3 1K 1K 24538
+namei 0 0K 5K 648754
+ioctlops 0 0K 1K 12
+superblk 24 34K 34K 24
+temp 0 0K 8K 258
+.TE
diff --git a/share/doc/papers/kerntune/0.t b/share/doc/papers/kerntune/0.t
new file mode 100644
index 000000000000..6e8af80fcfd2
--- /dev/null
+++ b/share/doc/papers/kerntune/0.t
@@ -0,0 +1,123 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.EQ
+delim $$
+.EN
+.if n .ND
+.TL
+Using gprof to Tune the 4.2BSD Kernel
+.AU
+Marshall Kirk McKusick
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+This paper describes how the \fIgprof\fP profiler
+accounts for the running time of called routines
+in the running time of the routines that call them.
+It then explains how to configure a profiling kernel on
+the 4.2 Berkeley Software Distribution of
+.UX
+for the VAX\(dd
+.FS
+\(dd VAX is a trademark of Digital Equipment Corporation.
+.FE
+and discusses tradeoffs in techniques for collecting
+profile data.
+\fIGprof\fP identifies problems
+that severely affect the overall performance of the kernel.
+Once a potential problem area is identified,
+benchmark programs are devised to highlight the bottleneck.
+These benchmarks verify that the problem exists and provide
+a metric against which to validate proposed solutions.
+Two caches are added to the kernel to alleviate the bottleneck
+and \fIgprof\fP is used to validate their effectiveness.
+.AE
+.LP
+.de PT
+.lt \\n(LLu
+.pc %
+.nr PN \\n%
+.tl '\\*(LH'\\*(CH'\\*(RH'
+.lt \\n(.lu
+..
+.af PN i
+.ds LH 4.2BSD Performance
+.ds RH Contents
+.bp 1
+.if t .ds CF May 21, 1984
+.if t .ds LF
+.if t .ds RF McKusick
+.ce
+.B "TABLE OF CONTENTS"
+.LP
+.sp 1
+.nf
+.B "1. Introduction"
+.LP
+.sp .5v
+.nf
+.B "2. The \fIgprof\fP Profiler"
+\0.1. Data Presentation
+\0.1.1. The Flat Profile
+\0.1.2. The Call Graph Profile
+\0.2. Profiling the Kernel
+.LP
+.sp .5v
+.nf
+.B "3. Using \fIgprof\fP to Improve Performance
+\0.1. Using the Profiler
+\0.2. An Example of Tuning
+.LP
+.sp .5v
+.nf
+.B "4. Conclusions"
+.LP
+.sp .5v
+.nf
+.B Acknowledgements
+.LP
+.sp .5v
+.nf
+.B References
+.af PN 1
+.bp 1
+.de _d
+.if t .ta .6i 2.1i 2.6i
+.\" 2.94 went to 2.6, 3.64 to 3.30
+.if n .ta .84i 2.6i 3.30i
+..
+.de _f
+.if t .ta .5i 1.25i 2.5i
+.\" 3.5i went to 3.8i
+.if n .ta .7i 1.75i 3.8i
+..
diff --git a/share/doc/papers/kerntune/1.t b/share/doc/papers/kerntune/1.t
new file mode 100644
index 000000000000..ad0e25d7b029
--- /dev/null
+++ b/share/doc/papers/kerntune/1.t
@@ -0,0 +1,42 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Introduction
+.NH 1
+Introduction
+.PP
+The purpose of this paper is to describe the tools and techniques
+that are available for improving the performance of the kernel.
+The primary tool used to measure the kernel is the hierarchical
+profiler \fIgprof\fP.
+The profiler enables the user to measure the cost of
+the abstractions that the kernel provides to the user.
+Once the expensive abstractions are identified,
+optimizations are postulated to help improve their performance.
+These optimizations are each individually
+verified to ensure that they are producing a measurable improvement.
diff --git a/share/doc/papers/kerntune/2.t b/share/doc/papers/kerntune/2.t
new file mode 100644
index 000000000000..1a90ceb2a7ee
--- /dev/null
+++ b/share/doc/papers/kerntune/2.t
@@ -0,0 +1,228 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH The \fIgprof\fP Profiler
+.NH 1
+The \fIgprof\fP Profiler
+.PP
+The purpose of the \fIgprof\fP profiling tool is to
+help the user evaluate alternative implementations
+of abstractions.
+The \fIgprof\fP design takes advantage of the fact that the kernel
+though large, is structured and hierarchical.
+We provide a profile in which the execution time
+for a set of routines that implement an
+abstraction is collected and charged
+to that abstraction.
+The profile can be used to compare and assess the costs of
+various implementations [Graham82] [Graham83].
+.NH 2
+Data presentation
+.PP
+The data is presented to the user in two different formats.
+The first presentation simply lists the routines
+without regard to the amount of time their descendants use.
+The second presentation incorporates the call graph of the
+kernel.
+.NH 3
+The Flat Profile
+.PP
+The flat profile consists of a list of all the routines
+that are called during execution of the kernel,
+with the count of the number of times they are called
+and the number of seconds of execution time for which they
+are themselves accountable.
+The routines are listed in decreasing order of execution time.
+A list of the routines that are never called during execution of
+the kernel is also available
+to verify that nothing important is omitted by
+this profiling run.
+The flat profile gives a quick overview of the routines that are used,
+and shows the routines that are themselves responsible
+for large fractions of the execution time.
+In practice,
+this profile usually shows that no single function
+is overwhelmingly responsible for
+the total time of the kernel.
+Notice that for this profile,
+the individual times sum to the total execution time.
+.NH 3
+The Call Graph Profile
+.PP
+Ideally, we would like to print the call graph of the kernel,
+but we are limited by the two-dimensional nature of our output
+devices.
+We cannot assume that a call graph is planar,
+and even if it is, that we can print a planar version of it.
+Instead, we choose to list each routine,
+together with information about
+the routines that are its direct parents and children.
+This listing presents a window into the call graph.
+Based on our experience,
+both parent information and child information
+is important,
+and should be available without searching
+through the output.
+Figure 1 shows a sample \fIgprof\fP entry.
+.KF
+.DS L
+.TS
+box center;
+c c c c c l l
+c c c c c l l
+c c c c c l l
+l n n n c l l.
+ called/total \ \ parents
+index %time self descendants called+self name index
+ called/total \ \ children
+_
+ 0.20 1.20 4/10 \ \ \s-1CALLER1\s+1 [7]
+ 0.30 1.80 6/10 \ \ \s-1CALLER2\s+1 [1]
+[2] 41.5 0.50 3.00 10+4 \s-1EXAMPLE\s+1 [2]
+ 1.50 1.00 20/40 \ \ \s-1SUB1\s+1 <cycle1> [4]
+ 0.00 0.50 1/5 \ \ \s-1SUB2\s+1 [9]
+ 0.00 0.00 0/5 \ \ \s-1SUB3\s+1 [11]
+.TE
+.ce
+Figure 1. Profile entry for \s-1EXAMPLE\s+1.
+.DE
+.KE
+.PP
+The major entries of the call graph profile are the entries from the
+flat profile, augmented by the time propagated to each
+routine from its descendants.
+This profile is sorted by the sum of the time for the routine
+itself plus the time inherited from its descendants.
+The profile shows which of the higher level routines
+spend large portions of the total execution time
+in the routines that they call.
+For each routine, we show the amount of time passed by each child
+to the routine, which includes time for the child itself
+and for the descendants of the child
+(and thus the descendants of the routine).
+We also show the percentage these times represent of the total time
+accounted to the child.
+Similarly, the parents of each routine are listed,
+along with time,
+and percentage of total routine time,
+propagated to each one.
+.PP
+Cycles are handled as single entities.
+The cycle as a whole is shown as though it were a single routine,
+except that members of the cycle are listed in place of the children.
+Although the number of calls of each member
+from within the cycle are shown,
+they do not affect time propagation.
+When a child is a member of a cycle,
+the time shown is the appropriate fraction of the time
+for the whole cycle.
+Self-recursive routines have their calls broken
+down into calls from the outside and self-recursive calls.
+Only the outside calls affect the propagation of time.
+.PP
+The example shown in Figure 2 is the fragment of a call graph
+corresponding to the entry in the call graph profile listing
+shown in Figure 1.
+.KF
+.DS L
+.so fig2.pic
+.ce
+Figure 2. Example call graph fragment.
+.DE
+.KE
+.PP
+The entry is for routine \s-1EXAMPLE\s+1, which has
+the Caller routines as its parents,
+and the Sub routines as its children.
+The reader should keep in mind that all information
+is given \fIwith respect to \s-1EXAMPLE\s+1\fP.
+The index in the first column shows that \s-1EXAMPLE\s+1
+is the second entry in the profile listing.
+The \s-1EXAMPLE\s+1 routine is called ten times, four times by \s-1CALLER1\s+1,
+and six times by \s-1CALLER2\s+1.
+Consequently 40% of \s-1EXAMPLE\s+1's time is propagated to \s-1CALLER1\s+1,
+and 60% of \s-1EXAMPLE\s+1's time is propagated to \s-1CALLER2\s+1.
+The self and descendant fields of the parents
+show the amount of self and descendant time \s-1EXAMPLE\s+1
+propagates to them (but not the time used by
+the parents directly).
+Note that \s-1EXAMPLE\s+1 calls itself recursively four times.
+The routine \s-1EXAMPLE\s+1 calls routine \s-1SUB1\s+1 twenty times, \s-1SUB2\s+1 once,
+and never calls \s-1SUB3\s+1.
+Since \s-1SUB2\s+1 is called a total of five times,
+20% of its self and descendant time is propagated to \s-1EXAMPLE\s+1's
+descendant time field.
+Because \s-1SUB1\s+1 is a member of \fIcycle 1\fR,
+the self and descendant times
+and call count fraction
+are those for the cycle as a whole.
+Since cycle 1 is called a total of forty times
+(not counting calls among members of the cycle),
+it propagates 50% of the cycle's self and descendant
+time to \s-1EXAMPLE\s+1's descendant time field.
+Finally each name is followed by an index that shows
+where on the listing to find the entry for that routine.
+.NH 2
+Profiling the Kernel
+.PP
+It is simple to build a 4.2BSD kernel that will automatically
+collect profiling information as it operates simply by specifying the
+.B \-p
+option to \fIconfig\fP\|(8) when configuring a kernel.
+The program counter sampling can be driven by the system clock,
+or by an alternate real time clock.
+The latter is highly recommended as use of the system clock results
+in statistical anomalies in accounting for
+the time spent in the kernel clock routine.
+.PP
+Once a profiling system has been booted, statistics gathering is
+handled by \fIkgmon\fP\|(8).
+\fIKgmon\fP allows profiling to be started and stopped
+and the internal state of the profiling buffers to be dumped.
+\fIKgmon\fP can also be used to reset the state of the internal
+buffers to allow multiple experiments to be run without
+rebooting the machine.
+The profiling data can then be processed with \fIgprof\fP\|(1)
+to obtain information regarding the system's operation.
+.PP
+A profiled system is about 5-10% larger in its text space because of
+the calls to count the subroutine invocations.
+When the system executes,
+the profiling data is stored in a buffer that is 1.2
+times the size of the text space.
+All the information is summarized in memory;
+it is not necessary to have a trace file
+being continuously dumped to disk.
+The overhead for running a profiled system varies;
+under normal load we see anywhere from 5-25%
+of the system time spent in the profiling code.
+Thus the system is noticeably slower than an unprofiled system,
+yet is not so bad that it cannot be used in a production environment.
+This is important since it allows us to gather data
+in a real environment rather than trying to
+devise synthetic work loads.
diff --git a/share/doc/papers/kerntune/3.t b/share/doc/papers/kerntune/3.t
new file mode 100644
index 000000000000..59487e29b604
--- /dev/null
+++ b/share/doc/papers/kerntune/3.t
@@ -0,0 +1,284 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Techniques for Improving Performance
+.NH 1
+Techniques for Improving Performance
+.PP
+This section gives several hints on general optimization techniques.
+It then proceeds with an example of how they can be
+applied to the 4.2BSD kernel to improve its performance.
+.NH 2
+Using the Profiler
+.PP
+The profiler is a useful tool for improving
+a set of routines that implement an abstraction.
+It can be helpful in identifying poorly coded routines,
+and in evaluating the new algorithms and code that replace them.
+Taking full advantage of the profiler
+requires a careful examination of the call graph profile,
+and a thorough knowledge of the abstractions underlying
+the kernel.
+.PP
+The easiest optimization that can be performed
+is a small change
+to a control construct or data structure.
+An obvious starting point
+is to expand a small frequently called routine inline.
+The drawback to inline expansion is that the data abstractions
+in the kernel may become less parameterized,
+hence less clearly defined.
+The profiling will also become less useful since the loss of
+routines will make its output more granular.
+.PP
+Further potential for optimization lies in routines that
+implement data abstractions whose total execution
+time is long.
+If the data abstraction function cannot easily be speeded up,
+it may be advantageous to cache its results,
+and eliminate the need to rerun
+it for identical inputs.
+These and other ideas for program improvement are discussed in
+[Bentley81].
+.PP
+This tool is best used in an iterative approach:
+profiling the kernel,
+eliminating one bottleneck,
+then finding some other part of the kernel
+that begins to dominate execution time.
+.PP
+A completely different use of the profiler is to analyze the control
+flow of an unfamiliar section of the kernel.
+By running an example that exercises the unfamiliar section of the kernel,
+and then using \fIgprof\fR, you can get a view of the
+control structure of the unfamiliar section.
+.NH 2
+An Example of Tuning
+.PP
+The first step is to come up with a method for generating
+profile data.
+We prefer to run a profiling system for about a one day
+period on one of our general timesharing machines.
+While this is not as reproducible as a synthetic workload,
+it certainly represents a realistic test.
+We have run one day profiles on several
+occasions over a three month period.
+Despite the long period of time that elapsed
+between the test runs the shape of the profiles,
+as measured by the number of times each system call
+entry point was called, were remarkably similar.
+.PP
+A second alternative is to write a small benchmark
+program to repeatedly exercise a suspected bottleneck.
+While these benchmarks are not useful as a long term profile
+they can give quick feedback on whether a hypothesized
+improvement is really having an effect.
+It is important to realize that the only real assurance
+that a change has a beneficial effect is through
+long term measurements of general timesharing.
+We have numerous examples where a benchmark program
+suggests vast improvements while the change
+in the long term system performance is negligible,
+and conversely examples in which the benchmark program runs more slowly,
+but the long term system performance improves significantly.
+.PP
+An investigation of our long term profiling showed that
+the single most expensive function performed by the kernel
+is path name translation.
+We find that our general time sharing systems do about
+500,000 name translations per day.
+The cost of doing name translation in the original 4.2BSD
+is 24.2 milliseconds,
+representing 40% of the time processing system calls,
+which is 19% of the total cycles in the kernel,
+or 11% of all cycles executed on the machine.
+The times are shown in Figure 3.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 14.3 ms/call 11.3%
+child 9.9 ms/call 7.9%
+_
+total 24.2 ms/call 19.2%
+.TE
+.ce
+Figure 3. Call times for \fInamei\fP.
+.DE
+.KE
+.PP
+The system measurements collected showed the
+pathname translation routine, \fInamei\fP,
+was clearly worth optimizing.
+An inspection of \fInamei\fP shows that
+it consists of two nested loops.
+The outer loop is traversed once per pathname component.
+The inner loop performs a linear search through a directory looking
+for a particular pathname component.
+.PP
+Our first idea was to observe that many programs
+step through a directory performing an operation on
+each entry in turn.
+This caused us to modify \fInamei\fP to cache
+the directory offset of the last pathname
+component looked up by a process.
+The cached offset is then used
+as the point at which a search in the same directory
+begins. Changing directories invalidates the cache, as
+does modifying the directory.
+For programs that step sequentially through a directory with
+$N$ files, search time decreases from $O ( N sup 2 )$
+to $O(N)$.
+.PP
+The cost of the cache is about 20 lines of code
+(about 0.2 kilobytes)
+and 16 bytes per process, with the cached data
+stored in a process's \fIuser\fP vector.
+.PP
+As a quick benchmark to verify the effectiveness of the
+cache we ran ``ls \-l''
+on a directory containing 600 files.
+Before the per-process cache this command
+used 22.3 seconds of system time.
+After adding the cache the program used the same amount
+of user time, but the system time dropped to 3.3 seconds.
+.PP
+This change prompted our rerunning a profiled system
+on a machine containing the new \fInamei\fP.
+The results showed that the time in \fInamei\fP
+dropped by only 2.6 ms/call and
+still accounted for 36% of the system call time,
+18% of the kernel, or about 10% of all the machine cycles.
+This amounted to a drop in system time from 57% to about 55%.
+The results are shown in Figure 4.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 11.0 ms/call 9.2%
+child 10.6 ms/call 8.9%
+_
+total 21.6 ms/call 18.1%
+.TE
+.ce
+Figure 4. Call times for \fInamei\fP with per-process cache.
+.DE
+.KE
+.PP
+The small performance improvement
+was caused by a low cache hit ratio.
+Although the cache was 90% effective when hit,
+it was only usable on about 25% of the names being translated.
+An additional reason for the small improvement was that
+although the amount of time spent in \fInamei\fP itself
+decreased substantially,
+more time was spent in the routines that it called
+since each directory had to be accessed twice;
+once to search from the middle to the end,
+and once to search from the beginning to the middle.
+.PP
+Most missed names were caused by path name components
+other than the last.
+Thus Robert Elz introduced a system wide cache of most recent
+name translations.
+The cache is keyed on a name and the
+inode and device number of the directory that contains it.
+Associated with each entry is a pointer to the corresponding
+entry in the inode table.
+This has the effect of short circuiting the outer loop of \fInamei\fP.
+For each path name component,
+\fInamei\fP first looks in its cache of recent translations
+for the needed name.
+If it exists, the directory search can be completely eliminated.
+If the name is not recognized,
+then the per-process cache may still be useful in
+reducing the directory search time.
+The two cacheing schemes complement each other well.
+.PP
+The cost of the name cache is about 200 lines of code
+(about 1.2 kilobytes)
+and 44 bytes per cache entry.
+Depending on the size of the system,
+about 200 to 1000 entries will normally be configured,
+using 10-44 kilobytes of physical memory.
+The name cache is resident in memory at all times.
+.PP
+After adding the system wide name cache we reran ``ls \-l''
+on the same directory.
+The user time remained the same,
+however the system time rose slightly to 3.7 seconds.
+This was not surprising as \fInamei\fP
+now had to maintain the cache,
+but was never able to make any use of it.
+.PP
+Another profiled system was created and measurements
+were collected over a one day period. These measurements
+showed a 6 ms/call decrease in \fInamei\fP, with
+\fInamei\fP accounting for only 31% of the system call time,
+16% of the time in the kernel,
+or about 7% of all the machine cycles.
+System time dropped from 55% to about 49%.
+The results are shown in Figure 5.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 9.5 ms/call 9.6%
+child 6.1 ms/call 6.1%
+_
+total 15.6 ms/call 15.7%
+.TE
+.ce
+Figure 5. Call times for \fInamei\fP with both caches.
+.DE
+.KE
+.PP
+Statistics on the performance of both caches show
+the large performance improvement is
+caused by the high hit ratio.
+On the profiled system a 60% hit rate was observed in
+the system wide cache. This, coupled with the 25%
+hit rate in the per-process offset cache yielded an
+effective cache hit rate of 85%.
+While the system wide cache reduces both the amount of time in
+the routines that \fInamei\fP calls as well as \fInamei\fP itself
+(since fewer directories need to be accessed or searched),
+it is interesting to note that the actual percentage of system
+time spent in \fInamei\fP itself increases even though the
+actual time per call decreases.
+This is because less total time is being spent in the kernel,
+hence a smaller absolute time becomes a larger total percentage.
diff --git a/share/doc/papers/kerntune/4.t b/share/doc/papers/kerntune/4.t
new file mode 100644
index 000000000000..fd813bc0870c
--- /dev/null
+++ b/share/doc/papers/kerntune/4.t
@@ -0,0 +1,93 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Conclusions
+.NH 1
+Conclusions
+.PP
+We have created a profiler that aids in the evaluation
+of the kernel.
+For each routine in the kernel,
+the profile shows the extent to which that routine
+helps support various abstractions,
+and how that routine uses other abstractions.
+The profile assesses the cost of routines
+at all levels of the kernel decomposition.
+The profiler is easily used,
+and can be compiled into the kernel.
+It adds only five to thirty percent execution overhead to the kernel
+being profiled,
+produces no additional output while the kernel is running
+and allows the kernel to be measured in its real environment.
+Kernel profiles can be used to identify bottlenecks in performance.
+We have shown how to improve performance
+by caching recently calculated name translations.
+The combined caches added to the name translation process
+reduce the average cost of translating a pathname to an inode by 35%.
+These changes reduce the percentage of time spent running
+in the system by nearly 9%.
+.nr H2 1
+.ds RH Acknowledgements
+.NH
+\s+2Acknowledgements\s0
+.PP
+I would like to thank Robert Elz for sharing his ideas and
+his code for cacheing system wide names.
+Thanks also to all the users at Berkeley who provided all the
+input to generate the kernel profiles.
+This work was supported by
+the Defense Advance Research Projects Agency (DoD) under
+Arpa Order No. 4031 monitored by Naval Electronic System Command under
+Contract No. N00039-82-C-0235.
+.ds RH References
+.nr H2 1
+.sp 2
+.NH
+\s+2References\s-2
+.LP
+.IP [Bentley81] 20
+Bentley, J. L.,
+``Writing Efficient Code'',
+Department of Computer Science,
+Carnegie-Mellon University,
+Pittsburgh, Pennsylvania,
+CMU-CS-81-116, 1981.
+.IP [Graham82] 20
+Graham, S., Kessler, P., McKusick, M.,
+``gprof: A Call Graph Execution Profiler'',
+Proceedings of the SIGPLAN '82 Symposium on Compiler Construction,
+Volume 17, Number 6, June 1982. pp 120-126
+.IP [Graham83] 20
+Graham, S., Kessler, P., McKusick, M.,
+``An Execution Profiler for Modular Programs'',
+Software - Practice and Experience,
+Volume 13, 1983. pp 671-685
+.IP [Ritchie74] 20
+Ritchie, D. M. and Thompson, K.,
+``The UNIX Time-Sharing System'',
+CACM 17, 7. July 1974. pp 365-375
diff --git a/share/doc/papers/kerntune/Makefile b/share/doc/papers/kerntune/Makefile
new file mode 100644
index 000000000000..6dbc8c67ab2b
--- /dev/null
+++ b/share/doc/papers/kerntune/Makefile
@@ -0,0 +1,11 @@
+VOLUME= papers
+DOC= kerntune
+SRCS= 0.t 1.t 2.t 3.t 4.t
+EXTRA= fig2.pic
+MACROS= -ms
+USE_EQN=
+USE_PIC=
+USE_SOELIM=
+USE_TBL=
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/kerntune/fig2.pic b/share/doc/papers/kerntune/fig2.pic
new file mode 100644
index 000000000000..5fe73fdebe5d
--- /dev/null
+++ b/share/doc/papers/kerntune/fig2.pic
@@ -0,0 +1,51 @@
+.\" Copyright (c) 1987 M. K. McKusick
+.\" Copyright (c) 1987 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.PS
+ellipse ht .3i wid .75i "\s-1CALLER1\s+1"
+ellipse ht .3i wid .75i "\s-1CALLER2\s+1" at 1st ellipse + (2i,0i)
+ellipse ht .3i wid .8i "\s-1EXAMPLE\s+1" at 1st ellipse + (1i,-.5i)
+ellipse ht .3i wid .5i "\s-1SUB1\s+1" at 1st ellipse - (0i,1i)
+ellipse ht .3i wid .5i "\s-1SUB2\s+1" at 3rd ellipse - (0i,.5i)
+ellipse ht .3i wid .5i "\s-1SUB3\s+1" at 2nd ellipse - (0i,1i)
+line <- from 1st ellipse up .5i left .5i chop .1875i
+line <- from 1st ellipse up .5i right .5i chop .1875i
+line <- from 2nd ellipse up .5i left .5i chop .1875i
+line <- from 2nd ellipse up .5i right .5i chop .1875i
+arrow from 1st ellipse to 3rd ellipse chop
+arrow from 2nd ellipse to 3rd ellipse chop
+arrow from 3rd ellipse to 4th ellipse chop
+arrow from 3rd ellipse to 5th ellipse chop .15i chop .15i
+arrow from 3rd ellipse to 6th ellipse chop
+arrow from 4th ellipse down .5i left .5i chop .1875i
+arrow from 4th ellipse down .5i right .5i chop .1875i
+arrow from 5th ellipse down .5i left .5i chop .1875i
+arrow from 5th ellipse down .5i right .5i chop .1875i
+arrow from 6th ellipse down .5i left .5i chop .1875i
+arrow from 6th ellipse down .5i right .5i chop .1875i
+.PE
diff --git a/share/doc/papers/malloc/Makefile b/share/doc/papers/malloc/Makefile
new file mode 100644
index 000000000000..755264466aa3
--- /dev/null
+++ b/share/doc/papers/malloc/Makefile
@@ -0,0 +1,7 @@
+VOLUME= papers
+DOC= malloc
+SRCS= abs.ms intro.ms kernel.ms malloc.ms problems.ms alternatives.ms \
+ performance.ms implementation.ms conclusion.ms
+MACROS= -ms
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/malloc/abs.ms b/share/doc/papers/malloc/abs.ms
new file mode 100644
index 000000000000..ef70cec772af
--- /dev/null
+++ b/share/doc/papers/malloc/abs.ms
@@ -0,0 +1,33 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.if n .ND
+.TL
+Malloc(3) in modern Virtual Memory environments.
+.sp
+Revised
+Fri Apr 5 12:50:07 1996
+.AU
+Poul-Henning Kamp
+.AI
+<phk@FreeBSD.org>
+Den Andensidste Viking
+Valbygaardsvej 8
+DK-4200 Slagelse
+Denmark
+.AB
+Malloc/free is one of the oldest parts of the C language environment
+and obviously the world has changed a bit since it was first made.
+The fact that most UNIX kernels have changed from swap/segment to
+virtual memory/page based memory management has not been sufficiently
+reflected in the implementations of the malloc/free API.
+.PP
+A new implementation was designed, written, tested and bench-marked
+with an eye on the workings and performance characteristics of modern
+Virtual Memory systems.
+.AE
diff --git a/share/doc/papers/malloc/alternatives.ms b/share/doc/papers/malloc/alternatives.ms
new file mode 100644
index 000000000000..02efa1f88901
--- /dev/null
+++ b/share/doc/papers/malloc/alternatives.ms
@@ -0,0 +1,43 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.ds RH Alternative implementations
+.NH
+Alternative implementations
+.PP
+These problems were actually the inspiration for the first alternative
+malloc implementations.
+Since their main aim was debugging, they would often use techniques
+like allocating a guard zone before and after the chunk,
+and possibly filling these guard zones
+with some pattern, so accesses outside the allocated chunk could be detected
+with some decent probability.
+Another widely used technique is to use tables to keep track of which
+chunks are actually in which state and so on.
+.PP
+This class of debugging has been taken to its practical extreme by
+the product "Purify" which does the entire memory-coloring exercise
+and not only keeps track of what is in use and what isn't, but also
+detects if the first reference is a read (which would return undefined
+values) and other such violations.
+.PP
+Later actual complete implementations of malloc arrived, but many of
+these still based their workings on the basic schema mentioned previously,
+disregarding that in the meantime virtual memory and paging have
+become the standard environment.
+.PP
+The most widely used "alternative" malloc is undoubtedly ``gnumalloc''
+which has received wide acclaim and certainly runs faster than
+most stock mallocs. It does, however, tend to fare badly in
+cases where paging is the norm rather than the exception.
+.PP
+The particular malloc that prompted this work basically didn't bother
+reusing storage until the kernel forced it to do so by refusing
+further allocations with sbrk(2).
+That may make sense if you work alone on your own personal mainframe,
+but as a general policy it is less than optimal.
diff --git a/share/doc/papers/malloc/conclusion.ms b/share/doc/papers/malloc/conclusion.ms
new file mode 100644
index 000000000000..9d0d1f2a83a6
--- /dev/null
+++ b/share/doc/papers/malloc/conclusion.ms
@@ -0,0 +1,46 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.ds RH Conclusion and experience.
+.NH
+Conclusion and experience.
+.PP
+In general the performance differences between gnumalloc and this
+malloc are not that big.
+The major difference comes when primary storage is seriously
+over-committed, in which case gnumalloc
+wastes time paging in pages it's not going to use.
+In such cases as much as a factor of five in wall-clock time has
+been seen in difference.
+Apart from that gnumalloc and this implementation are pretty
+much head-on performance-wise.
+.PP
+Several legacy programs in the BSD 4.4 Lite distribution had
+code that depended on the memory returned from malloc
+being zeroed. In a couple of cases, free(3) was called more than
+once for the same allocation, and a few cases even called free(3)
+with pointers to objects in the data section or on the stack.
+.PP
+A couple of users have reported that using this malloc on other
+platforms yielded "pretty impressive results", but no hard benchmarks
+have been made.
+.ds RH Acknowledgements & references.
+.NH
+Acknowledgements & references.
+.PP
+The first implementation of this algorithm was actually a file system,
+done in assembler using 5-hole ``Baudot'' paper tape for a drum storage
+device attached to a 20 bit germanium transistor computer with 2000 words
+of memory, but that was many years ago.
+.PP
+Peter Wemm <peter@FreeBSD.org> came up with the idea to store the
+page-directory in mmap(2)'ed memory instead of in the heap.
+This has proven to be a good move.
+.PP
+Lars Fredriksen <fredriks@mcs.com> found and identified a
+fence-post bug in the code.
diff --git a/share/doc/papers/malloc/implementation.ms b/share/doc/papers/malloc/implementation.ms
new file mode 100644
index 000000000000..f9b547be18f5
--- /dev/null
+++ b/share/doc/papers/malloc/implementation.ms
@@ -0,0 +1,223 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.ds RH Implementation
+.NH
+Implementation
+.PP
+A new malloc(3) implementation was written to meet the goals,
+and to the extent possible to address the shortcomings listed previously.
+.PP
+The source is 1218 lines of C code, and can be found in FreeBSD 2.2
+(and probably later versions as well) as src/lib/libc/stdlib/malloc.c.
+.PP
+The main data structure is the
+.I page-directory
+which contains a
+.B void*
+for each page we have control over.
+The value can be one of:
+.IP
+.B MALLOC_NOT_MINE
+Another part of the code may call brk(2) to get a piece of the cake.
+Consequently, we cannot rely on the memory we get from the kernel
+being one consecutive piece of memory, and therefore we need a way to
+mark such pages as "untouchable".
+.IP
+.B MALLOC_FREE
+This is a free page.
+.IP
+.B MALLOC_FIRST
+This is the first page in a (multi-)page allocation.
+.IP
+.B MALLOC_FOLLOW
+This is a subsequent page in a multi-page allocation.
+.IP
+.B
+struct pginfo*
+.R
+A pointer to a structure describing a partitioned page.
+.PP
+In addition, there exists a linked list of small data structures that
+describe the free space as runs of free pages.
+.PP
+Notice that these structures are not part of the free pages themselves,
+but rather allocated with malloc so that the free pages themselves
+are never referenced while they are free.
+.PP
+When a request for storage comes in, it will be treated as a ``page''
+allocation if it is bigger than half a page.
+The free list will be searched and the first run of free pages that
+can satisfy the request is used. The first page gets set to
+.B MALLOC_FIRST
+status. If more than that one page is needed, the rest of them get
+.B MALLOC_FOLLOW
+status in the page-directory.
+.PP
+If there were no pages on the free list, brk(2) will be called, and
+the pages will get added to the page-directory with status
+.B MALLOC_FREE
+and the search restarts.
+.PP
+Freeing a number of pages is done by changing their state in the
+page directory to MALLOC_FREE, and then traversing the free-pages list to
+find the right place for this run of pages, possibly collapsing
+with the two neighboring runs into one run and, if possible,
+releasing some memory back to the kernel by calling brk(2).
+.PP
+If the request is less than or equal to half of a page, its size will be
+rounded up to the nearest power of two before being processed
+and if the request is less than some minimum size, it is rounded up to
+that size.
+.PP
+These sub-page allocations are served from pages which are split up
+into some number of equal size chunks.
+For each of these pages a
+.B
+struct pginfo
+.R
+describes the size of the chunks on this page, how many there are,
+how many are free and so on.
+The description consists of a bitmap of used chunks, and various counters
+and numbers used to keep track of the stuff in the page.
+.PP
+For each size of sub-page allocation, the pginfo structures for the
+pages that have free chunks in them form a list.
+The heads of these lists are stored in predetermined slots at
+the beginning of the page directory to make access fast.
+.PP
+To allocate a chunk of some size, the head of the list for the
+corresponding size is examined, and a free chunk found. The number
+of free chunks on that page is decreased by one and, if zero, the
+pginfo structure is unlinked from the list.
+.PP
+To free a chunk, the page is derived from the pointer, the page table
+for that page contains a pointer to the pginfo structure, where the
+free bit is set for the chunk, the number of free chunks increased by
+one, and if equal to one, the pginfo structure is linked into the
+proper place on the list for this size of chunks.
+If the count increases to match the number of chunks on the page, the
+pginfo structure is unlinked from the list and free(3)'ed and the
+actual page itself is free(3)'ed too.
+.PP
+To be 100% correct performance-wise these lists should be ordered
+according to the recent number of accesses to that page. This
+information is not available and it would essentially mean a reordering
+of the list on every memory reference to keep it up-to-date.
+Instead they are ordered according to the address of the pages.
+Interestingly enough, in practice this comes out to almost the same
+thing performance-wise.
+.PP
+It's not that surprising after all, it's the difference between
+following the crowd or actively directing where it can go, in both
+ways you can end up in the middle of it all.
+.PP
+The side effect of this compromise is that it also uses less storage,
+and the list never has to be reordered, all the ordering happens when
+pages are added or deleted.
+.PP
+It is an interesting twist to the implementation that the
+.B
+struct pginfo
+.R
+is allocated with malloc.
+That is, "as with malloc" to be painfully correct.
+The code knows the special case where the first (couple) of allocations on
+the page is actually the pginfo structure and deals with it accordingly.
+This avoids some silly "chicken and egg" issues.
+.ds RH Bells and whistles.
+.NH
+Bells and whistles.
+.PP
+brk(2) is actually not a very fast system call when you ask for storage.
+This is mainly because of the need by the kernel to zero the pages before
+handing them over, so therefore this implementation does not release
+heap pages until there is a large chunk to release back to the kernel.
+Chances are pretty good that we will need it again pretty soon anyway.
+Since these pages are not accessed at all, they will soon be paged out
+and don't affect anything but swap-space usage.
+.PP
+The page directory is actually kept in a mmap(2)'ed piece of
+anonymous memory. This avoids some rather silly cases that
+would otherwise have to be handled when the page directory
+has to be extended.
+.PP
+One particularly nice feature is that all pointers passed to free(3)
+and realloc(3) can be checked conclusively for validity:
+First the pointer is masked to find the page. The page directory
+is then examined, it must contain either MALLOC_FIRST, in which
+case the pointer must point exactly at the page, or it can contain
+a struct pginfo*, in which case the pointer must point to one of
+the chunks described by that structure.
+Warnings will be printed on
+.B stderr
+and nothing will be done with
+the pointer if it is found to be invalid.
+.PP
+An environment variable
+.B MALLOC_OPTIONS
+allows the user some control over the behavior of malloc.
+Some of the more interesting options are:
+.IP
+.B Abort
+If malloc fails to allocate storage, core-dump the process with
+a message rather than expect it to handle this correctly.
+It's amazing how few programs actually handle this condition correctly,
+and consequently the havoc they can create is the more creative or
+destructive.
+.IP
+.B Dump
+Writes malloc statistics to a file called ``malloc.out'' prior
+to process termination.
+.IP
+.B Hint
+Pass a hint to the kernel about pages we no longer need through the
+madvise(2) system call. This can help performance on machines that
+page heavily by eliminating unnecessary page-ins and page-outs of
+unused data.
+.IP
+.B Realloc
+Always do a free and malloc when realloc(3) is called.
+For programs doing garbage collection using realloc(3), this makes the
+heap collapse faster since malloc will reallocate from the
+lowest available address.
+The default
+is to leave things alone if the size of the allocation is still in
+the same size-class.
+.IP
+.B Junk
+will explicitly fill the allocated area with a particular value
+to try to detect if programs rely on it being zero.
+.IP
+.B Zero
+will explicitly zero out the allocated chunk of memory, while any
+space after the allocation in the chunk will be filled with the
+junk value to try to catch out of the chunk references.
+.ds RH The road not taken.
+.NH
+The road not yet taken.
+.PP
+A couple of avenues were explored that could be interesting in some
+set of circumstances.
+.PP
+Using mmap(2) instead of brk(2) was actually slower, since brk(2)
+knows a lot of the things that mmap has to find out first.
+.PP
+In general there is little room for further improvement of the
+time-overhead of the malloc, further improvements will have to
+be in the area of improving paging behavior.
+.PP
+It is still under consideration to add a feature such that
+if realloc is called with two zero arguments, the internal
+allocations will be reallocated to perform a garbage collect.
+This could be used in certain types of programs to collapse
+the memory use, but so far it doesn't seem to be worth the effort.
+.PP
+Malloc/Free can be a significant point of contention in multi-threaded
+programs. Low-grain locking of the data-structures inside the
+implementation should be implemented to avoid excessive spin-waiting.
diff --git a/share/doc/papers/malloc/intro.ms b/share/doc/papers/malloc/intro.ms
new file mode 100644
index 000000000000..59064ee166f1
--- /dev/null
+++ b/share/doc/papers/malloc/intro.ms
@@ -0,0 +1,72 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.ds RH Introduction
+.NH
+Introduction
+.PP
+Most programs need to allocate storage dynamically in addition
+to whatever static storage the compiler reserved at compile-time.
+To C programmers this fact is rather obvious, but for many years
+this was not an accepted and recognized fact, and many languages
+still used today don't support this notion adequately.
+.PP
+The classic UNIX kernel provides two very simple and powerful
+mechanisms for obtaining dynamic storage, the execution stack
+and the heap.
+The stack is usually put at the far upper end of the address-space,
+from where it grows down as far as needed, though this may depend on
+the CPU design.
+The heap starts at the end of the
+.B bss
+segment and grows upwards as needed.
+.PP
+There isn't really a kernel-interface to the stack as such.
+The kernel will allocate some amount of memory for it,
+not even telling the process the exact size.
+If the process needs more space than that, it will simply try to access
+it, hoping that the kernel will detect that an access has been
+attempted outside the allocated memory, and try to extend it.
+If the kernel fails to extend the stack, this could be because of lack
+of resources or permissions or because it may just be impossible
+to do in the first place, the process will usually be shot down by the
+kernel.
+.PP
+In the C language, there exists a little used interface to the stack,
+.B alloca(3) ,
+which will explicitly allocate space on the stack.
+This is not an interface to the kernel, but merely an adjustment
+done to the stack-pointer such that space will be available and
+unharmed by any subroutine calls yet to be made while the context
+of the current subroutine is intact.
+.PP
+Due to the nature of normal use of the stack, there is no corresponding
+"free" operator, but instead the space is returned when the current
+function returns to its caller and the stack frame is dismantled.
+This is the cause of much grief, and probably the single most important
+reason that alloca(3) is not, and should not be, used widely.
+.PP
+The heap on the other hand has an explicit kernel-interface in the
+system call
+.B brk(2) .
+The argument to brk(2) is a pointer to where the process wants the
+heap to end.
+There is also an interface called
+.B sbrk(2)
+taking an increment to the current end of the heap, but this is merely a
+.B libc
+front for brk(2).
+.PP
+In addition to these two memory resources, modern virtual memory kernels
+provide the mmap(2)/munmap(2) interface which allows almost complete
+control over any bit of virtual memory in the process address space.
+.PP
+Because of the generality of the mmap(2) interface and the way the
+data structures representing the regions are laid out, sbrk(2) is actually
+faster in use than the equivalent mmap(2) call, simply because
+mmap(2) has to search for information that is implicit in the sbrk(2) call.
diff --git a/share/doc/papers/malloc/kernel.ms b/share/doc/papers/malloc/kernel.ms
new file mode 100644
index 000000000000..3672065ddef4
--- /dev/null
+++ b/share/doc/papers/malloc/kernel.ms
@@ -0,0 +1,54 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.ds RH The kernel and memory
+.NH
+The kernel and memory
+.PP
+Brk(2) isn't a particularly convenient interface,
+it was probably made more to fit the memory model of the
+hardware being used, than to fill the needs of the programmers.
+.PP
+Before paged and/or virtual memory systems became
+common, the most popular memory management facility used for
+UNIX was segments.
+This was also very often the only vehicle for imposing protection on
+various parts of memory.
+Depending on the hardware, segments can be anything, and consequently
+how the kernels exploited them varied a lot from UNIX to UNIX and from
+machine to machine.
+.PP
+Typically a process would have one segment for the text section, one
+for the data and bss section combined and one for the stack.
+On some systems the text shared a segment with the data and bss, and was
+consequently just as writable as them.
+.PP
+In this setup all the brk(2) system call has to do is to find the
+right amount of free storage, possibly moving things around in physical
+memory, maybe even swapping out a segment or two to make space,
+and change the upper limit on the data segment according to the address given.
+.PP
+In a more modern page based virtual memory implementation this is still
+pretty much the situation, except that the granularity is now pages:
+The kernel finds the right number of free pages, possibly paging some
+pages out to free them up, and then plugs them into the page-table of
+the process.
+.PP
+As such the difference is very small, the real difference is that in
+the old world of swapping, either the entire process was in primary
+storage or it wouldn't be selected to be run. In a modern VM kernel,
+a process might only have a subset of its pages in primary memory,
+the rest will be paged in, if and when the process tries to access them.
+.PP
+Only very few programs deal with the brk(2) interface directly.
+The few that do usually have their own memory management facilities.
+LISP or FORTH interpreters are good examples.
+Most other programs use the
+.B malloc(3)
+interface instead, and leave it to the malloc implementation to
+use brk(2) to get storage allocated from the kernel.
diff --git a/share/doc/papers/malloc/malloc.ms b/share/doc/papers/malloc/malloc.ms
new file mode 100644
index 000000000000..79e5173226b2
--- /dev/null
+++ b/share/doc/papers/malloc/malloc.ms
@@ -0,0 +1,70 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.ds RH Malloc and free
+.NH
+Malloc and free
+.PP
+The job of malloc(3) is to turn the rather simple
+brk(2) facility into a service programs can
+actually use without getting hurt.
+.PP
+The archetypical malloc(3) implementation keeps track of the memory between
+the end of the bss section, as defined by the
+.B _end
+symbol, and the current brk(2) point using a linked list of chunks of memory.
+Each item on the list has a status as either free or used, a pointer
+to the next entry and in most cases to the previous as well, to speed
+up inserts and deletes in the list.
+.PP
+When a malloc(3) request comes in, the list is traversed from the
+front and if a free chunk big enough to hold the request is found,
+it is returned, if the free chunk is bigger than the size requested,
+a new free chunk is made from the excess and put back on the list.
+.PP
+When a chunk is
+.B free(3) 'ed,
+the chunk is found in the list, its status
+is changed to free and if one or both of the surrounding chunks
+are free, they are collapsed to one.
+.PP
+A third kind of request,
+.B realloc(3) ,
+will resize
+a chunk, trying to avoid copying the contents if possible.
+It is seldom used, and has only had a significant impact on performance
+in a few special situations.
+The typical pattern of use is to malloc(3) a chunk of the maximum size
+needed, read in the data and adjust the size of the chunk to match the
+size of the data read using realloc(3).
+.PP
+For reasons of efficiency, the original implementation of malloc(3)
+put the small structure used to contain the next and previous pointers
+plus the state of the chunk right before the chunk itself.
+.PP
+As a matter of fact, the canonical malloc(3) implementation can be
+studied in the ``Old testament'', chapter 8 verse 7 [Kernighan & Ritchie]
+.PP
+Various optimisations can be applied to the above basic algorithm:
+.IP
+If in freeing a chunk, we end up with the last chunk on the list being
+free, we can return that to the kernel by calling brk(2) with the first
+address of that chunk and then make the previous chunk the last on the
+chain by terminating its ``next'' pointer.
+.IP
+A best-fit algorithm can be used instead of first-fit at an expense
+of memory, because statistically fewer chances to brk(2) backwards will
+present themselves.
+.IP
+Splitting the list in two, one for used and one for free chunks, to
+speed the searching.
+.IP
+Putting free chunks on one of several free lists, depending on their size,
+to speed allocation.
+.IP
+\&...
diff --git a/share/doc/papers/malloc/performance.ms b/share/doc/papers/malloc/performance.ms
new file mode 100644
index 000000000000..49e9d6b75517
--- /dev/null
+++ b/share/doc/papers/malloc/performance.ms
@@ -0,0 +1,111 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.ds RH Performance
+.NH
+Performance
+.PP
+Performance for a malloc(3) implementation comes as two variables:
+.IP
+A: How much time does it use for searching and manipulating data structures.
+We will refer to this as ``overhead time''.
+.IP
+B: How well does it manage the storage.
+This rather vague metric we call ``quality of allocation''.
+.PP
+The overhead time is easy to measure, just do a lot of malloc/free calls
+of various kinds and combinations, and compare the results.
+.PP
+The quality of allocation is not quite as simple as that.
+One measure of quality is the size of the process, that should obviously
+be minimized.
+Another measure is the execution time of the process.
+This is not an obvious indicator of quality, but people will generally
+agree that it should be minimized as well, and if malloc(3) can do
+anything to do so, it should.
+An explanation of why it is still a good metric follows:
+.PP
+In a traditional segment/swap kernel, the desirable behavior of a process
+is to keep the brk(2) as low as possible, thus minimizing the size of the
+data/bss/heap segment, which in turn translates to a smaller process and
+a smaller probability of the process being swapped out, qed: faster
+execution time as an average.
+.PP
+In a paging environment this is not a bad choice for a default, but
+a couple of details needs to be looked at much more carefully.
+.PP
+First of all, the size of a process becomes a more vague concept since
+only the pages that are actually used need to be in primary storage
+for execution to progress, and they only need to be there when used.
+That implies that many more processes can fit in the same amount of
+primary storage, since most processes have a high degree of locality
+of reference and thus only need some fraction of their pages to actually
+do their job.
+.PP
+From this it follows that the interesting size of the process is some
+subset of the total amount of virtual memory occupied by the process.
+This number isn't a constant, it varies depending on the whereabouts
+of the process, and it may indeed fluctuate wildly over the lifetime
+of the process.
+.PP
+One of the names for this vague concept is ``current working set''.
+It has been defined many different ways over the years, mostly to
+satisfy and support claims in marketing or benchmark contexts.
+.PP
+For now we can simply say that it is the number of pages the process
+needs in order to run at a sufficiently low paging rate in a congested
+primary storage.
+(If primary storage isn't congested, this is not really important
+of course, but most systems would be better off using the pages for
+disk-cache or similar functions, so from that perspective it will
+always be congested.)
+If the number of pages is too small, the process will wait for its
+pages to be read from secondary storage much of the time, if it's too
+big, the space could be used better for something else.
+.PP
+From the view of any single process, this number of pages is
+"all of my pages", but from the point of view of the OS it should
+be tuned to maximise the total throughput of all the processes on
+the machine at the time.
+This is usually done using various kinds of least-recently-used
+replacement algorithms to select page candidates for replacement.
+.PP
+With this knowledge, can we decide what the performance goal is for
+a modern malloc(3) ?
+Well, it's almost as simple as it used to be:
+.B
+Minimize the number of pages accessed.
+.R
+.PP
+This really is the core of it all.
+If the number of accessed pages is smaller, then locality of reference is
+higher, and all kinds of caches (which is essentially what the
+primary storage is in a VM system) work better.
+.PP
+It's interesting to notice that the classical malloc fails on this one
+because the information about free chunks is kept with the free
+chunks themselves. In some of the benchmarks this came out as all the
+pages being paged in every time a malloc call was made, because malloc
+had to traverse the free list to find a suitable chunk for the allocation.
+If memory is not in use, then you shouldn't access it.
+.PP
+The secondary goal is more evident:
+.B
+Try to work in pages.
+.R
+.PP
+That makes it easier for the kernel, and wastes less virtual memory.
+Most modern implementations do this when they interact with the
+kernel, but few try to avoid objects spanning pages.
+.PP
+If an object's size
+is less than or equal to a page, there is no reason for it to span two pages.
+Having objects span pages means that two pages must be
+paged in, if that object is accessed.
+.PP
+With this analysis in the luggage, we can start coding.
diff --git a/share/doc/papers/malloc/problems.ms b/share/doc/papers/malloc/problems.ms
new file mode 100644
index 000000000000..fa27effce999
--- /dev/null
+++ b/share/doc/papers/malloc/problems.ms
@@ -0,0 +1,52 @@
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.ds RH The problems
+.NH
+The problems
+.PP
+Even though malloc(3) is a lot simpler to use
+than the raw brk(2)/sbrk(2) interface,
+or maybe exactly because
+of that,
+a lot of problems arise from its use.
+.IP
+Writing to memory outside the allocated chunk.
+The most likely result being that the data structure used to hold
+the links and flags about this chunk or the next one gets thrashed.
+.IP
+Freeing a pointer to memory not allocated by malloc.
+This is often a pointer that points to an object on the stack or in the
+data-section, in newer implementations of C it may even be in the text-
+section where it is likely to be readonly.
+Some malloc implementations detect this, some don't.
+.IP
+Freeing a modified pointer. This is a very common mistake, freeing
+not the pointer malloc(3) returned, but rather some offset from it.
+Some mallocs will handle this correctly if the offset is positive.
+.IP
+Freeing the same pointer more than once.
+.IP
+Accessing memory in a chunk after it has been free(3)'ed.
+.PP
+The handling of these problems has traditionally been weak.
+A core-dump was the most common form for "handling", but in rare
+cases one could experience the famous "malloc: corrupt arena."
+message before the core-dump.
+Even worse though, very often the program will just continue,
+possibly giving wrong results.
+.PP
+An entirely different form of problem is that
+the memory returned by malloc(3) can contain any value.
+Unfortunately most kernels, correctly, zero out the storage they
+provide with brk(2), and thus the storage malloc returns will be zeroed
+in many cases as well, so programmers are not particularly apt to notice
+that their code depends on malloc'ed storage being zeroed.
+.PP
+With problems this big and error handling this weak, it is not
+surprising that problems are hard and time consuming to find and fix.
diff --git a/share/doc/papers/newvm/0.t b/share/doc/papers/newvm/0.t
new file mode 100644
index 000000000000..e53b6714f535
--- /dev/null
+++ b/share/doc/papers/newvm/0.t
@@ -0,0 +1,80 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.rm CM
+.TL
+A New Virtual Memory Implementation for Berkeley
+.UX
+.AU
+Marshall Kirk McKusick
+Michael J. Karels
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+With the cost per byte of memory approaching that of the cost per byte
+for disks, and with file systems increasingly distant from the host
+machines, a new approach to the implementation of virtual memory is
+necessary. Rather than preallocating swap space which limits the
+maximum virtual memory that can be supported to the size of the swap
+area, the system should support virtual memory up to the sum of the
+sizes of physical memory plus swap space. For systems with a local swap
+disk, but remote file systems, it may be useful to use some of the memory
+to keep track of the contents of the swap space to avoid multiple fetches
+of the same data from the file system.
+.PP
+The new implementation should also add new functionality. Processes
+should be allowed to have large sparse address spaces, to map files
+into their address spaces, to map device memory into their address
+spaces, and to share memory with other processes. The shared address
+space may either be obtained by mapping a file into (possibly
+different) parts of their address space, or by arranging to share
+``anonymous memory'' (that is, memory that is zero fill on demand, and
+whose contents are lost when the last process unmaps the memory) with
+another process as is done in System V.
+.PP
+One use of shared memory is to provide a high-speed
+Inter-Process Communication (IPC) mechanism between two or more
+cooperating processes. To insure the integrity of data structures
+in a shared region, processes must be able to use semaphores to
+coordinate their access to these shared structures. In System V,
+these semaphores are provided as a set of system calls. Unfortunately,
+the use of system calls reduces the throughput of the shared memory
+IPC to that of existing IPC mechanisms. We are proposing a scheme
+that places the semaphores in the shared memory segment, so that
+machines that have a test-and-set instruction can handle the usual
+uncontested lock and unlock without doing a system call. Only in
+the unusual case of trying to lock an already-locked lock or in
+releasing a wanted lock will a system call be required. The
+interface will allow a user-level implementation of the System V
+semaphore interface on most machines with a much lower runtime cost.
+.AE
+.LP
+.bp
diff --git a/share/doc/papers/newvm/1.t b/share/doc/papers/newvm/1.t
new file mode 100644
index 000000000000..f363f58fe4ae
--- /dev/null
+++ b/share/doc/papers/newvm/1.t
@@ -0,0 +1,371 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.NH
+Motivations for a New Virtual Memory System
+.PP
+The virtual memory system distributed with Berkeley UNIX has served
+its design goals admirably well over the ten years of its existence.
+However the relentless advance of technology has begun to render it
+obsolete.
+This section of the paper describes the current design,
+points out the current technological trends,
+and attempts to define the new design considerations that should
+be taken into account in a new virtual memory design.
+.NH 2
+Implementation of 4.3BSD virtual memory
+.PP
+All Berkeley Software Distributions through 4.3BSD
+have used the same virtual memory design.
+All processes, whether active or sleeping, have some amount of
+virtual address space associated with them.
+This virtual address space
+is the combination of the amount of address space with which they initially
+started plus any stack or heap expansions that they have made.
+All requests for address space are allocated from available swap space
+at the time that they are first made;
+if there is insufficient swap space left to honor the allocation,
+the system call requesting the address space fails synchronously.
+Thus, the limit to available virtual memory is established by the
+amount of swap space allocated to the system.
+.PP
+Memory pages are used in a sort of shell game to contain the
+contents of recently accessed locations.
+As a process first references a location
+a new page is allocated and filled either with initialized data or
+zeros (for new stack and break pages).
+As the supply of free pages begins to run out, dirty pages are
+pushed to the previously allocated swap space so that they can be reused
+to contain newly faulted pages.
+If a previously accessed page that has been pushed to swap is once
+again used, a free page is reallocated and filled from the swap area
+[Babaoglu79], [Someren84].
+.NH 2
+Design assumptions for 4.3BSD virtual memory
+.PP
+The design criteria for the current virtual memory implementation
+were made in 1979.
+At that time the cost of memory was about a thousand times greater per
+byte than magnetic disks.
+Most machines were used as centralized time sharing machines.
+These machines had far more disk storage than they had memory
+and given the cost tradeoff between memory and disk storage,
+wanted to make maximal use of the memory even at the cost of
+wasting some of the disk space or generating extra disk I/O.
+.PP
+The primary motivation for virtual memory was to allow the
+system to run individual programs whose address space exceeded
+the memory capacity of the machine.
+Thus the virtual memory capability allowed programs to be run that
+could not have been run on a swap based system.
+Equally important in the large central timesharing environment
+was the ability to allow the sum of the memory requirements of
+all active processes to exceed the amount of physical memory on
+the machine.
+The expected mode of operation for which the system was tuned
+was to have the sum of active virtual memory be one and a half
+to two times the physical memory on the machine.
+.PP
+At the time that the virtual memory system was designed,
+most machines ran with little or no networking.
+All the file systems were contained on disks that were
+directly connected to the machine.
+Similarly all the disk space devoted to swap space was also
+directly connected.
+Thus the speed and latency with which file systems could be accessed
+were roughly equivalent to the speed and latency with which swap
+space could be accessed.
+Given the high cost of memory there was little incentive to have
+the kernel keep track of the contents of the swap area once a process
+exited since it could almost as easily and quickly be reread from the
+file system.
+.NH 2
+New influences
+.PP
+In the ten years since the current virtual memory system was designed,
+many technological advances have occurred.
+One effect of the technological revolution is that the
+micro-processor has become powerful enough to allow users to have their
+own personal workstations.
+Thus the computing environment is moving away from a purely centralized
+time sharing model to an environment in which users have a
+computer on their desk.
+This workstation is linked through a network to a centralized
+pool of machines that provide filing, computing, and spooling services.
+The workstations tend to have a large quantity of memory,
+but little or no disk space.
+Because users do not want to be bothered with backing up their disks,
+and because of the difficulty of having a centralized administration
+backing up hundreds of small disks, these local disks are typically
+used only for temporary storage and as swap space.
+Long term storage is managed by the central file server.
+.PP
+Another major technical advance has been in all levels of storage capacity.
+In the last ten years we have experienced a factor of four decrease in the
+cost per byte of disk storage.
+In this same period of time the cost per byte of memory has dropped
+by a factor of a hundred!
+Thus the cost per byte of memory compared to the cost per byte of disk is
+approaching a difference of only about a factor of ten.
+The effect of this change is that the way in which a machine is used
+is beginning to change dramatically.
+As the amount of physical memory on machines increases and the number of
+users per machine decreases, the expected
+mode of operation is changing from that of supporting more active virtual
+memory than physical memory to that of having a surplus of memory that can
+be used for other purposes.
+.PP
+Because many machines will have more physical memory than they do swap
+space (with diskless workstations as an extreme example!),
+it is no longer reasonable to limit the maximum virtual memory
+to the amount of swap space as is done in the current design.
+Consequently, the new design will allow the maximum virtual memory
+to be the sum of physical memory plus swap space.
+For machines with no swap space, the maximum virtual memory will
+be governed by the amount of physical memory.
+.PP
+Another effect of the current technology is that the latency and overhead
+associated with accessing the file system is considerably higher
+since the access must be over the network
+rather than to a locally-attached disk.
+One use of the surplus memory would be to
+maintain a cache of recently used files;
+repeated uses of these files would require at most a verification from
+the file server that the data was up to date.
+Under the current design, file caching is done by the buffer pool,
+while the free memory is maintained in a separate pool.
+The new design should have only a single memory pool so that any
+free memory can be used to cache recently accessed files.
+.PP
+Another portion of the memory will be used to keep track of the contents
+of the blocks on any locally-attached swap space analogously
+to the way that memory pages are handled.
+Thus inactive swap blocks can also be used to cache less-recently-used
+file data.
+Since the swap disk is locally attached, it can be much more quickly
+accessed than a remotely located file system.
+This design allows the user to simply allocate their entire local disk
+to swap space, thus allowing the system to decide what files should
+be cached to maximize its usefulness.
+This design has two major benefits.
+It relieves the user of deciding what files
+should be kept in a small local file system.
+It also insures that all modified files are migrated back to the
+file server in a timely fashion, thus eliminating the need to dump
+the local disk or push the files manually.
+.NH
+User Interface
+.PP
+This section outlines our new virtual memory interface as it is
+currently envisioned.
+The details of the system call interface are contained in Appendix A.
+.NH 2
+Regions
+.PP
+The virtual memory interface is designed to support both large,
+sparse address spaces as well as small, densely-used address spaces.
+In this context, ``small'' is an address space roughly the
+size of the physical memory on the machine,
+while ``large'' may extend up to the maximum addressability of the machine.
+A process may divide its address space up into a number of regions.
+Initially a process begins with four regions;
+a shared read-only fill-on-demand region with its text,
+a private fill-on-demand region for its initialized data,
+a private zero-fill-on-demand region for its uninitialized data and heap,
+and a private zero-fill-on-demand region for its stack.
+In addition to these regions, a process may allocate new ones.
+The regions may not overlap and the system may impose an alignment
+constraint, but the size of the region should not be limited
+beyond the constraints of the size of the virtual address space.
+.PP
+Each new region may be mapped either as private or shared.
+When it is privately mapped, changes to the contents of the region
+are not reflected to any other process that maps the same region.
+Regions may be mapped read-only or read-write.
+As an example, a shared library would be implemented as two regions;
+a shared read-only region for the text, and a private read-write
+region for the global variables associated with the library.
+.PP
+A region may be allocated with one of several allocation strategies.
+It may map some memory hardware on the machine such as a frame buffer.
+Since the hardware is responsible for storing the data,
+such regions must be exclusive use if they are privately mapped.
+.PP
+A region can map all or part of a file.
+As the pages are first accessed, the region is filled in with the
+appropriate part of the file.
+If the region is mapped read-write and shared, changes to the
+contents of the region are reflected back into the contents of the file.
+If the region is read-write but private,
+changes to the region are copied to a private page that is not
+visible to other processes mapping the file,
+and these modified pages are not reflected back to the file.
+.PP
+The final type of region is ``anonymous memory''.
+Uninitialized data uses such a region, privately mapped;
+it is zero-fill-on-demand and its contents are abandoned
+when the last reference is dropped.
+Unlike a region that is mapped from a file,
+the contents of an anonymous region will never be read from or
+written to a disk unless memory is short and part of the region
+must be paged to a swap area.
+If one of these regions is mapped shared,
+then all processes see the changes in the region.
+This difference has important performance considerations;
+the overhead of reading, flushing, and possibly allocating a file
+is much higher than simply zeroing memory.
+.PP
+If several processes wish to share a region,
+then they must have some way of rendezvousing.
+For a mapped file this is easy;
+the name of the file is used as the rendezvous point.
+However, processes may not need the semantics of mapped files
+nor be willing to pay the overhead associated with them.
+For anonymous memory they must use some other rendezvous point.
+Our current interface allows processes to associate a
+descriptor with a region, which it may then pass to other
+processes that wish to attach to the region.
+Such a descriptor may be bound into the UNIX file system
+name space so that other processes can find it just as
+they would with a mapped file.
+.NH 2
+Shared memory as high speed interprocess communication
+.PP
+The primary use envisioned for shared memory is to
+provide a high speed interprocess communication (IPC) mechanism
+between cooperating processes.
+Existing IPC mechanisms (\fIi.e.\fP pipes, sockets, or streams)
+require a system call to hand off a set
+of data destined for another process, and another system call
+by the recipient process to receive the data.
+Even if the data can be transferred by remapping the data pages
+to avoid a memory to memory copy, the overhead of doing the system
+calls limits the throughput of all but the largest transfers.
+Shared memory, by contrast, allows processes to share data at any
+level of granularity without system intervention.
+.PP
+However, to maintain all but the simplest of data structures,
+the processes must serialize their modifications to shared
+data structures if they are to avoid corrupting them.
+This serialization is typically done with semaphores.
+Unfortunately, most implementations of semaphores are
+done with system calls.
+Thus processes are once again limited by the need to do two
+system calls per transaction, one to lock the semaphore, the
+second to release it.
+The net effect is that the shared memory model provides little if
+any improvement in interprocess bandwidth.
+.PP
+To achieve a significant improvement in interprocess bandwidth
+requires a large decrease in the number of system calls needed to
+achieve the interaction.
+In profiling applications that use
+serialization locks such as the UNIX kernel,
+one typically finds that most locks are not contested.
+Thus if one can find a way to avoid doing a system call in the case
+in which a lock is not contested,
+one would expect to be able to dramatically reduce the number
+of system calls needed to achieve serialization.
+.PP
+In our design, cooperating processes manage their semaphores
+in their own address space.
+In the typical case, a process executes an atomic test-and-set instruction
+to acquire a lock, finds it free, and thus is able to get it.
+Only in the (rare) case where the lock is already set does the process
+need to do a system call to wait for the lock to clear.
+When a process is finished with a lock,
+it can clear the lock itself.
+Only if the ``WANT'' flag for the lock has been set is
+it necessary for the process to do a system call to cause the other
+process(es) to be awakened.
+.PP
+Another issue that must be considered is portability.
+Some computers require access to special hardware to implement
+atomic interprocessor test-and-set.
+For such machines the setting and clearing of locks would
+all have to be done with system calls;
+applications could still use the same interface without change,
+though they would tend to run slowly.
+.PP
+The other issue of compatibility is with System V's semaphore
+implementation.
+Since the System V interface has been in existence for several years,
+and applications have been built that depend on this interface,
+it is important that this interface also be available.
+Although the interface is based on system calls for both setting and
+clearing locks,
+the same interface can be obtained using our interface without
+system calls in most cases.
+.PP
+This implementation can be achieved as follows.
+System V allows entire sets of semaphores to be set concurrently.
+If any of the locks are unavailable, the process is put to sleep
+until they all become available.
+Under our paradigm, a single additional semaphore is defined
+that serializes access to the set of semaphores being simulated.
+Once obtained in the usual way, the set of semaphores can be
+inspected to see if the desired ones are available.
+If they are available, they are set, the guardian semaphore
+is released and the process proceeds.
+If one or more of the requested set is not available,
+the guardian semaphore is released and the process selects an
+unavailable semaphore for which to wait.
+On being reawakened, the whole selection process must be repeated.
+.PP
+In all the above examples, there appears to be a race condition.
+Between the time that the process finds that a semaphore is locked,
+and the time that it manages to call the system to sleep on the
+semaphore another process may unlock the semaphore and issue a wakeup call.
+Luckily the race can be avoided.
+The insight that is critical is that the process and the kernel agree
+on the physical byte of memory that is being used for the semaphore.
+The system call to put a process to sleep takes a pointer
+to the desired semaphore as its argument so that once inside
+the kernel, the kernel can repeat the test-and-set.
+If the lock has cleared
+(and possibly the wakeup issued) between the time that the process
+did the test-and-set and eventually got into the sleep request system call,
+then the kernel immediately resumes the process rather than putting
+it to sleep.
+Thus the only problem to solve is how the kernel interlocks between testing
+a semaphore and going to sleep;
+this problem has already been solved on existing systems.
+.NH
+References
+.sp
+.IP [Babaoglu79] 20
+Babaoglu, O., and Joy, W.,
+``Data Structures Added in the Berkeley Virtual Memory Extensions
+to the UNIX Operating System'',
+Computer Systems Research Group, Dept of EECS, University of California,
+Berkeley, CA 94720, USA, November 1979.
+.IP [Someren84] 20
+Someren, J. van,
+``Paging in Berkeley UNIX'',
+Laboratorium voor schakeltechniek en techneik v.d.
+informatieverwerkende machines,
+Codenummer 051560-44(1984)01, February 1984.
diff --git a/share/doc/papers/newvm/Makefile b/share/doc/papers/newvm/Makefile
new file mode 100644
index 000000000000..6a4d1342162d
--- /dev/null
+++ b/share/doc/papers/newvm/Makefile
@@ -0,0 +1,6 @@
+VOLUME= papers
+DOC= newvm
+SRCS= 0.t 1.t a.t
+MACROS= -ms
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/newvm/a.t b/share/doc/papers/newvm/a.t
new file mode 100644
index 000000000000..4dcbc4f7283e
--- /dev/null
+++ b/share/doc/papers/newvm/a.t
@@ -0,0 +1,233 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.sp 2
+.ne 2i
+.NH
+Appendix A \- Virtual Memory Interface
+.NH 2
+Mapping pages
+.PP
+The system supports sharing of data between processes
+by allowing pages to be mapped into memory. These mapped
+pages may be \fIshared\fP with other processes or \fIprivate\fP
+to the process.
+Protection and sharing options are defined in \fI<sys/mman.h>\fP as:
+.DS
+.ta \w'#define\ \ 'u +\w'MAP_HASSEMAPHORE\ \ 'u +\w'0x0080\ \ 'u
+/* protections are chosen from these bits, or-ed together */
+#define PROT_READ 0x04 /* pages can be read */
+#define PROT_WRITE 0x02 /* pages can be written */
+#define PROT_EXEC 0x01 /* pages can be executed */
+.DE
+.DS
+.ta \w'#define\ \ 'u +\w'MAP_HASSEMAPHORE\ \ 'u +\w'0x0080\ \ 'u
+/* flags contain mapping type, sharing type and options */
+/* mapping type; choose one */
+#define MAP_FILE 0x0001 /* mapped from a file or device */
+#define MAP_ANON 0x0002 /* allocated from memory, swap space */
+#define MAP_TYPE 0x000f /* mask for type field */
+.DE
+.DS
+.ta \w'#define\ \ 'u +\w'MAP_HASSEMAPHORE\ \ 'u +\w'0x0080\ \ 'u
+/* sharing types; choose one */
+#define MAP_SHARED 0x0010 /* share changes */
+#define MAP_PRIVATE 0x0000 /* changes are private */
+.DE
+.DS
+.ta \w'#define\ \ 'u +\w'MAP_HASSEMAPHORE\ \ 'u +\w'0x0080\ \ 'u
+/* other flags */
+#define MAP_FIXED 0x0020 /* map addr must be exactly as requested */
+#define MAP_INHERIT 0x0040 /* region is retained after exec */
+#define MAP_HASSEMAPHORE 0x0080 /* region may contain semaphores */
+.DE
+The cpu-dependent size of a page is returned by the
+\fIgetpagesize\fP system call:
+.DS
+pagesize = getpagesize();
+result int pagesize;
+.DE
+.LP
+The call:
+.DS
+maddr = mmap(addr, len, prot, flags, fd, pos);
+result caddr_t maddr; caddr_t addr; int *len, prot, flags, fd; off_t pos;
+.DE
+causes the pages starting at \fIaddr\fP and continuing
+for at most \fIlen\fP bytes to be mapped from the object represented by
+descriptor \fIfd\fP, starting at byte offset \fIpos\fP.
+The starting address of the region is returned;
+for the convenience of the system,
+it may differ from that supplied
+unless the MAP_FIXED flag is given,
+in which case the exact address will be used or the call will fail.
+The actual amount mapped is returned in \fIlen\fP.
+The \fIaddr\fP, \fIlen\fP, and \fIpos\fP parameters
+must all be multiples of the pagesize.
+A successful \fImmap\fP will delete any previous mapping
+in the allocated address range.
+The parameter \fIprot\fP specifies the accessibility
+of the mapped pages.
+The parameter \fIflags\fP specifies
+the type of object to be mapped,
+mapping options, and
+whether modifications made to
+this mapped copy of the page
+are to be kept \fIprivate\fP, or are to be \fIshared\fP with
+other references.
+Possible types include MAP_FILE,
+mapping a regular file or character-special device memory,
+and MAP_ANON, which maps memory not associated with any specific file.
+The file descriptor used for creating MAP_ANON regions is used only
+for naming, and may be given as \-1 if no name
+is associated with the region.\(dg
+.FS
+\(dg The current design does not allow a process
+to specify the location of swap space.
+In the future we may define an additional mapping type, MAP_SWAP,
+in which the file descriptor argument specifies a file
+or device to which swapping should be done.
+.FE
+The MAP_INHERIT flag allows a region to be inherited after an \fIexec\fP.
+The MAP_HASSEMAPHORE flag allows special handling for
+regions that may contain semaphores.
+.PP
+A facility is provided to synchronize a mapped region with the file
+it maps; the call
+.DS
+msync(addr, len);
+caddr_t addr; int len;
+.DE
+writes any modified pages back to the filesystem and updates
+the file modification time.
+If \fIlen\fP is 0, all modified pages within the region containing \fIaddr\fP
+will be flushed;
+if \fIlen\fP is non-zero, only the pages containing \fIaddr\fP and \fIlen\fP
+succeeding locations will be examined.
+Any required synchronization of memory caches
+will also take place at this time.
+Filesystem operations on a file that is mapped for shared modifications
+are unpredictable except after an \fImsync\fP.
+.PP
+A mapping can be removed by the call
+.DS
+munmap(addr, len);
+caddr_t addr; int len;
+.DE
+This call deletes the mappings for the specified address range,
+and causes further references to addresses within the range
+to generate invalid memory references.
+.NH 2
+Page protection control
+.PP
+A process can control the protection of pages using the call
+.DS
+mprotect(addr, len, prot);
+caddr_t addr; int len, prot;
+.DE
+This call changes the specified pages to have protection \fIprot\fP\|.
+Not all implementations will guarantee protection on a page basis;
+the granularity of protection changes may be as large as an entire region.
+.NH 2
+Giving and getting advice
+.PP
+A process that has knowledge of its memory behavior may
+use the \fImadvise\fP call:
+.DS
+madvise(addr, len, behav);
+caddr_t addr; int len, behav;
+.DE
+\fIBehav\fP describes expected behavior, as given
+in \fI<sys/mman.h>\fP:
+.DS
+.ta \w'#define\ \ 'u +\w'MADV_SEQUENTIAL\ \ 'u +\w'00\ \ \ \ 'u
+#define MADV_NORMAL 0 /* no further special treatment */
+#define MADV_RANDOM 1 /* expect random page references */
+#define MADV_SEQUENTIAL 2 /* expect sequential references */
+#define MADV_WILLNEED 3 /* will need these pages */
+#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */
+.DE
+Finally, a process may obtain information about whether pages are
+core resident by using the call
+.DS
+mincore(addr, len, vec)
+caddr_t addr; int len; result char *vec;
+.DE
+Here the current core residency of the pages is returned
+in the character array \fIvec\fP, with a value of 1 meaning
+that the page is in-core.
+.NH 2
+Synchronization primitives
+.PP
+Primitives are provided for synchronization using semaphores in shared memory.
+Semaphores must lie within a MAP_SHARED region with at least modes
+PROT_READ and PROT_WRITE.
+The MAP_HASSEMAPHORE flag must have been specified when the region was created.
+To acquire a lock a process calls:
+.DS
+value = mset(sem, wait)
+result int value; semaphore *sem; int wait;
+.DE
+\fIMset\fP indivisibly tests and sets the semaphore \fIsem\fP.
+If the previous value is zero, the process has acquired the lock
+and \fImset\fP returns true immediately.
+Otherwise, if the \fIwait\fP flag is zero,
+failure is returned.
+If \fIwait\fP is true and the previous value is non-zero,
+\fImset\fP relinquishes the processor until notified that it should retry.
+.LP
+To release a lock a process calls:
+.DS
+mclear(sem)
+semaphore *sem;
+.DE
+\fIMclear\fP indivisibly tests and clears the semaphore \fIsem\fP.
+If the ``WANT'' flag is zero in the previous value,
+\fImclear\fP returns immediately.
+If the ``WANT'' flag is non-zero in the previous value,
+\fImclear\fP arranges for waiting processes to retry before returning.
+.PP
+Two routines provide services analogous to the kernel
+\fIsleep\fP and \fIwakeup\fP functions interpreted in the domain of
+shared memory.
+A process may relinquish the processor by calling \fImsleep\fP
+with a set semaphore:
+.DS
+msleep(sem)
+semaphore *sem;
+.DE
+If the semaphore is still set when it is checked by the kernel,
+the process will be put in a sleeping state
+until some other process issues an \fImwakeup\fP for the same semaphore
+within the region using the call:
+.DS
+mwakeup(sem)
+semaphore *sem;
+.DE
+An \fImwakeup\fP may awaken all sleepers on the semaphore,
+or may awaken only the next sleeper on a queue.
diff --git a/share/doc/papers/newvm/spell.ok b/share/doc/papers/newvm/spell.ok
new file mode 100644
index 000000000000..543dc7e16a8f
--- /dev/null
+++ b/share/doc/papers/newvm/spell.ok
@@ -0,0 +1,56 @@
+ANON
+Babaoglu
+Babaoglu79
+Behav
+CM
+Codenummer
+DONTNEED
+Dept
+EECS
+Filesystem
+HASSEMAPHORE
+IPC
+Karels
+Laboratorium
+MADV
+McKusick
+Mclear
+Mset
+NOEXTEND
+PROT
+SPACEAVAIL
+Someren
+Someren84
+WILLNEED
+addr
+behav
+caching
+caddr
+es
+fd
+filesystem
+getpagesize
+informatieverwerkende
+len
+maddr
+madvise
+mclear
+mincore
+mman.h
+mmap
+mprotect
+mset
+msleep
+msync
+munmap
+mwakeup
+pagesize
+pos
+prot
+runtime
+schakeltechniek
+sem
+techneik
+v.d
+vec
+voor
diff --git a/share/doc/papers/relengr/0.t b/share/doc/papers/relengr/0.t
new file mode 100644
index 000000000000..3731dba787cf
--- /dev/null
+++ b/share/doc/papers/relengr/0.t
@@ -0,0 +1,85 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.rm CM
+.nr PO 1.25i
+.ds CH "
+.ds CF "%
+.nr Fn 0 1
+.ds b3 4.3\s-1BSD\s+1
+.de KI
+.ds Lb "Fig. \\n+(Fn
+.KF
+.ce 1
+Figure \\n(Fn - \\$1.
+..
+.de SM
+\\s-1\\$1\\s+1\\$2
+..
+.de NM
+\&\fI\\$1\fP\\$2
+..
+.de RN
+\&\fI\\$1\fP\^(\^)\\$2
+..
+.de PN
+\&\fB\\$1\fP\\$2
+..
+.TL
+The Release Engineering of 4.3\s-1BSD\s0
+.AU
+Marshall Kirk McKusick
+.AU
+Michael J. Karels
+.AU
+Keith Bostic
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+This paper describes an approach used by a small group of people
+to develop and integrate a large software system.
+It details the development and release engineering strategy
+used during the preparation of the \*(b3 version of the UNIX\(dg
+.FS
+\(dgUNIX is a registered trademark of AT&T in the US and other countries.
+.FE
+operating system.
+Each release cycle is divided into an initial development phase
+followed by a release engineering phase.
+The release engineering of the distribution is done in three steps.
+The first step has an informal control policy for tracking modifications;
+it results in an alpha distribution.
+The second step has more rigid change mechanisms in place;
+it results in a beta release.
+During the final step changes are tracked very closely;
+the result is the final distribution.
+.AE
+.LP
diff --git a/share/doc/papers/relengr/1.t b/share/doc/papers/relengr/1.t
new file mode 100644
index 000000000000..cd15d8034137
--- /dev/null
+++ b/share/doc/papers/relengr/1.t
@@ -0,0 +1,63 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.NH
+Introduction
+.PP
+The Computer Systems Research Group (\c
+.SM CSRG )
+has always been a small group of software developers.
+This resource limitation requires careful software-engineering management
+as well as careful coordination of both
+.SM CSRG
+personnel and the members of the general community who
+contribute to the development of the system.
+.PP
+Releases from Berkeley alternate between those that introduce
+major new facilities and those that provide bug fixes and efficiency
+improvements.
+This alternation allows timely releases, while providing for refinement,
+tuning, and correction of the new facilities.
+The timely followup of ``cleanup'' releases reflects the importance
+.SM CSRG
+places on providing a reliable and robust system on which its
+user community can depend.
+.PP
+The development of the Berkeley Software Distribution (\c
+.SM BSD )
+illustrates an \fIadvantage\fP of having a few
+principal developers:
+the developers all understand the entire system thoroughly enough
+to be able to coordinate their own work with
+that of other people to produce a coherent final system.
+Companies with large development organizations find
+this result difficult to duplicate.
+This paper describes the process by which
+the development effort for \*(b3 was managed.
+.[
+design and implementation
+.]
diff --git a/share/doc/papers/relengr/2.t b/share/doc/papers/relengr/2.t
new file mode 100644
index 000000000000..f786a55498dc
--- /dev/null
+++ b/share/doc/papers/relengr/2.t
@@ -0,0 +1,140 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.NH
+System Development
+.PP
+The first phase of each Berkeley system is its development.
+.SM CSRG
+maintains a continuously evolving list of projects that are candidates
+for integration into the system.
+Some of these are prompted by emerging ideas from the research world,
+such as the availability of a new technology, while other additions
+are suggested by the commercial world, such as the introduction of
+new standards like
+.SM POSIX ,
+and still other projects are emergency responses to situations like
+the Internet Worm.
+.PP
+These projects are ordered based on the perceived benefit of the
+project as opposed to its difficulty;
+the most important are selected for inclusion in each new release.
+Often there is a prototype available from a group outside
+.SM CSRG .
+Because of the limited staff at
+.SM CSRG ,
+this prototype is obtained to use as a starting base
+for integration into the
+.SM BSD
+system.
+Only if no prototype is available is the project begun in-house.
+In either case, the design of the facility is forced to conform to the
+.SM CSRG
+style.
+.PP
+Unlike other development groups, the staff of
+.SM CSRG
+specializes by projects rather than by particular parts
+of the system;
+a staff person will be responsible for all aspects of a project.
+This responsibility starts at the associated kernel device drivers;
+it proceeds up through the rest of the kernel,
+through the C library and system utility programs,
+ending at the user application layer.
+This staff person is also responsible for related documentation,
+including manual pages.
+Many projects proceed in parallel,
+interacting with other projects as their paths cross.
+.PP
+All source code, documentation, and auxiliary files are kept
+under a source code control system.
+During development,
+this control system is critical for notifying people
+when they are colliding with other ongoing projects.
+Even more important, however,
+is the audit trail maintained by the control system that
+is critical to the release engineering phase of the project
+described in the next section.
+.PP
+Much of the development of
+.SM BSD
+is done by personnel that are located at other institutions.
+Many of these people not only have interim copies of the release
+running on their own machines,
+but also have user accounts on the main development
+machine at Berkeley.
+Such users are commonly found logged in at Berkeley over the
+Internet, or sometimes via telephone dialup, from places as far away
+as Massachusetts or Maryland, as well as from closer places, such as
+Stanford.
+For the \*(b3 release,
+certain users had permission to modify the master copy of the
+system source directly.
+People given access to the master sources
+are carefully screened beforehand,
+but are not closely supervised.
+Their work is checked at the end of the beta-test period by
+.SM CSRG
+personnel who back out inappropriate changes.
+Several facilities, including the
+Fortran and C compilers,
+as well as important system programs, for example,
+.PN telnet
+and
+.PN ftp ,
+include significant contributions from people who did not work
+directly for
+.SM CSRG .
+One important exception to this approach is that changes to the kernel
+are made only by
+.SM CSRG
+personnel, although the changes are often suggested by the larger community.
+.PP
+The development phase continues until
+.SM CSRG
+decides that it is appropriate to make a release.
+The decision to halt development and transition to release mode
+is driven by several factors.
+The most important is that enough projects have been completed
+to make the system significantly superior to the previously released
+version of the system.
+For example,
+\*(b3 was released primarily because of the need for
+the improved networking capabilities and the markedly
+improved system performance.
+Of secondary importance is the issue of timing.
+If the releases are too infrequent, then
+.SM CSRG
+will be inundated with requests for interim releases.
+Conversely,
+if systems are released too frequently,
+the integration cost for many vendors will be too high,
+causing them to ignore the releases.
+Finally,
+the process of release engineering is long and tedious.
+Frequent releases slow the rate of development and
+cause undue tedium to the staff.
diff --git a/share/doc/papers/relengr/3.t b/share/doc/papers/relengr/3.t
new file mode 100644
index 000000000000..22cc33cf075e
--- /dev/null
+++ b/share/doc/papers/relengr/3.t
@@ -0,0 +1,384 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.NH
+System Release
+.PP
+Once the decision has been made to halt development
+and begin release engineering,
+all currently unfinished projects are evaluated.
+This evaluation involves computing the time required to complete
+the project as opposed to how important the project is to the
+upcoming release.
+Projects that are not selected for completion are
+removed from the distribution branch of the source code control system
+and saved on branch deltas so they can be retrieved,
+completed, and merged into a future release;
+the remaining unfinished projects are brought to orderly completion.
+.PP
+Developments from
+.SM CSRG
+are released in three steps: alpha, beta, and final.
+Alpha and beta releases are not true distributions\(emthey
+are test systems.
+Alpha releases are normally available to only a few sites,
+usually those working closely with
+.SM CSRG .
+More sites are given beta releases,
+as the system is closer to completion,
+and needs wider testing to find more obscure problems.
+For example, \*(b3 alpha was distributed to about fifteen
+sites, while \*(b3 beta ran at more than a hundred.
+.NH 2
+Alpha Distribution Development
+.PP
+The first step in creating an alpha distribution is to evaluate the
+existing state of the system and to decide what software should be
+included in the release.
+This decision process includes not only deciding what software should
+be added, but also what obsolete software ought to be retired from the
+distribution.
+The new software includes the successful projects that have been
+completed at
+.SM CSRG
+and elsewhere, as well as some portion of the vast quantity of
+contributed software that has been offered during the development
+period.
+.PP
+Once an initial list has been created,
+a prototype filesystem corresponding to the distribution
+is constructed, typically named
+.PN /nbsd .
+This prototype will eventually turn into the master source tree for the
+final distribution.
+During the period that the alpha distribution is being created,
+.PN /nbsd
+is mounted read-write, and is highly fluid.
+Programs are created and deleted,
+old versions of programs are completely replaced,
+and the correspondence between the sources and binaries
+is only loosely tracked.
+People outside
+.SM CSRG
+who are helping with the distribution are free to
+change their parts of the distribution at will.
+.PP
+During this period the newly forming distribution is
+checked for interoperability.
+For example,
+in \*(b3 the output of context differences from
+.PN diff
+was changed to merge overlapping sections.
+Unfortunately, this change broke the
+.PN patch
+program which could no longer interpret the output of
+.PN diff .
+Since the change to
+.PN diff
+and the
+.PN patch
+program had originated outside Berkeley,
+.SM CSRG
+had to coordinate the efforts of the respective authors
+to make the programs work together harmoniously.
+.PP
+Once the sources have stabilized,
+an attempt is made to compile the entire source tree.
+Often this exposes errors caused by changed header files,
+or use of obsoleted C library interfaces.
+If the incompatibilities affect too many programs,
+or require excessive amounts of change in the programs
+that are affected,
+the incompatibility is backed out or some backward-compatible
+interface is provided.
+The incompatibilities that are found and left in are noted
+in a list that is later incorporated into the release notes.
+Thus, users upgrading to the new system can anticipate problems
+in their own software that will require change.
+.PP
+Once the source tree compiles completely,
+it is installed and becomes the running system that
+.SM CSRG
+uses on its main development machine.
+Once in day-to-day use,
+other interoperability problems become apparent
+and are resolved.
+When all known problems have been resolved, and the system has been
+stable for some period of time, an alpha distribution tape is made
+from the contents of
+.PN /nbsd .
+.PP
+The alpha distribution is sent out to a small set of test sites.
+These test sites are selected as having a
+sophisticated user population, not only capable of finding bugs,
+but also of determining their cause and developing a fix for the problem.
+These sites are usually composed of groups that are contributing
+software to the distribution or groups that have a particular expertise
+with some portion of the system.
+.NH 2
+Beta Distribution Development
+.PP
+After the alpha tape is created,
+the distribution filesystem is mounted read-only.
+Further changes are requested in a change log rather than
+being made directly to the distribution.
+The change requests are inspected and implemented by a
+.SM CSRG
+staff person, followed by a compilation of the affected
+programs to ensure that they still build correctly.
+Once the alpha tape has been cut,
+changes to the distribution are no longer made by people outside
+.SM CSRG .
+.PP
+As the alpha sites install and begin running the alpha distribution,
+they monitor the problems that they encounter.
+For minor bugs, they typically report back the bug along with
+a suggested fix.
+Since many of the alpha sites are selected from among the people
+working closely with
+.SM CSRG ,
+they often have accounts on, and access to, the primary
+.SM CSRG
+development machine.
+Thus, they are able to directly install the fix themselves,
+and simply notify
+.SM CSRG
+when they have fixed the problem.
+After verifying the fix, the affected files are added to
+the list to be updated on
+.PN /nbsd .
+.PP
+The more important task of the alpha sites is to test out the
+new facilities that have been added to the system.
+The alpha sites often find major design flaws
+or operational shortcomings of the facilities.
+When such problems are found,
+the person in charge of that facility is responsible
+for resolving the problem.
+Occasionally this requires redesigning and reimplementing
+parts of the affected facility.
+For example,
+in 4.2\s-1BSD\s+1,
+the alpha release of the networking system did not have connection queueing.
+This shortcoming prevented the network from handling many
+connections to a single server.
+The result was that the networking interface had to be
+redesigned to provide this functionality.
+.PP
+The alpha sites are also responsible for ferreting out interoperability
+problems between different utilities.
+The user populations of the test sites differ from the user population at
+.SM CSRG ,
+and, as a result, the utilities are exercised in ways that differ
+from the ways that they are used at
+.SM CSRG .
+These differences in usage patterns turn up problems that
+do not occur in our initial test environment.
+.PP
+The alpha sites frequently redistribute the alpha tape to several
+of their own alpha sites that are particularly interested
+in parts of the new system.
+These additional sites are responsible for reporting
+problems back to the site from which they received the distribution,
+not to
+.SM CSRG .
+Often these redistribution sites are less sophisticated than the
+direct alpha sites, so their reports need to be filtered
+to avoid spurious, or site dependent, bug reports.
+The direct alpha sites sift through the reports to find those that
+are relevant, and usually verify the suggested fix if one is given,
+or develop a fix if none is provided.
+This hierarchical testing process forces
+bug reports, fixes, and new software
+to be collected, evaluated, and checked for inaccuracies
+by first-level sites before being forwarded to
+.SM CSRG ,
+allowing the developers at
+.SM CSRG
+to concentrate on tracking the changes being made to the system
+rather than sifting through information (often voluminous) from every
+alpha-test site.
+.PP
+Once the major problems have been attended to,
+the focus turns to getting the documentation synchronized
+with the code that is being shipped.
+The manual pages need to be checked to be sure that
+they accurately reflect any changes to the programs that
+they describe.
+Usually the manual pages are kept up to date as
+the program they describe evolves.
+However, the supporting documents frequently do not get changed,
+and must be edited to bring them up to date.
+During this review, the need for other documents becomes evident.
+For example, it was
+during this phase of \*(b3 that it was decided
+to add a tutorial document on how to use the socket
+interprocess communication primitives.
+.PP
+Another task during this period is to contact the people that
+have contributed complete software packages
+(such as
+.PN RCS
+or
+.PN MH )
+in previous releases to see if they wish to
+make any revisions to their software.
+For those who do,
+the new software has to be obtained,
+and tested to verify that it compiles and runs
+correctly on the system to be released.
+Again, this integration and testing can often be done by the
+contributors themselves by logging directly into the master machine.
+.PP
+After the stream of bug reports has slowed down
+to a reasonable level,
+.SM CSRG
+begins a careful review of all the changes to the
+system since the previous release.
+The review is done by running a recursive
+.PN diff
+of the entire source tree\(emhere, of
+.PN /nbsd
+with 4.2\s-1BSD\s+1.
+All the changes are checked to ensure that they are reasonable,
+and have been properly documented.
+The process often turns up questionable changes.
+When such a questionable change is found,
+the source code control system log is examined to find
+out who made the change and what their explanation was
+for the change.
+If the log does not resolve the problem,
+the person responsible for the change is asked for an explanation
+of what they were trying to accomplish.
+If the reason is not compelling,
+the change is backed out.
+Facilities deemed inappropriate in \*(b3 included new options to
+the directory-listing command and a changed return value for the
+.RN fseek
+library routine;
+the changes were removed from the source before final distribution.
+Although this process is long and tedious,
+it forces the developers to obtain a coherent picture of the entire set of
+changes to the system.
+This exercise often turns up inconsistencies that would
+otherwise never be found.
+.PP
+The outcome of the comparison results in
+a pair of documents detailing
+changes to every user-level command
+.[
+Bug Fixes and Changes
+.]
+and to every kernel source file.
+.[
+Changes to the Kernel
+.]
+These documents are delivered with the final distribution.
+A user can look up any command by name and see immediately
+what has changed,
+and a developer can similarly look up any kernel
+file by name and get a summary of the changes to that file.
+.PP
+Having completed the review of the entire system,
+the preparation of the beta distribution is started.
+Unlike the alpha distribution, where pieces of the system
+may be unfinished and the documentation incomplete,
+the beta distribution is put together as if it were
+going to be the final distribution.
+All known problems are fixed, and any remaining development
+is completed.
+Once the beta tape has been prepared,
+no further changes are permitted to
+.PN /nbsd
+without careful review,
+as spurious changes made after the system has been
+.PN diff ed
+are unlikely to be caught.
+.NH 2
+Final Distribution Development
+.PP
+The beta distribution goes to more sites than the
+alpha distribution for three main reasons.
+First, as it is closer to the final release, more sites are willing
+to run it in a production environment without fear of catastrophic failures.
+Second, more commercial sites delivering
+.SM BSD -\c
+derived systems are interested in getting a preview of the
+upcoming changes in preparation for merging them into their
+own systems.
+Finally, because the beta tape has fewer problems,
+it is beneficial to offer it to more sites in hopes of
+finding as many of the remaining problems as possible.
+Also, by handing the system out to less sophisticated sites,
+issues that would be ignored by the users of the alpha sites
+become apparent.
+.PP
+The anticipation is that the beta tape will not require
+extensive changes to either the programs or the documentation.
+Most of the work involves sifting through the reported bugs
+to find those that are relevant and devising the minimal
+reasonable set of changes to fix them.
+After thoroughly testing the fix, it is listed in the update log for
+.PN /nbsd .
+One person at
+.SM CSRG
+is responsible for doing the update of
+.PN /nbsd
+and ensuring that everything affected by the change is rebuilt and tested.
+Thus, a change to a C library routine requires that the entire
+system be rebuilt.
+.PP
+During this period, the documentation is all printed and proofread.
+As minor changes are made to the manual pages and documentation,
+the affected pages must be reprinted.
+.PP
+The final step in the release process is to check the distribution tree
+to ensure that it is in a consistent state.
+This step includes verification that every file and directory
+on the distribution has the proper owner, group, and modes.
+All source files must be checked to be sure that they have
+appropriate copyright notices and source code control system headers.
+Any extraneous files must be removed.
+Finally, the installed binaries must be checked to ensure that they correspond
+exactly to the sources and libraries that are on the distribution.
+.PP
+This checking is a formidable task given that there are over 20,000 files on
+a typical distribution.
+Much of the checking can be done by a set of programs set to scan
+over the distribution tree.
+Unfortunately, the exception list is long, and requires
+hours of tedious hand checking; this has caused
+.SM CSRG
+to develop even
+more comprehensive validation programs for use in our next release.
+.PP
+Once the final set of checks has been run,
+the master tape can be made, and the official distribution started.
+As for the staff of
+.SM CSRG ,
+we usually take a brief vacation before plunging back into
+a new development phase.
diff --git a/share/doc/papers/relengr/Makefile b/share/doc/papers/relengr/Makefile
new file mode 100644
index 000000000000..a2a2b355ef6e
--- /dev/null
+++ b/share/doc/papers/relengr/Makefile
@@ -0,0 +1,12 @@
+VOLUME= papers
+DOC= releng
+SRCS= stubs 0.t 1.t 2.t 3.t
+EXTRA= ref.bib
+MACROS= -ms
+USE_REFER=
+CLEANFILES= stubs
+
+stubs:
+ @(echo .R1; echo database ${.CURDIR}/ref.bib; echo .R2) > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/relengr/ref.bib b/share/doc/papers/relengr/ref.bib
new file mode 100644
index 000000000000..6f33cd7e9dd4
--- /dev/null
+++ b/share/doc/papers/relengr/ref.bib
@@ -0,0 +1,26 @@
+%A M. K. McKusick
+%A J. M. Bloom
+%A M. J. Karels
+%T Bug Fixes and Changes in 4.3BSD
+%B \s-1UNIX\s0 System Manager's Manual, 4.3 Berkeley Software Distribution, Virtual VAX-11 Version
+%I \s-1USENIX\s0 Association
+%C Berkeley, CA
+%P 12:1\-22
+%D 1986
+
+%A M. J. Karels
+%T Changes to the Kernel in 4.3BSD
+%B \s-1UNIX\s0 System Manager's Manual, 4.3 Berkeley Software Distribution, Virtual VAX-11 Version
+%I \s-1USENIX\s0 Association
+%C Berkeley, CA
+%P 13:1\-32
+%D 1986
+
+%A S. J. Leffler
+%A M. K. McKusick
+%A M. J. Karels
+%A J. S. Quarterman
+%T The Design and Implementation of the 4.3BSD UNIX Operating System
+%I Addison-Wesley
+%C Reading, MA
+%D 1989
diff --git a/share/doc/papers/relengr/spell.ok b/share/doc/papers/relengr/spell.ok
new file mode 100644
index 000000000000..13f5cf8b90ba
--- /dev/null
+++ b/share/doc/papers/relengr/spell.ok
@@ -0,0 +1,15 @@
+BSD
+Bostic
+CH
+CM
+CSRG
+Fn
+Karels
+Lb
+McKusick
+POSIX
+editted
+filesystem
+followup
+mothballed
+nbsd
diff --git a/share/doc/papers/sysperf/0.t b/share/doc/papers/sysperf/0.t
new file mode 100644
index 000000000000..e2ebaf6be22b
--- /dev/null
+++ b/share/doc/papers/sysperf/0.t
@@ -0,0 +1,241 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.if n .ND
+.TL
+Measuring and Improving the Performance of Berkeley UNIX*
+.sp
+April 17, 1991
+.AU
+Marshall Kirk McKusick,
+Samuel J. Leffler\(dg,
+Michael J. Karels
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, CA 94720
+.AB
+.FS
+* UNIX is a trademark of AT&T Bell Laboratories.
+.FE
+.FS
+\(dg Samuel J. Leffler is currently employed by:
+Silicon Graphics, Inc.
+.FE
+.FS
+This work was done under grants from
+the National Science Foundation under grant MCS80-05144,
+and the Defense Advanced Research Projects Agency (DoD) under
+ARPA Order No. 4031 monitored by Naval Electronic System Command under
+Contract No. N00039-82-C-0235.
+.FE
+The 4.2 Berkeley Software Distribution of
+.UX
+for the VAX\(dd
+.FS
+\(dd VAX, MASSBUS, UNIBUS, and DEC are trademarks of
+Digital Equipment Corporation.
+.FE
+had several problems that could severely affect the overall
+performance of the system.
+These problems were identified with
+kernel profiling and system tracing during day-to-day use.
+Once potential problem areas had been identified
+benchmark programs were devised to highlight the bottlenecks.
+These benchmarks verified that the problems existed and provided
+a metric against which to validate proposed solutions.
+This paper examines
+the performance problems encountered and describes
+modifications that have been made
+to the system since the initial distribution.
+.PP
+The changes to the system have consisted of improvements to the
+performance of the existing facilities,
+as well as enhancements to the current facilities.
+Performance improvements in the kernel include cacheing of path name
+translations, reductions in clock handling and scheduling overhead,
+and improved throughput of the network subsystem.
+Performance improvements in the libraries and utilities include replacement of
+linear searches of system databases with indexed lookup,
+merging of most network services into a single daemon,
+and conversion of system utilities to use the more efficient
+facilities available in 4.2BSD.
+Enhancements in the kernel include the addition of subnets and gateways,
+increases in many kernel limits,
+cleanup of the signal and autoconfiguration implementations,
+and support for windows and system logging.
+Functional extensions in the libraries and utilities include
+the addition of an Internet name server,
+new system management tools,
+and extensions to \fIdbx\fP to work with Pascal.
+The paper concludes with a brief discussion of changes made to
+the system to enhance security.
+All of these enhancements are present in Berkeley UNIX 4.3BSD.
+.AE
+.LP
+.sp 2
+CR Categories and Subject Descriptors:
+D.4.3
+.B "[Operating Systems]":
+File Systems Management \-
+.I "file organization, directory structures, access methods";
+D.4.8
+.B "[Operating Systems]":
+Performance \-
+.I "measurements, operational analysis";
+.sp
+Additional Keywords and Phrases:
+Berkeley UNIX,
+system performance,
+application program interface.
+.sp
+General Terms:
+UNIX operating system,
+measurement,
+performance.
+.de PT
+.lt \\n(LLu
+.pc %
+.nr PN \\n%
+.tl '\\*(LH'\\*(CH'\\*(RH'
+.lt \\n(.lu
+..
+.af PN i
+.ds LH Performance
+.ds RH Contents
+.bp 1
+.if t .ds CF April 17, 1991
+.if t .ds LF DRAFT
+.if t .ds RF McKusick, et al.
+.ce
+.B "TABLE OF CONTENTS"
+.LP
+.sp 1
+.nf
+.B "1. Introduction"
+.LP
+.sp .5v
+.nf
+.B "2. Observation techniques
+\0.1. System maintenance tools
+\0.2. Kernel profiling
+\0.3. Kernel tracing
+\0.4. Benchmark programs
+.LP
+.sp .5v
+.nf
+.B "3. Results of our observations
+\0.1. User programs
+\0.1.1. Mail system
+\0.1.2. Network servers
+\0.2. System overhead
+\0.2.1. Micro-operation benchmarks
+\0.2.2. Path name translation
+\0.2.3. Clock processing
+\0.2.4. Terminal multiplexors
+\0.2.5. Process table management
+\0.2.6. File system buffer cache
+\0.2.7. Network subsystem
+\0.2.8. Virtual memory subsystem
+.LP
+.sp .5v
+.nf
+.B "4. Performance Improvements
+\0.1. Performance Improvements in the Kernel
+\0.1.1. Name Cacheing
+\0.1.2. Intelligent Auto Siloing
+\0.1.3. Process Table Management
+\0.1.4. Scheduling
+\0.1.5. Clock Handling
+\0.1.6. File System
+\0.1.7. Network
+\0.1.8. Exec
+\0.1.9. Context Switching
+\0.1.10. Setjmp and Longjmp
+\0.1.11. Compensating for Lack of Compiler Technology
+\0.2. Improvements to Libraries and Utilities
+\0.2.1. Hashed Databases
+\0.2.2. Buffered I/O
+\0.2.3. Mail System
+\0.2.4. Network Servers
+\0.2.5. The C Run-time Library
+\0.2.6. Csh
+.LP
+.sp .5v
+.nf
+.B "5. Functional Extensions
+\0.1. Kernel Extensions
+\0.1.1. Subnets, Broadcasts, and Gateways
+\0.1.2. Interface Addressing
+\0.1.3. User Control of Network Buffering
+\0.1.4. Number of File Descriptors
+\0.1.5. Kernel Limits
+\0.1.6. Memory Management
+\0.1.7. Signals
+\0.1.8. System Logging
+\0.1.9. Windows
+\0.1.10. Configuration of UNIBUS Devices
+\0.1.11. Disk Recovery from Errors
+\0.2. Functional Extensions to Libraries and Utilities
+\0.2.1. Name Server
+\0.2.2. System Management
+\0.2.3. Routing
+\0.2.4. Compilers
+.LP
+.sp .5v
+.nf
+.B "6. Security Tightening
+\0.1. Generic Kernel
+\0.2. Security Problems in Utilities
+.LP
+.sp .5v
+.nf
+.B "7. Conclusions
+.LP
+.sp .5v
+.nf
+.B Acknowledgements
+.LP
+.sp .5v
+.nf
+.B References
+.LP
+.sp .5v
+.nf
+.B "Appendix \- Benchmark Programs"
+.de _d
+.if t .ta .6i 2.1i 2.6i
+.\" 2.94 went to 2.6, 3.64 to 3.30
+.if n .ta .84i 2.6i 3.30i
+..
+.de _f
+.if t .ta .5i 1.25i 2.5i
+.\" 3.5i went to 3.8i
+.if n .ta .7i 1.75i 3.8i
+..
diff --git a/share/doc/papers/sysperf/1.t b/share/doc/papers/sysperf/1.t
new file mode 100644
index 000000000000..38a56f0500b5
--- /dev/null
+++ b/share/doc/papers/sysperf/1.t
@@ -0,0 +1,75 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Introduction
+.af PN 1
+.bp 1
+.NH
+Introduction
+.PP
+The Berkeley Software Distributions of
+.UX
+for the VAX have added many new capabilities that were
+previously unavailable under
+.UX .
+The development effort for 4.2BSD concentrated on providing new
+facilities, and in getting them to work correctly.
+Many new data structures were added to the system to support
+these new capabilities.
+In addition,
+many of the existing data structures and algorithms
+were put to new uses or their old functions placed under increased demand.
+The effect of these changes was that
+mechanisms that were well tuned under 4.1BSD
+no longer provided adequate performance for 4.2BSD.
+The increased user feedback that came with the release of
+4.2BSD and a growing body of experience with the system
+highlighted the performance shortcomings of 4.2BSD.
+.PP
+This paper details the work that we have done since
+the release of 4.2BSD to measure the performance of the system,
+detect the bottlenecks,
+and find solutions to remedy them.
+Most of our tuning has been in the context of the real
+timesharing systems in our environment.
+Rather than using simulated workloads,
+we have sought to analyze our tuning efforts under
+realistic conditions.
+Much of the work has been done in the machine independent parts
+of the system, hence these improvements could be applied to
+other variants of UNIX with equal success.
+All of the changes made have been included in 4.3BSD.
+.PP
+Section 2 of the paper describes the tools and techniques
+available to us for measuring system performance.
+In Section 3 we present the results of using these tools, while Section 4
+has the performance improvements
+that have been made to the system based on our measurements.
+Section 5 highlights the functional enhancements that have
+been made to Berkeley UNIX 4.2BSD.
+Section 6 discusses some of the security problems that
+have been addressed.
diff --git a/share/doc/papers/sysperf/2.t b/share/doc/papers/sysperf/2.t
new file mode 100644
index 000000000000..7aa97335adae
--- /dev/null
+++ b/share/doc/papers/sysperf/2.t
@@ -0,0 +1,252 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Observation techniques
+.NH
+Observation techniques
+.PP
+There are many tools available for monitoring the performance
+of the system.
+Those that we found most useful are described below.
+.NH 2
+System maintenance tools
+.PP
+Several standard maintenance programs are invaluable in
+observing the basic actions of the system.
+The \fIvmstat\fP(1)
+program is designed to be an aid to monitoring
+systemwide activity. Together with the
+\fIps\fP\|(1)
+command (as in ``ps av''), it can be used to investigate systemwide
+virtual memory activity.
+By running \fIvmstat\fP
+when the system is active you can judge the system activity in several
+dimensions: job distribution, virtual memory load, paging and swapping
+activity, disk and cpu utilization.
+Ideally, to have a balanced system in activity,
+there should be few blocked (b) jobs,
+there should be little paging or swapping activity, there should
+be available bandwidth on the disk devices (most single arms peak
+out at 25-35 tps in practice), and the user cpu utilization (us) should
+be high (above 50%).
+.PP
+If the system is busy, then the count of active jobs may be large,
+and several of these jobs may often be blocked (b). If the virtual
+memory is active, then the paging demon will be running (sr will
+be non-zero). It is healthy for the paging demon to free pages when
+the virtual memory gets active; it is triggered by the amount of free
+memory dropping below a threshold and increases its pace as free memory
+goes to zero.
+.PP
+If you run \fIvmstat\fP
+when the system is busy (a ``vmstat 5'' gives all the
+numbers computed by the system), you can find
+imbalances by noting abnormal job distributions. If many
+processes are blocked (b), then the disk subsystem
+is overloaded or imbalanced. If you have several non-dma
+devices or open teletype lines that are ``ringing'', or user programs
+that are doing high-speed non-buffered input/output, then the system
+time may go high (60-80% or higher).
+It is often possible to pin down the cause of high system time by
+looking to see if there is excessive context switching (cs), interrupt
+activity (in) or system call activity (sy). Long term measurements
+on one of
+our large machines show
+an average of 60 context switches and interrupts
+per second and an average of 90 system calls per second.
+.PP
+If the system is heavily loaded, or if you have little memory
+for your load (1 megabyte is little in our environment), then the system
+may be forced to swap. This is likely to be accompanied by a noticeable
+reduction in the system responsiveness and long pauses when interactive
+jobs such as editors swap out.
+.PP
+A second important program is \fIiostat\fP\|(1).
+\fIIostat\fP
+iteratively reports the number of characters read and written to terminals,
+and, for each disk, the number of transfers per second, kilobytes
+transferred per second,
+and the milliseconds per average seek.
+It also gives the percentage of time the system has
+spent in user mode, in user mode running low priority (niced) processes,
+in system mode, and idling.
+.PP
+To compute this information, for each disk, seeks and data transfer completions
+and the number of words transferred are counted;
+for terminals collectively, the number
+of input and output characters are counted.
+Also, every 100 ms,
+the state of each disk is examined
+and a tally is made if the disk is active.
+From these numbers and the transfer rates
+of the devices it is possible to determine
+average seek times for each device.
+.PP
+When filesystems are poorly placed on the available
+disks, figures reported by \fIiostat\fP can be used
+to pinpoint bottlenecks. Under heavy system load, disk
+traffic should be spread out among the drives with
+higher traffic expected to the devices where the root, swap, and
+/tmp filesystems are located. When multiple disk drives are
+attached to the same controller, the system will
+attempt to overlap seek operations with I/O transfers. When
+seeks are performed, \fIiostat\fP will show
+non-zero average seek times. Most modern disk drives should
+exhibit an average seek time of 25-35 ms.
+.PP
+Terminal traffic reported by \fIiostat\fP should be heavily
+output oriented unless terminal lines are being used for
+data transfer by programs such as \fIuucp\fP. Input and
+output rates are system specific. Screen editors
+such as \fIvi\fP and \fIemacs\fP tend to exhibit output/input
+ratios of anywhere from 5/1 to 8/1. On one of our largest
+systems, 88 terminal lines plus 32 pseudo terminals, we observed
+an average of 180 characters/second input and 450 characters/second
+output over 4 days of operation.
+.NH 2
+Kernel profiling
+.PP
+It is simple to build a 4.2BSD kernel that will automatically
+collect profiling information as it operates simply by specifying the
+.B \-p
+option to \fIconfig\fP\|(8) when configuring a kernel.
+The program counter sampling can be driven by the system clock,
+or by an alternate real time clock.
+The latter is highly recommended as use of the system clock results
+in statistical anomalies in accounting for
+the time spent in the kernel clock routine.
+.PP
+Once a profiling system has been booted statistic gathering is
+handled by \fIkgmon\fP\|(8).
+\fIKgmon\fP allows profiling to be started and stopped
+and the internal state of the profiling buffers to be dumped.
+\fIKgmon\fP can also be used to reset the state of the internal
+buffers to allow multiple experiments to be run without
+rebooting the machine.
+.PP
+The profiling data is processed with \fIgprof\fP\|(1)
+to obtain information regarding the system's operation.
+Profiled systems maintain histograms of the kernel program counter,
+the number of invocations of each routine,
+and a dynamic call graph of the executing system.
+The postprocessing propagates the time spent in each
+routine along the arcs of the call graph.
+\fIGprof\fP then generates a listing for each routine in the kernel,
+sorted according to the time it uses
+including the time of its call graph descendents.
+Below each routine entry is shown its (direct) call graph children,
+and how their times are propagated to this routine.
+A similar display above the routine shows how this routine's time and the
+time of its descendents is propagated to its (direct) call graph parents.
+.PP
+A profiled system is about 5-10% larger in its text space because of
+the calls to count the subroutine invocations.
+When the system executes,
+the profiling data is stored in a buffer that is 1.2
+times the size of the text space.
+All the information is summarized in memory,
+it is not necessary to have a trace file
+being continuously dumped to disk.
+The overhead for running a profiled system varies;
+under normal load we see anywhere from 5-25%
+of the system time spent in the profiling code.
+Thus the system is noticeably slower than an unprofiled system,
+yet is not so bad that it cannot be used in a production environment.
+This is important since it allows us to gather data
+in a real environment rather than trying to
+devise synthetic work loads.
+.NH 2
+Kernel tracing
+.PP
+The kernel can be configured to trace certain operations by
+specifying ``options TRACE'' in the configuration file. This
+forces the inclusion of code that records the occurrence of
+events in \fItrace records\fP in a circular buffer in kernel
+memory. Events may be enabled/disabled selectively while the
+system is operating. Each trace record contains a time stamp
+(taken from the VAX hardware time of day clock register), an
+event identifier, and additional information that is interpreted
+according to the event type. Buffer cache operations, such as
+initiating a read, include
+the disk drive, block number, and transfer size in the trace record.
+Virtual memory operations, such as a pagein completing, include
+the virtual address and process id in the trace record. The circular
+buffer is normally configured to hold 256 16-byte trace records.\**
+.FS
+\** The standard trace facilities distributed with 4.2
+differ slightly from those described here. The time stamp in the
+distributed system is calculated from the kernel's time of day
+variable instead of the VAX hardware register, and the buffer cache
+trace points do not record the transfer size.
+.FE
+.PP
+Several user programs were written to sample and interpret the
+tracing information. One program runs in the background and
+periodically reads the circular buffer of trace records. The
+trace information is compressed, in some instances interpreted
+to generate additional information, and a summary is written to a
+file. In addition, the sampling program can also record
+information from other kernel data structures, such as those
+interpreted by the \fIvmstat\fP program. Data written out to
+a file is further buffered to minimize I/O load.
+.PP
+Once a trace log has been created, programs that compress
+and interpret the data may be run to generate graphs showing the
+data and relationships between traced events and
+system load.
+.PP
+The trace package was used mainly to investigate the operation of
+the file system buffer cache. The sampling program maintained a
+history of read-ahead blocks and used the trace information to
+calculate, for example, percentage of read-ahead blocks used.
+.NH 2
+Benchmark programs
+.PP
+Benchmark programs were used in two ways. First, a suite of
+programs was constructed to calculate the cost of certain basic
+system operations. Operations such as system call overhead and
+context switching time are critically important in evaluating the
+overall performance of a system. Because of the drastic changes in
+the system between 4.1BSD and 4.2BSD, it was important to verify
+the overhead of these low level operations had not changed appreciably.
+.PP
+The second use of benchmarks was in exercising
+suspected bottlenecks.
+When we suspected a specific problem with the system,
+a small benchmark program was written to repeatedly use
+the facility.
+While these benchmarks are not useful as a general tool
+they can give quick feedback on whether a hypothesized
+improvement is really having an effect.
+It is important to realize that the only real assurance
+that a change has a beneficial effect is through
+long term measurements of general timesharing.
+We have numerous examples where a benchmark program
+suggests vast improvements while the change
+in the long term system performance is negligible,
+and conversely examples in which the benchmark program runs more slowly,
+but the long term system performance improves significantly.
diff --git a/share/doc/papers/sysperf/3.t b/share/doc/papers/sysperf/3.t
new file mode 100644
index 000000000000..fb3540491264
--- /dev/null
+++ b/share/doc/papers/sysperf/3.t
@@ -0,0 +1,688 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Results of our observations
+.NH
+Results of our observations
+.PP
+When 4.2BSD was first installed on several large timesharing systems
+the degradation in performance was significant.
+Informal measurements showed 4.2BSD providing 80% of the throughput
+of 4.1BSD (based on load averages observed under a normal timesharing load).
+Many of the initial problems found were because of programs that were
+not part of 4.1BSD. Using the techniques described in the previous
+section and standard process profiling several problems were identified.
+Later work concentrated on the operation of the kernel itself.
+In this section we discuss the problems uncovered; in the next
+section we describe the changes made to the system.
+.NH 2
+User programs
+.PP
+.NH 3
+Mail system
+.PP
+The mail system was the first culprit identified as a major
+contributor to the degradation in system performance.
+At Lucasfilm the mail system is heavily used
+on one machine, a VAX-11/780 with eight megabytes of memory.\**
+.FS
+\** During part of these observations the machine had only four
+megabytes of memory.
+.FE
+Message
+traffic is usually between users on the same machine and ranges from
+person-to-person telephone messages to per-organization distribution
+lists. After conversion to 4.2BSD, it was
+immediately noticed that mail to distribution lists of 20 or more people
+caused the system load to jump by anywhere from 3 to 6 points.
+The number of processes spawned by the \fIsendmail\fP program and
+the messages sent from \fIsendmail\fP to the system logging
+process, \fIsyslog\fP, generated significant load both from their
+execution and their interference with basic system operation. The
+number of context switches and disk transfers often doubled while
+\fIsendmail\fP operated; the system call rate jumped dramatically.
+System accounting information consistently
+showed \fIsendmail\fP as the top cpu user on the system.
+.NH 3
+Network servers
+.PP
+The network services provided in 4.2BSD add new capabilities to the system,
+but are not without cost. The system uses one daemon process to accept
+requests for each network service provided. The presence of many
+such daemons increases the numbers of active processes and files,
+and requires a larger configuration to support the same number of users.
+The overhead of the routing and status updates can consume
+several percent of the cpu.
+Remote logins and shells incur more overhead
+than their local equivalents.
+For example, a remote login uses three processes and a
+pseudo-terminal handler in addition to the local hardware terminal
+handler. When using a screen editor, sending and echoing a single
+character involves four processes on two machines.
+The additional processes, context switching, network traffic, and
+terminal handler overhead can roughly triple the load presented by one
+local terminal user.
+.NH 2
+System overhead
+.PP
+To measure the costs of various functions in the kernel,
+a profiling system was run for a 17 hour
+period on one of our general timesharing machines.
+While this is not as reproducible as a synthetic workload,
+it certainly represents a realistic test.
+This test was run on several occasions over a three month period.
+Despite the long period of time that elapsed
+between the test runs the shape of the profiles,
+as measured by the number of times each system call
+entry point was called, were remarkably similar.
+.PP
+These profiles turned up several bottlenecks that are
+discussed in the next section.
+Several of these were new to 4.2BSD,
+but most were caused by overloading of mechanisms
+which worked acceptably well in previous BSD systems.
+The general conclusion from our measurements was that
+the ratio of user to system time had increased from
+45% system / 55% user in 4.1BSD to 57% system / 43% user
+in 4.2BSD.
+.NH 3
+Micro-operation benchmarks
+.PP
+To compare certain basic system operations
+between 4.1BSD and 4.2BSD a suite of benchmark
+programs was constructed and run on a VAX-11/750 with 4.5 megabytes
+of physical memory and two disks on a MASSBUS controller.
+Tests were run with the machine operating in single user mode
+under both 4.1BSD and 4.2BSD. Paging was localized to the drive
+where the root file system was located.
+.PP
+The benchmark programs were modeled after the Kashtan benchmarks,
+[Kashtan80], with identical sources compiled under each system.
+The programs and their intended purpose are described briefly
+before the presentation of the results. The benchmark scripts
+were run twice with the results shown as the average of
+the two runs.
+The source code for each program and the shell scripts used during
+the benchmarks are included in the Appendix.
+.PP
+The set of tests shown in Table 1 was concerned with
+system operations other than paging. The intent of most
+benchmarks is clear. The result of running \fIsignocsw\fP is
+deducted from the \fIcsw\fP benchmark to calculate the context
+switch overhead. The \fIexec\fP tests use two different jobs to gauge
+the cost of overlaying a larger program with a smaller one
+and vice versa. The
+``null job'' and ``big job'' differ solely in the size of their data
+segments, 1 kilobyte versus 256 kilobytes. In both cases the
+text segment of the parent is larger than that of the child.\**
+.FS
+\** These tests should also have measured the cost of expanding the
+text segment; unfortunately time did not permit running additional tests.
+.FE
+All programs were compiled into the default load format that causes
+the text segment to be demand paged out of the file system and shared
+between processes.
+.KF
+.DS L
+.TS
+center box;
+l | l.
+Test Description
+_
+syscall perform 100,000 \fIgetpid\fP system calls
+csw perform 10,000 context switches using signals
+signocsw send 10,000 signals to yourself
+pipeself4 send 10,000 4-byte messages to yourself
+pipeself512 send 10,000 512-byte messages to yourself
+pipediscard4 send 10,000 4-byte messages to child who discards
+pipediscard512 send 10,000 512-byte messages to child who discards
+pipeback4 exchange 10,000 4-byte messages with child
+pipeback512 exchange 10,000 512-byte messages with child
+forks0 fork-exit-wait 1,000 times
+forks1k sbrk(1024), fault page, fork-exit-wait 1,000 times
+forks100k sbrk(102400), fault pages, fork-exit-wait 1,000 times
+vforks0 vfork-exit-wait 1,000 times
+vforks1k sbrk(1024), fault page, vfork-exit-wait 1,000 times
+vforks100k sbrk(102400), fault pages, vfork-exit-wait 1,000 times
+execs0null fork-exec ``null job''-exit-wait 1,000 times
+execs0null (1K env) execs0null above, with 1K environment added
+execs1knull sbrk(1024), fault page, fork-exec ``null job''-exit-wait 1,000 times
+execs1knull (1K env) execs1knull above, with 1K environment added
+execs100knull sbrk(102400), fault pages, fork-exec ``null job''-exit-wait 1,000 times
+vexecs0null vfork-exec ``null job''-exit-wait 1,000 times
+vexecs1knull sbrk(1024), fault page, vfork-exec ``null job''-exit-wait 1,000 times
+vexecs100knull sbrk(102400), fault pages, vfork-exec ``null job''-exit-wait 1,000 times
+execs0big fork-exec ``big job''-exit-wait 1,000 times
+execs1kbig sbrk(1024), fault page, fork-exec ``big job''-exit-wait 1,000 times
+execs100kbig sbrk(102400), fault pages, fork-exec ``big job''-exit-wait 1,000 times
+vexecs0big vfork-exec ``big job''-exit-wait 1,000 times
+vexecs1kbig sbrk(1024), fault pages, vfork-exec ``big job''-exit-wait 1,000 times
+vexecs100kbig sbrk(102400), fault pages, vfork-exec ``big job''-exit-wait 1,000 times
+.TE
+.ce
+Table 1. Kernel Benchmark programs.
+.DE
+.KE
+.PP
+The results of these tests are shown in Table 2. If the 4.1BSD results
+are scaled to reflect their being run on a VAX-11/750, they
+correspond closely to those found in [Joy80].\**
+.FS
+\** We assume that a VAX-11/750 runs at 60% of the speed of a VAX-11/780
+(not considering floating point operations).
+.FE
+.KF
+.DS L
+.TS
+center box;
+c s s s s s s s s s
+c || c s s || c s s || c s s
+c || c s s || c s s || c s s
+c || c | c | c || c | c | c || c | c | c
+l || n | n | n || n | n | n || n | n | n.
+Berkeley Software Distribution UNIX Systems
+_
+Test Elapsed Time User Time System Time
+\^ _ _ _
+\^ 4.1 4.2 4.3 4.1 4.2 4.3 4.1 4.2 4.3
+=
+syscall 28.0 29.0 23.0 4.5 5.3 3.5 23.9 23.7 20.4
+csw 45.0 60.0 45.0 3.5 4.3 3.3 19.5 25.4 19.0
+signocsw 16.5 23.0 16.0 1.9 3.0 1.1 14.6 20.1 15.2
+pipeself4 21.5 29.0 26.0 1.1 1.1 0.8 20.1 28.0 25.6
+pipeself512 47.5 59.0 55.0 1.2 1.2 1.0 46.1 58.3 54.2
+pipediscard4 32.0 42.0 36.0 3.2 3.7 3.0 15.5 18.8 15.6
+pipediscard512 61.0 76.0 69.0 3.1 2.1 2.0 29.7 36.4 33.2
+pipeback4 57.0 75.0 66.0 2.9 3.2 3.3 25.1 34.2 29.7
+pipeback512 110.0 138.0 125.0 3.1 3.4 2.2 52.2 65.7 57.7
+forks0 37.5 41.0 22.0 0.5 0.3 0.3 34.5 37.6 21.5
+forks1k 40.0 43.0 22.0 0.4 0.3 0.3 36.0 38.8 21.6
+forks100k 217.5 223.0 176.0 0.7 0.6 0.4 214.3 218.4 175.2
+vforks0 34.5 37.0 22.0 0.5 0.6 0.5 27.3 28.5 17.9
+vforks1k 35.0 37.0 22.0 0.6 0.8 0.5 27.2 28.6 17.9
+vforks100k 35.0 37.0 22.0 0.6 0.8 0.6 27.6 28.9 17.9
+execs0null 97.5 92.0 66.0 3.8 2.4 0.6 68.7 82.5 48.6
+execs0null (1K env) 197.0 229.0 75.0 4.1 2.6 0.9 167.8 212.3 62.6
+execs1knull 99.0 100.0 66.0 4.1 1.9 0.6 70.5 86.8 48.7
+execs1knull (1K env) 199.0 230.0 75.0 4.2 2.6 0.7 170.4 214.9 62.7
+execs100knull 283.5 278.0 216.0 4.8 2.8 1.1 251.9 269.3 202.0
+vexecs0null 100.0 92.0 66.0 5.1 2.7 1.1 63.7 76.8 45.1
+vexecs1knull 100.0 91.0 66.0 5.2 2.8 1.1 63.2 77.1 45.1
+vexecs100knull 100.0 92.0 66.0 5.1 3.0 1.1 64.0 77.7 45.6
+execs0big 129.0 201.0 101.0 4.0 3.0 1.0 102.6 153.5 92.7
+execs1kbig 130.0 202.0 101.0 3.7 3.0 1.0 104.7 155.5 93.0
+execs100kbig 318.0 385.0 263.0 4.8 3.1 1.1 286.6 339.1 247.9
+vexecs0big 128.0 200.0 101.0 4.6 3.5 1.6 98.5 149.6 90.4
+vexecs1kbig 125.0 200.0 101.0 4.7 3.5 1.3 98.9 149.3 88.6
+vexecs100kbig 126.0 200.0 101.0 4.2 3.4 1.3 99.5 151.0 89.0
+.TE
+.ce
+Table 2. Kernel Benchmark results (all times in seconds).
+.DE
+.KE
+.PP
+In studying the measurements we found that the basic system call
+and context switch overhead did not change significantly
+between 4.1BSD and 4.2BSD. The \fIsignocsw\fP results were caused by
+the changes to the \fIsignal\fP interface, resulting
+in an additional subroutine invocation for each call, not
+to mention additional complexity in the system's implementation.
+.PP
+The times for the use of pipes are significantly higher under
+4.2BSD because of their implementation on top of the interprocess
+communication facilities. Under 4.1BSD pipes were implemented
+without the complexity of the socket data structures and with
+simpler code. Further, while not obviously a factor here,
+4.2BSD pipes have less system buffer space provided them than
+4.1BSD pipes.
+.PP
+The \fIexec\fP tests shown in Table 2 were performed with 34 bytes of
+environment information under 4.1BSD and 40 bytes under 4.2BSD.
+To figure the cost of passing data through the environment,
+the execs0null and execs1knull tests were rerun with
+1065 additional bytes of data. The results are shown in Table 3.
+.KF
+.DS L
+.TS
+center box;
+c || c s || c s || c s
+c || c s || c s || c s
+c || c | c || c | c || c | c
+l || n | n || n | n || n | n.
+Test Real User System
+\^ _ _ _
+\^ 4.1 4.2 4.1 4.2 4.1 4.2
+=
+execs0null 197.0 229.0 4.1 2.6 167.8 212.3
+execs1knull 199.0 230.0 4.2 2.6 170.4 214.9
+.TE
+.ce
+Table 3. Benchmark results with ``large'' environment (all times in seconds).
+.DE
+.KE
+These results show that passing argument data is significantly
+slower than under 4.1BSD: 121 ms/byte versus 93 ms/byte. Even using
+this factor to adjust the basic overhead of an \fIexec\fP system
+call, this facility is more costly under 4.2BSD than under 4.1BSD.
+.NH 3
+Path name translation
+.PP
+The single most expensive function performed by the kernel
+is path name translation.
+This has been true in almost every UNIX kernel [Mosher80];
+we find that our general time sharing systems do about
+500,000 name translations per day.
+.PP
+Name translations became more expensive in 4.2BSD for several reasons.
+The single most expensive addition was the symbolic link.
+Symbolic links
+have the effect of increasing the average number of components
+in path names to be translated.
+As an insidious example,
+consider the system manager that decides to change /tmp
+to be a symbolic link to /usr/tmp.
+A name such as /tmp/tmp1234 that previously required two component
+translations,
+now requires four component translations plus the cost of reading
+the contents of the symbolic link.
+.PP
+The new directory format also changes the characteristics of
+name translation.
+The more complex format requires more computation to determine
+where to place new entries in a directory.
+Conversely the additional information allows the system to only
+look at active entries when searching,
+hence searches of directories that had once grown large
+but currently have few active entries are checked quickly.
+The new format also stores the length of each name so that
+costly string comparisons are only done on names that are the
+same length as the name being sought.
+.PP
+The net effect of the changes is that the average time to
+translate a path name in 4.2BSD is 24.2 milliseconds,
+representing 40% of the time processing system calls,
+that is 19% of the total cycles in the kernel,
+or 11% of all cycles executed on the machine.
+The times are shown in Table 4. We have no comparable times
+for \fInamei\fP under 4.1 though they are certain to
+be significantly less.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 14.3 ms/call 11.3%
+child 9.9 ms/call 7.9%
+_
+total 24.2 ms/call 19.2%
+.TE
+.ce
+Table 4. Call times for \fInamei\fP in 4.2BSD.
+.DE
+.KE
+.NH 3
+Clock processing
+.PP
+Nearly 25% of the time spent in the kernel is spent in the clock
+processing routines.
+(This is a clear indication that to avoid sampling bias when profiling the
+kernel with our tools
+we need to drive them from an independent clock.)
+These routines are responsible for implementing timeouts,
+scheduling the processor,
+maintaining kernel statistics,
+and tending various hardware operations such as
+draining the terminal input silos.
+Only minimal work is done in the hardware clock interrupt
+routine (at high priority), the rest is performed (at a lower priority)
+in a software interrupt handler scheduled by the hardware interrupt
+handler.
+In the worst case, with a clock rate of 100 Hz
+and with every hardware interrupt scheduling a software
+interrupt, the processor must field 200 interrupts per second.
+The overhead of simply trapping and returning
+is 3% of the machine cycles,
+figuring out that there is nothing to do
+requires an additional 2%.
+.NH 3
+Terminal multiplexors
+.PP
+The terminal multiplexors supported by 4.2BSD have programmable receiver
+silos that may be used in two ways.
+With the silo disabled, each character received causes an interrupt
+to the processor.
+Enabling the receiver silo allows the silo to fill before
+generating an interrupt, allowing multiple characters to be read
+for each interrupt.
+At low rates of input, received characters will not be processed
+for some time unless the silo is emptied periodically.
+The 4.2BSD kernel uses the input silos of each terminal multiplexor,
+and empties each silo on each clock interrupt.
+This allows high input rates without the cost of per-character interrupts
+while assuring low latency.
+However, as character input rates on most machines are usually
+low (about 25 characters per second),
+this can result in excessive overhead.
+At the current clock rate of 100 Hz, a machine with 5 terminal multiplexors
+configured makes 500 calls to the receiver interrupt routines per second.
+In addition, to achieve acceptable input latency
+for flow control, each clock interrupt must schedule
+a software interrupt to run the silo draining routines.\**
+.FS
+\** It is not possible to check the input silos at
+the time of the actual clock interrupt without modifying the terminal
+line disciplines, as the input queues may not be in a consistent state \**.
+.FE
+\** This implies that the worst case estimate for clock processing
+is the basic overhead for clock processing.
+.NH 3
+Process table management
+.PP
+In 4.2BSD there are numerous places in the kernel where a linear search
+of the process table is performed:
+.IP \(bu 3
+in \fIexit\fP to locate and wakeup a process's parent;
+.IP \(bu 3
+in \fIwait\fP when searching for \fB\s-2ZOMBIE\s+2\fP and
+\fB\s-2STOPPED\s+2\fP processes;
+.IP \(bu 3
+in \fIfork\fP when allocating a new process table slot and
+counting the number of processes already created by a user;
+.IP \(bu 3
+in \fInewproc\fP, to verify
+that a process id assigned to a new process is not currently
+in use;
+.IP \(bu 3
+in \fIkill\fP and \fIgsignal\fP to locate all processes to
+which a signal should be delivered;
+.IP \(bu 3
+in \fIschedcpu\fP when adjusting the process priorities every
+second; and
+.IP \(bu 3
+in \fIsched\fP when locating a process to swap out and/or swap
+in.
+.LP
+These linear searches can incur significant overhead. The rule
+for calculating the size of the process table is:
+.ce
+nproc = 20 + 8 * maxusers
+.sp
+that means a 48 user system will have a 404 slot process table.
+With the addition of network services in 4.2BSD, as many as a dozen
+server processes may be maintained simply to await incoming requests.
+These servers are normally created at boot time which causes them
+to be allocated slots near the beginning of the process table. This
+means that process table searches under 4.2BSD are likely to take
+significantly longer than under 4.1BSD. System profiling shows
+that as much as 20% of the time spent in the kernel on a loaded
+system (a VAX-11/780) can be spent in \fIschedcpu\fP and, on average,
+5-10% of the kernel time is spent in \fIschedcpu\fP.
+The other searches of the proc table are similarly affected.
+This shows the system can no longer tolerate using linear searches of
+the process table.
+.NH 3
+File system buffer cache
+.PP
+The trace facilities described in section 2.3 were used
+to gather statistics on the performance of the buffer cache.
+We were interested in measuring the effectiveness of the
+cache and the read-ahead policies.
+With the file system block size in 4.2BSD four to
+eight times that of a 4.1BSD file system, we were concerned
+that large amounts of read-ahead might be performed without
+being used. Also, we were interested in seeing if the
+rules used to size the buffer cache at boot time were severely
+affecting the overall cache operation.
+.PP
+The tracing package was run over a three hour period during
+a peak mid-afternoon period on a VAX 11/780 with four megabytes
+of physical memory.
+This resulted in a buffer cache containing 400 kilobytes of memory
+spread among 50 to 200 buffers
+(the actual number of buffers depends on the size mix of
+disk blocks being read at any given time).
+The pertinent configuration information is shown in Table 5.
+.KF
+.DS L
+.TS
+center box;
+l l l l.
+Controller Drive Device File System
+_
+DEC MASSBUS DEC RP06 hp0d /usr
+ hp0b swap
+Emulex SC780 Fujitsu Eagle hp1a /usr/spool/news
+ hp1b swap
+ hp1e /usr/src
+ hp1d /u0 (users)
+ Fujitsu Eagle hp2a /tmp
+ hp2b swap
+ hp2d /u1 (users)
+ Fujitsu Eagle hp3a /
+.TE
+.ce
+Table 5. Active file systems during buffer cache tests.
+.DE
+.KE
+.PP
+During the test period the load average ranged from 2 to 13
+with an average of 5.
+The system had no idle time, 43% user time, and 57% system time.
+The system averaged 90 interrupts per second
+(excluding the system clock interrupts),
+220 system calls per second,
+and 50 context switches per second (40 voluntary, 10 involuntary).
+.PP
+The active virtual memory (the sum of the address space sizes of
+all jobs that have run in the previous twenty seconds)
+over the period ranged from 2 to 6 megabytes with an average
+of 3.5 megabytes.
+There was no swapping, though the page daemon was inspecting
+about 25 pages per second.
+.PP
+On average 250 requests to read disk blocks were initiated
+per second.
+These include read requests for file blocks made by user
+programs as well as requests initiated by the system.
+System reads include requests for indexing information to determine
+where a file's next data block resides,
+file system layout maps to allocate new data blocks,
+and requests for directory contents needed to do path name translations.
+.PP
+On average, an 85% cache hit rate was observed for read requests.
+Thus only 37 disk reads were initiated per second.
+In addition, 5 read-ahead requests were made each second
+filling about 20% of the buffer pool.
+Despite the policies to rapidly reuse read-ahead buffers
+that remain unclaimed, more than 90% of the read-ahead
+buffers were used.
+.PP
+These measurements showed that the buffer cache was working
+effectively. Independent tests have also shown that the size
+of the buffer cache may be reduced significantly on memory-poor
+systems without severe effects;
+we have not yet tested this hypothesis [Shannon83].
+.NH 3
+Network subsystem
+.PP
+The overhead associated with the
+network facilities found in 4.2BSD is often
+difficult to gauge without profiling the system.
+This is because most input processing is performed
+in modules scheduled with software interrupts.
+As a result, the system time spent performing protocol
+processing is rarely attributed to the processes that
+really receive the data. Since the protocols supported
+by 4.2BSD can involve significant overhead this was a serious
+concern. Results from a profiled kernel show an average
+of 5% of the system time is spent
+performing network input and timer processing in our environment
+(a 3Mb/s Ethernet with most traffic using TCP).
+This figure can vary significantly depending on
+the network hardware used, the average message
+size, and whether packet reassembly is required at the network
+layer. On one machine we profiled over a 17 hour
+period (our gateway to the ARPANET)
+206,000 input messages accounted for 2.4% of the system time,
+while another 0.6% of the system time was spent performing
+protocol timer processing.
+This machine was configured with an ACC LH/DH IMP interface
+and a DMA 3Mb/s Ethernet controller.
+.PP
+The performance of TCP over slower long-haul networks
+was degraded substantially by two problems.
+The first problem was a bug that prevented round-trip timing measurements
+from being made, thus increasing retransmissions unnecessarily.
+The second was a problem with the maximum segment size chosen by TCP,
+that was well-tuned for Ethernet, but was poorly chosen for
+the ARPANET, where it causes packet fragmentation. (The maximum
+segment size was actually negotiated upwards to a value that
+resulted in excessive fragmentation.)
+.PP
+When benchmarked in Ethernet environments the main memory buffer management
+of the network subsystem presented some performance anomalies.
+The overhead of processing small ``mbufs'' severely affected throughput for a
+substantial range of message sizes.
+In spite of the fact that most system utilities made use of the throughput
+optimal 1024 byte size, user processes faced large degradations for some
+arbitrary sizes. This was especially true for TCP/IP transmissions [Cabrera84,
+Cabrera85].
+.NH 3
+Virtual memory subsystem
+.PP
+We ran a set of tests intended to exercise the virtual
+memory system under both 4.1BSD and 4.2BSD.
+The tests are described in Table 6.
+The test programs dynamically allocated
+a 7.3 Megabyte array (using \fIsbrk\fP\|(2)) then referenced
+pages in the array either: sequentially, in a purely random
+fashion, or such that the distance between
+successive pages accessed was randomly selected from a Gaussian
+distribution. In the last case, successive runs were made with
+increasing standard deviations.
+.KF
+.DS L
+.TS
+center box;
+l | l.
+Test Description
+_
+seqpage sequentially touch pages, 10 iterations
+seqpage-v as above, but first make \fIvadvise\fP\|(2) call
+randpage touch random page 30,000 times
+randpage-v as above, but first make \fIvadvise\fP call
+gausspage.1 30,000 Gaussian accesses, standard deviation of 1
+gausspage.10 as above, standard deviation of 10
+gausspage.30 as above, standard deviation of 30
+gausspage.40 as above, standard deviation of 40
+gausspage.50 as above, standard deviation of 50
+gausspage.60 as above, standard deviation of 60
+gausspage.80 as above, standard deviation of 80
+gausspage.inf as above, standard deviation of 10,000
+.TE
+.ce
+Table 6. Paging benchmark programs.
+.DE
+.KE
+.PP
+The results in Table 7 show how the additional
+memory requirements
+of 4.2BSD can generate more work for the paging system.
+Under 4.1BSD,
+the system used 0.5 of the 4.5 megabytes of physical memory
+on the test machine;
+under 4.2BSD it used nearly 1 megabyte of physical memory.\**
+.FS
+\** The 4.1BSD system used for testing was really a 4.1a
+system configured
+with networking facilities and code to support
+remote file access. The
+4.2BSD system also included the remote file access code.
+Since both
+systems would be larger than similarly configured ``vanilla''
+4.1BSD or 4.2BSD system, we consider our conclusions to still be valid.
+.FE
+This resulted in more page faults and, hence, more system time.
+To establish a common ground on which to compare the paging
+routines of each system, we check instead the average page fault
+service times for those test runs that had a statistically significant
+number of random page faults. These figures, shown in Table 8, show
+no significant difference between the two systems in
+the area of page fault servicing. We currently have
+no explanation for the results of the sequential
+paging tests.
+.KF
+.DS L
+.TS
+center box;
+l || c s || c s || c s || c s
+l || c s || c s || c s || c s
+l || c | c || c | c || c | c || c | c
+l || n | n || n | n || n | n || n | n.
+Test Real User System Page Faults
+\^ _ _ _ _
+\^ 4.1 4.2 4.1 4.2 4.1 4.2 4.1 4.2
+=
+seqpage 959 1126 16.7 12.8 197.0 213.0 17132 17113
+seqpage-v 579 812 3.8 5.3 216.0 237.7 8394 8351
+randpage 571 569 6.7 7.6 64.0 77.2 8085 9776
+randpage-v 572 562 6.1 7.3 62.2 77.5 8126 9852
+gausspage.1 25 24 23.6 23.8 0.8 0.8 8 8
+gausspage.10 26 26 22.7 23.0 3.2 3.6 2 2
+gausspage.30 34 33 25.0 24.8 8.6 8.9 2 2
+gausspage.40 42 81 23.9 25.0 11.5 13.6 3 260
+gausspage.50 113 175 24.2 26.2 19.6 26.3 784 1851
+gausspage.60 191 234 27.6 26.7 27.4 36.0 2067 3177
+gausspage.80 312 329 28.0 27.9 41.5 52.0 3933 5105
+gausspage.inf 619 621 82.9 85.6 68.3 81.5 8046 9650
+.TE
+.ce
+Table 7. Paging benchmark results (all times in seconds).
+.DE
+.KE
+.KF
+.DS L
+.TS
+center box;
+c || c s || c s
+c || c s || c s
+c || c | c || c | c
+l || n | n || n | n.
+Test Page Faults PFST
+\^ _ _
+\^ 4.1 4.2 4.1 4.2
+=
+randpage 8085 9776 791 789
+randpage-v 8126 9852 765 786
+gausspage.inf 8046 9650 848 844
+.TE
+.ce
+Table 8. Page fault service times (all times in microseconds).
+.DE
+.KE
diff --git a/share/doc/papers/sysperf/4.t b/share/doc/papers/sysperf/4.t
new file mode 100644
index 000000000000..6c605911498b
--- /dev/null
+++ b/share/doc/papers/sysperf/4.t
@@ -0,0 +1,768 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Performance Improvements
+.NH
+Performance Improvements
+.PP
+This section outlines the changes made to the system
+since the 4.2BSD distribution.
+The changes reported here were made in response
+to the problems described in Section 3.
+The improvements fall into two major classes;
+changes to the kernel that are described in this section,
+and changes to the system libraries and utilities that are
+described in the following section.
+.NH 2
+Performance Improvements in the Kernel
+.PP
+Our goal has been to optimize system performance
+for our general timesharing environment.
+Since most sites running 4.2BSD have been forced to take
+advantage of declining
+memory costs rather than replace their existing machines with
+ones that are more powerful, we have
+chosen to optimize running time at the expense of memory.
+This tradeoff may need to be reconsidered for personal workstations
+that have smaller memories and higher latency disks.
+Decreases in the running time of the system may be unnoticeable
+because of higher paging rates incurred by a larger kernel.
+Where possible, we have allowed the size of caches to be controlled
+so that systems with limited memory may reduce them as appropriate.
+.NH 3
+Name Cacheing
+.PP
+Our initial profiling studies showed that more than one quarter
+of the time in the system was spent in the
+pathname translation routine, \fInamei\fP,
+translating path names to inodes\u\s-21\s0\d\**.
+.FS
+\** \u\s-21\s0\d Inode is an abbreviation for ``Index node''.
+Each file on the system is described by an inode;
+the inode maintains access permissions, and an array of pointers to
+the disk blocks that hold the data associated with the file.
+.FE
+An inspection of \fInamei\fP shows that
+it consists of two nested loops.
+The outer loop is traversed once per pathname component.
+The inner loop performs a linear search through a directory looking
+for a particular pathname component.
+.PP
+Our first idea was to reduce the number of iterations
+around the inner loop of \fInamei\fP by observing that many programs
+step through a directory performing an operation on each entry in turn.
+To improve performance for processes doing directory scans,
+the system keeps track of the directory offset of the last component of the
+most recently translated path name for each process.
+If the next name the process requests is in the same directory,
+the search is started from the offset that the previous name was found
+(instead of from the beginning of the directory).
+Changing directories invalidates the cache, as
+does modifying the directory.
+For programs that step sequentially through a directory with
+.EQ
+delim $$
+.EN
+$N$ files, search time decreases from $O ( N sup 2 )$ to $O(N)$.
+.EQ
+delim off
+.EN
+.PP
+The cost of the cache is about 20 lines of code
+(about 0.2 kilobytes)
+and 16 bytes per process, with the cached data
+stored in a process's \fIuser\fP vector.
+.PP
+As a quick benchmark to verify the maximum effectiveness of the
+cache we ran ``ls \-l''
+on a directory containing 600 files.
+Before the per-process cache this command
+used 22.3 seconds of system time.
+After adding the cache the program used the same amount
+of user time, but the system time dropped to 3.3 seconds.
+.PP
+This change prompted our rerunning a profiled system
+on a machine containing the new \fInamei\fP.
+The results showed that the time in \fInamei\fP
+dropped by only 2.6 ms/call and
+still accounted for 36% of the system call time,
+18% of the kernel, or about 10% of all the machine cycles.
+This amounted to a drop in system time from 57% to about 55%.
+The results are shown in Table 9.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 11.0 ms/call 9.2%
+child 10.6 ms/call 8.9%
+_
+total 21.6 ms/call 18.1%
+.TE
+.ce
+Table 9. Call times for \fInamei\fP with per-process cache.
+.DE
+.KE
+.PP
+The small performance improvement
+was caused by a low cache hit ratio.
+Although the cache was 90% effective when hit,
+it was only usable on about 25% of the names being translated.
+An additional reason for the small improvement was that
+although the amount of time spent in \fInamei\fP itself
+decreased substantially,
+more time was spent in the routines that it called
+since each directory had to be accessed twice;
+once to search from the middle to the end,
+and once to search from the beginning to the middle.
+.PP
+Frequent requests for a small set of names are best handled
+with a cache of recent name translations\**.
+.FS
+\** The cache is keyed on a name and the
+inode and device number of the directory that contains it.
+Associated with each entry is a pointer to the corresponding
+entry in the inode table.
+.FE
+This has the effect of eliminating the inner loop of \fInamei\fP.
+For each path name component,
+\fInamei\fP first looks in its cache of recent translations
+for the needed name.
+If it exists, the directory search can be completely eliminated.
+.PP
+The system already maintained a cache of recently accessed inodes,
+so the initial name cache
+maintained a simple name-inode association that was used to
+check each component of a path name during name translations.
+We considered implementing the cache by tagging each inode
+with its most recently translated name,
+but eventually decided to have a separate data structure that
+kept names with pointers to the inode table.
+Tagging inodes has two drawbacks;
+many inodes such as those associated with login ports remain in
+the inode table for a long period of time, but are never looked
+up by name.
+Other inodes, such as those describing directories are looked up
+frequently by many different names (\fIe.g.\fP ``..'').
+By keeping a separate table of names, the cache can
+truly reflect the most recently used names.
+An added benefit is that the table can be sized independently
+of the inode table, so that machines with small amounts of memory
+can reduce the size of the cache (or even eliminate it)
+without modifying the inode table structure.
+.PP
+Another issue to be considered is how the name cache should
+hold references to the inode table.
+Normally processes hold ``hard references'' by incrementing the
+reference count in the inode they reference.
+Since the system reuses only inodes with zero reference counts,
+a hard reference insures that the inode pointer will remain valid.
+However, if the name cache holds hard references,
+it is limited to some fraction of the size of the inode table,
+since some inodes must be left free for new files.
+It also makes it impossible for other parts of the kernel
+to verify sole use of a device or file.
+These reasons made it impractical to use hard references
+without affecting the behavior of the inode caching scheme.
+Thus, we chose instead to keep ``soft references'' protected
+by a \fIcapability\fP \- a 32-bit number
+guaranteed to be unique\u\s-22\s0\d \**.
+.FS
+\** \u\s-22\s0\d When all the numbers have been exhausted, all outstanding
+capabilities are purged and numbering starts over from scratch.
+Purging is possible as all capabilities are easily found in kernel memory.
+.FE
+When an entry is made in the name cache,
+the capability of its inode is copied to the name cache entry.
+When an inode is reused it is issued a new capability.
+When a name cache hit occurs,
+the capability of the name cache entry is compared
+with the capability of the inode that it references.
+If the capabilities do not match, the name cache entry is invalid.
+Since the name cache holds only soft references,
+it may be sized independent of the size of the inode table.
+A final benefit of using capabilities is that all
+cached names for an inode may be invalidated without
+searching through the entire cache;
+instead all you need to do is assign a new capability to the inode.
+.PP
+The cost of the name cache is about 200 lines of code
+(about 1.2 kilobytes)
+and 48 bytes per cache entry.
+Depending on the size of the system,
+about 200 to 1000 entries will normally be configured,
+using 10-50 kilobytes of physical memory.
+The name cache is resident in memory at all times.
+.PP
+After adding the system wide name cache we reran ``ls \-l''
+on the same directory.
+The user time remained the same,
+however the system time rose slightly to 3.7 seconds.
+This was not surprising as \fInamei\fP
+now had to maintain the cache,
+but was never able to make any use of it.
+.PP
+Another profiled system was created and measurements
+were collected over a 17 hour period. These measurements
+showed a 13 ms/call decrease in \fInamei\fP, with
+\fInamei\fP accounting for only 26% of the system call time,
+13% of the time in the kernel,
+or about 7% of all the machine cycles.
+System time dropped from 55% to about 49%.
+The results are shown in Table 10.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 4.2 ms/call 6.2%
+child 4.4 ms/call 6.6%
+_
+total 8.6 ms/call 12.8%
+.TE
+.ce
+Table 10. Call times for \fInamei\fP with both caches.
+.DE
+.KE
+.PP
+On our general time sharing systems we find that during the twelve
+hour period from 8AM to 8PM the system does 500,000 to 1,000,000
+name translations.
+Statistics on the performance of both caches show that
+the large performance improvement is
+caused by the high hit ratio.
+The name cache has a hit rate of 70%-80%;
+the directory offset cache gets a hit rate of 5%-15%.
+The combined hit rate of the two caches almost always adds up to 85%.
+With the addition of the two caches,
+the percentage of system time devoted to name translation has
+dropped from 25% to less than 13%.
+While the system wide cache reduces both the amount of time in
+the routines that \fInamei\fP calls as well as \fInamei\fP itself
+(since fewer directories need to be accessed or searched),
+it is interesting to note that the actual percentage of system
+time spent in \fInamei\fP itself increases even though the
+actual time per call decreases.
+This is because less total time is being spent in the kernel,
+hence a smaller absolute time becomes a larger total percentage.
+.NH 3
+Intelligent Auto Siloing
+.PP
+Most terminal input hardware can run in two modes:
+it can either generate an interrupt each time a character is received,
+or collect characters in a silo that the system then periodically drains.
+To provide quick response for interactive input and flow control,
+a silo must be checked 30 to 50 times per second.
+Ascii terminals normally exhibit
+an input rate of less than 30 characters per second.
+At this input rate
+they are most efficiently handled with interrupt per character mode,
+since this generates fewer interrupts than draining the input silos
+of the terminal multiplexors at each clock interrupt.
+When input is being generated by another machine
+or a malfunctioning terminal connection, however,
+the input rate is usually more than 50 characters per second.
+It is more efficient to use a device's silo input mode,
+since this generates fewer interrupts than handling each character
+as a separate interrupt.
+Since a given dialup port may switch between uucp logins and user logins,
+it is impossible to statically select the most efficient input mode to use.
+.PP
+We therefore changed the terminal multiplexor handlers
+to dynamically choose between the use of the silo and the use of
+per-character interrupts.
+At low input rates the handler processes characters on an
+interrupt basis, avoiding the overhead
+of checking each interface on each clock interrupt.
+During periods of sustained input, the handler enables the silo
+and starts a timer to drain input.
+This timer runs less frequently than the clock interrupts,
+and is used only when there is a substantial amount of input.
+The transition from using silos to an interrupt per character is
+damped to minimize the number of transitions with bursty traffic
+(such as in network communication).
+Input characters serve to flush the silo, preventing long latency.
+By switching between these two modes of operation dynamically,
+the overhead of checking the silos is incurred only
+when necessary.
+.PP
+In addition to the savings in the terminal handlers,
+the clock interrupt routine is no longer required to schedule
+a software interrupt after each hardware interrupt to drain the silos.
+The software-interrupt level portion of the clock routine is only
+needed when timers expire or the current user process is collecting
+an execution profile.
+Thus, the number of interrupts attributable to clock processing
+is substantially reduced.
+.NH 3
+Process Table Management
+.PP
+As systems have grown larger, the size of the process table
+has grown far past 200 entries.
+With large tables, linear searches must be eliminated
+from any frequently used facility.
+The kernel process table is now multi-threaded to allow selective searching
+of active and zombie processes.
+A third list threads unused process table slots.
+Free slots can be obtained in constant time by taking one
+from the front of the free list.
+The number of processes used by a given user may be computed by scanning
+only the active list.
+Since the 4.2BSD release,
+the kernel maintained linked lists of the descendents of each process.
+This linkage is now exploited when dealing with process exit;
+parents seeking the exit status of children now avoid linear search
+of the process table, but examine only their direct descendents.
+In addition, the previous algorithm for finding all descendents of an exiting
+process used multiple linear scans of the process table.
+This has been changed to follow the links between child process and siblings.
+.PP
+When forking a new process,
+the system must assign it a unique process identifier.
+The system previously scanned the entire process table each time it created
+a new process to locate an identifier that was not already in use.
+Now, to avoid scanning the process table for each new process,
+the system computes a range of unused identifiers
+that can be directly assigned.
+Only when the set of identifiers is exhausted is another process table
+scan required.
+.NH 3
+Scheduling
+.PP
+Previously the scheduler scanned the entire process table
+once per second to recompute process priorities.
+Processes that had run for their entire time slice had their
+priority lowered.
+Processes that had not used their time slice, or that had
+been sleeping for the past second had their priority raised.
+On systems running many processes,
+the scheduler represented nearly 20% of the system time.
+To reduce this overhead,
+the scheduler has been changed to consider only
+runnable processes when recomputing priorities.
+To insure that processes sleeping for more than a second
+still get their appropriate priority boost,
+their priority is recomputed when they are placed back on the run queue.
+Since the set of runnable process is typically only a small fraction
+of the total number of processes on the system,
+the cost of invoking the scheduler drops proportionally.
+.NH 3
+Clock Handling
+.PP
+The hardware clock interrupts the processor 100 times per second
+at high priority.
+As most of the clock-based events need not be done at high priority,
+the system schedules a lower priority software interrupt to do the less
+time-critical events such as cpu scheduling and timeout processing.
+Often there are no such events, and the software interrupt handler
+finds nothing to do and returns.
+The high priority event now checks to see if there are low priority
+events to process;
+if there is nothing to do, the software interrupt is not requested.
+Often, the high priority interrupt occurs during a period when the
+machine had been running at low priority.
+Rather than posting a software interrupt that would occur as
+soon as it returns,
+the hardware clock interrupt handler simply lowers the processor priority
+and calls the software clock routines directly.
+Between these two optimizations, nearly 80 of the 100 software
+interrupts per second can be eliminated.
+.NH 3
+File System
+.PP
+The file system uses a large block size, typically 4096 or 8192 bytes.
+To allow small files to be stored efficiently, the large blocks can
+be broken into smaller fragments, typically multiples of 1024 bytes.
+To minimize the number of full-sized blocks that must be broken
+into fragments, the file system uses a best fit strategy.
+Programs that slowly grow files using write of 1024 bytes or less
+can force the file system to copy the data to
+successively larger and larger fragments until it finally
+grows to a full sized block.
+The file system still uses a best fit strategy the first time
+a fragment is written.
+However, the first time that the file system is forced to copy a growing
+fragment it places it at the beginning of a full sized block.
+Continued growth can be accommodated without further copying
+by using up the rest of the block.
+If the file ceases to grow, the rest of the block is still
+available for holding other fragments.
+.PP
+When creating a new file name,
+the entire directory in which it will reside must be scanned
+to insure that the name does not already exist.
+For large directories, this scan is time consuming.
+Because there was no provision for shortening directories,
+a directory that is once over-filled will increase the cost
+of file creation even after the over-filling is corrected.
+Thus, for example, a congested uucp connection can leave a legacy long
+after it is cleared up.
+To alleviate the problem, the system now deletes empty blocks
+that it finds at the end of a directory while doing a complete
+scan to create a new name.
+.NH 3
+Network
+.PP
+The default amount of buffer space allocated for stream sockets (including
+pipes) has been increased to 4096 bytes.
+Stream sockets and pipes now return their buffer sizes in the block size field
+of the stat structure.
+This information allows the standard I/O library to use more optimal buffering.
+Unix domain stream sockets also return a dummy device and inode number
+in the stat structure to increase compatibility
+with other pipe implementations.
+The TCP maximum segment size is calculated according to the destination
+and interface in use; non-local connections use a more conservative size
+for long-haul networks.
+.PP
+On multiply-homed hosts, the local address bound by TCP now always corresponds
+to the interface that will be used in transmitting data packets for the
+connection.
+Several bugs in the calculation of round trip timing have been corrected.
+TCP now switches to an alternate gateway when an existing route fails,
+or when an ICMP redirect message is received.
+ICMP source quench messages are used to throttle the transmission
+rate of TCP streams by temporarily creating an artificially small
+send window, and retransmissions send only a single packet
+rather than resending all queued data.
+A send policy has been implemented
+that decreases the number of small packets outstanding
+for network terminal traffic [Nagle84],
+providing additional reduction of network congestion.
+The overhead of packet routing has been decreased by changes in the routing
+code and by caching the most recently used route for each datagram socket.
+.PP
+The buffer management strategy implemented by \fIsosend\fP has been
+changed to make better use of the increased size of the socket buffers
+and a better tuned delayed acknowledgement algorithm.
+Routing has been modified to include a one element cache of the last
+route computed.
+Multiple messages sent with the same destination now require less processing.
+Performance deteriorates because of load in
+either the sender host, receiver host, or ether.
+Also, any CPU contention degrades substantially
+the throughput achievable by user processes [Cabrera85].
+We have observed empty VAX 11/750s using up to 90% of their cycles
+transmitting network messages.
+.NH 3
+Exec
+.PP
+When \fIexec\fP-ing a new process, the kernel creates the new
+program's argument list by copying the arguments and environment
+from the parent process's address space into the system, then back out
+again onto the stack of the newly created process.
+These two copy operations were done one byte at a time, but
+are now done a string at a time.
+This optimization reduced the time to process
+an argument list by a factor of ten;
+the average time to do an \fIexec\fP call decreased by 25%.
+.NH 3
+Context Switching
+.PP
+The kernel used to post a software event when it wanted to force
+a process to be rescheduled.
+Often the process would be rescheduled for other reasons before
+exiting the kernel, delaying the event trap.
+At some later time the process would again
+be selected to run and would complete its pending system call,
+finally causing the event to take place.
+The event would cause the scheduler to be invoked a second time
+selecting the same process to run.
+The fix to this problem is to cancel any software reschedule
+events when saving a process context.
+This change doubles the speed with which processes
+can synchronize using pipes or signals.
+.NH 3
+Setjmp/Longjmp
+.PP
+The kernel routine \fIsetjmp\fP, which saves the current system
+context in preparation for a non-local goto, used to save many more
+registers than necessary under most circumstances.
+By trimming its operation to save only the minimum state required,
+the overhead for system calls decreased by an average of 13%.
+.NH 3
+Compensating for Lack of Compiler Technology
+.PP
+The current compilers available for C do not
+do any significant optimization.
+Good optimizing compilers are unlikely to be built;
+the C language is not well suited to optimization
+because of its rampant use of unbound pointers.
+Thus, many classical optimizations such as common subexpression
+analysis and selection of register variables must be done
+by hand using ``exterior'' knowledge of when such optimizations are safe.
+.PP
+Another optimization usually done by optimizing compilers
+is inline expansion of small or frequently used routines.
+In past Berkeley systems this has been done by using \fIsed\fP to
+run over the assembly language and replace calls to small
+routines with the code for the body of the routine, often
+a single VAX instruction.
+While this optimization eliminated the cost of the subroutine
+call and return,
+it did not eliminate the pushing and popping of several arguments
+to the routine.
+The \fIsed\fP script has been replaced by a more intelligent expander,
+\fIinline\fP, that merges the pushes and pops into moves to registers.
+For example, if the C code
+.DS
+if (scanc(map[i], 1, 47, i - 63))
+.DE
+is compiled into assembly language it generates the code shown
+in the left hand column of Table 11.
+The \fIsed\fP inline expander changes this code to that
+shown in the middle column.
+The newer optimizer eliminates most of the stack
+operations to generate the code shown in the right hand column.
+.KF
+.TS
+center, box;
+c s s s s s
+c s | c s | c s
+l l | l l | l l.
+Alternative C Language Code Optimizations
+_
+cc sed inline
+_
+subl3 $64,_i,\-(sp) subl3 $64,_i,\-(sp) subl3 $64,_i,r5
+pushl $47 pushl $47 movl $47,r4
+pushl $1 pushl $1 pushl $1
+mull2 $16,_i,r3 mull2 $16,_i,r3 mull2 $16,_i,r3
+pushl \-56(fp)[r3] pushl \-56(fp)[r3] movl \-56(fp)[r3],r2
+calls $4,_scanc movl (sp)+,r5 movl (sp)+,r3
+tstl r0 movl (sp)+,r4 scanc r2,(r3),(r4),r5
+jeql L7 movl (sp)+,r3 tstl r0
+ movl (sp)+,r2 jeql L7
+ scanc r2,(r3),(r4),r5
+ tstl r0
+ jeql L7
+.TE
+.ce
+Table 11. Alternative inline code expansions.
+.KE
+.PP
+Another optimization involved reevaluating
+existing data structures in the context of the current system.
+For example, disk buffer hashing was implemented when the system
+typically had thirty to fifty buffers.
+Most systems today have 200 to 1000 buffers.
+Consequently, most of the hash chains contained
+ten to a hundred buffers each!
+The running time of the low level buffer management primitives was
+dramatically improved simply by enlarging the size of the hash table.
+.NH 2
+Improvements to Libraries and Utilities
+.PP
+Intuitively, changes to the kernel would seem to have the greatest
+payoff since they affect all programs that run on the system.
+However, the kernel has been tuned many times before, so the
+opportunity for significant improvement was small.
+By contrast, many of the libraries and utilities had never been tuned.
+For example, we found utilities that spent 90% of their
+running time doing single character read system calls.
+Changing the utility to use the standard I/O library cut the
+running time by a factor of five!
+Thus, while most of our time has been spent tuning the kernel,
+more than half of the speedups are because of improvements in
+other parts of the system.
+Some of the more dramatic changes are described in the following
+subsections.
+.NH 3
+Hashed Databases
+.PP
+UNIX provides a set of database management routines, \fIdbm\fP,
+that can be used to speed lookups in large data files
+with an external hashed index file.
+The original version of dbm was designed to work with only one
+database at a time. These routines were generalized to handle
+multiple database files, enabling them to be used in rewrites
+of the password and host file lookup routines. The new routines
+used to access the password file significantly improve the running
+time of many important programs such as the mail subsystem,
+the C-shell (in doing tilde expansion), \fIls \-l\fP, etc.
+.NH 3
+Buffered I/O
+.PP
+The new filesystem with its larger block sizes allows better
+performance, but it is possible to degrade system performance
+by performing numerous small transfers rather than using
+appropriately-sized buffers.
+The standard I/O library
+automatically determines the optimal buffer size for each file.
+Some C library routines and commonly-used programs use low-level
+I/O or their own buffering, however.
+Several important utilities did not use the standard I/O library
+and were buffering I/O using the old optimal buffer size,
+1Kbytes; these programs were changed to buffer I/O according to the
+optimal file system blocksize.
+These include the editor, the assembler, loader, remote file copy,
+the text formatting programs, and the C compiler.
+.PP
+The standard error output has traditionally been unbuffered
+to prevent delay in presenting the output to the user,
+and to prevent it from being lost if buffers are not flushed.
+The inordinate expense of sending single-byte packets through
+the network led us to impose a buffering scheme on the standard
+error stream.
+Within a single call to \fIfprintf\fP, all output is buffered temporarily.
+Before the call returns, all output is flushed and the stream is again
+marked unbuffered.
+As before, the normal block or line buffering mechanisms can be used
+instead of the default behavior.
+.PP
+It is possible for programs with good intentions to unintentionally
+defeat the standard I/O library's choice of I/O buffer size by using
+the \fIsetbuf\fP call to assign an output buffer.
+Because of portability requirements, the default buffer size provided
+by \fIsetbuf\fP is 1024 bytes; this can lead, once again, to added
+overhead.
+One such program with this problem was \fIcat\fP;
+there are undoubtedly other standard system utilities with similar problems
+as the system has changed much since they were originally written.
+.NH 3
+Mail System
+.PP
+The problems discussed in section 3.1.1 prompted significant work
+on the entire mail system. The first problem identified was a bug
+in the \fIsyslog\fP program. The mail delivery program, \fIsendmail\fP
+logs all mail transactions through this process with the 4.2BSD interprocess
+communication facilities. \fISyslog\fP then records the information in
+a log file. Unfortunately, \fIsyslog\fP was performing a \fIsync\fP
+operation after each message it received, whether it was logged to a file
+or not. This wreaked havoc on the effectiveness of the
+buffer cache and explained, to a large
+extent, why sending mail to large distribution lists generated such a
+heavy load on the system (one syslog message was generated for each
+message recipient causing almost a continuous sequence of sync operations).
+.PP
+The hashed data base files were
+installed in all mail programs, resulting in an order of magnitude
+speedup on large distribution lists. The code in \fI/bin/mail\fP
+that notifies the \fIcomsat\fP program when mail has been delivered to
+a user was changed to cache host table lookups, resulting in a similar
+speedup on large distribution lists.
+.PP
+Next, the file locking facilities
+provided in 4.2BSD, \fIflock\fP\|(2), were used in place of the old
+locking mechanism.
+The mail system previously used \fIlink\fP and \fIunlink\fP in
+implementing file locking primitives.
+Because these operations usually modify the contents of directories
+they require synchronous disk operations and cannot take
+advantage of the name cache maintained by the system.
+Unlink requires that the entry be found in the directory so that
+it can be removed;
+link requires that the directory be scanned to insure that the name
+does not already exist.
+By contrast the advisory locking facility in 4.2BSD is
+efficient because it is all done with in-memory tables.
+Thus, the mail system was modified to use the file locking primitives.
+This yielded another 10% cut in the basic overhead of delivering mail.
+Extensive profiling and tuning of \fIsendmail\fP and
+compiling it without debugging code reduced the overhead by another 20%.
+.NH 3
+Network Servers
+.PP
+With the introduction of the network facilities in 4.2BSD,
+a myriad of services became available, each of which
+required its own daemon process.
+Many of these daemons were rarely if ever used,
+yet they lay asleep in the process table consuming
+system resources and generally slowing down response.
+Rather than having many servers started at boot time, a single server,
+\fIinetd\fP was substituted.
+This process reads a simple configuration file
+that specifies the services the system is willing to support
+and listens for service requests on each service's Internet port.
+When a client requests service the appropriate server is created
+and passed a service connection as its standard input. Servers
+that require the identity of their client may use the \fIgetpeername\fP
+system call; likewise \fIgetsockname\fP may be used to find out
+a server's local address without consulting data base files.
+This scheme is attractive for several reasons:
+.IP \(bu 3
+it eliminates
+as many as a dozen processes, easing system overhead and
+allowing the file and text tables to be made smaller,
+.IP \(bu 3
+servers need not contain the code required to handle connection
+queueing, simplifying the programs, and
+.IP \(bu 3
+installing and replacing servers becomes simpler.
+.PP
+With an increased number of networks, both local and external to Berkeley,
+we found that the overhead of the routing process was becoming
+inordinately high.
+Several changes were made in the routing daemon to reduce this load.
+Routes to external networks are no longer exchanged by routers
+on the internal machines, only a route to a default gateway.
+This reduces the amount of network traffic and the time required
+to process routing messages.
+In addition, the routing daemon was profiled
+and functions responsible for large amounts
+of time were optimized.
+The major changes were a faster hashing scheme,
+and inline expansions of the ubiquitous byte-swapping functions.
+.PP
+Under certain circumstances, when output was blocked,
+attempts by the remote login process
+to send output to the user were rejected by the system,
+although a prior \fIselect\fP call had indicated that data could be sent.
+This resulted in continuous attempts to write the data until the remote
+user restarted output.
+This problem was initially avoided in the remote login handler,
+and the original problem in the kernel has since been corrected.
+.NH 3
+The C Run-time Library
+.PP
+Several people have found poorly tuned code
+in frequently used routines in the C library [Lankford84].
+In particular the running time of the string routines can be
+cut in half by rewriting them using the VAX string instructions.
+The memory allocation routines have been tuned to waste less
+memory for memory allocations with sizes that are a power of two.
+Certain library routines that did file input in one-character reads
+have been corrected.
+Other library routines including \fIfread\fP and \fIfwrite\fP
+have been rewritten for efficiency.
+.NH 3
+Csh
+.PP
+The C-shell was converted to run on 4.2BSD by
+writing a set of routines to simulate the old jobs library.
+While this provided a functioning C-shell,
+it was grossly inefficient, generating up
+to twenty system calls per prompt.
+The C-shell has been modified to use the new signal
+facilities directly,
+cutting the number of system calls per prompt in half.
+Additional tuning was done with the help of profiling
+to cut the cost of frequently used facilities.
diff --git a/share/doc/papers/sysperf/5.t b/share/doc/papers/sysperf/5.t
new file mode 100644
index 000000000000..f8e9a2a67e0e
--- /dev/null
+++ b/share/doc/papers/sysperf/5.t
@@ -0,0 +1,279 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Functional Extensions
+.NH
+Functional Extensions
+.PP
+Some of the facilities introduced in 4.2BSD were not completely
+implemented. An important part of the effort that went into
+4.3BSD was to clean up and unify both new and old facilities.
+.NH 2
+Kernel Extensions
+.PP
+A significant effort went into improving
+the networking part of the kernel.
+The work consisted of fixing bugs,
+tuning the algorithms,
+and revamping the lowest levels of the system
+to better handle heterogeneous network topologies.
+.NH 3
+Subnets, Broadcasts and Gateways
+.PP
+To allow sites to expand their network in an autonomous
+and orderly fashion, subnetworks have been introduced in 4.3BSD [GADS85].
+This facility allows sites to subdivide their local Internet address
+space into multiple subnetwork address spaces that are visible
+only by hosts at that site. To off-site hosts machines on a site's
+subnetworks appear to reside on a single network. The routing daemon
+has been reworked to provide routing support in this type of
+environment.
+.PP
+The default Internet broadcast address is now specified with a host part
+of all ones, rather than all zeros.
+The broadcast address may be set at boot time on a per-interface basis.
+.NH 3
+Interface Addressing
+.PP
+The organization of network interfaces has been
+reworked to more cleanly support multiple
+network protocols. Network interfaces no longer
+contain a host's address on that network; instead
+each interface contains a pointer to a list of addresses
+assigned to that interface. This permits a single
+interface to support, for example, Internet protocols
+at the same time as XNS protocols.
+.PP
+The Address Resolution Protocol (ARP) support
+for 10 megabit/second Ethernet\(dg
+.FS
+\(dg Ethernet is a trademark of Xerox.
+.FE
+has been made more flexible by allowing hosts to
+act as a ``clearing house'' for hosts that do
+not support ARP. In addition, system managers have
+more control over the contents of the ARP translation
+cache and may interactively interrogate and modify
+the cache's contents.
+.NH 3
+User Control of Network Buffering
+.PP
+Although the system allocates reasonable default amounts of buffering
+for most connections, certain operations such as file system dumps
+to remote machines benefit from significant increases in buffering [Walsh84].
+The \fIsetsockopt\fP system call has been extended to allow such requests.
+In addition, \fIgetsockopt\fP and \fIsetsockopt\fP,
+are now interfaced to the protocol level allowing protocol-specific
+options to be manipulated by the user.
+.NH 3
+Number of File Descriptors
+.PP
+To allow full use of the many descriptor based services available,
+the previous hard limit of 30 open files per process has been relaxed.
+The changes entailed generalizing \fIselect\fP to handle arrays of
+32-bit words, removing the dependency on file descriptors from
+the page table entries,
+and limiting most of the linear scans of a process's file table.
+The default per-process descriptor limit was raised from 20 to 64,
+though there are no longer any hard upper limits on the number
+of file descriptors.
+.NH 3
+Kernel Limits
+.PP
+Many internal kernel configuration limits have been increased by suitable
+modifications to data structures.
+The limit on physical memory has been changed from 8 megabyte to 64 megabyte,
+and the limit of 15 mounted file systems has been changed to 255.
+The maximum file system size has been increased to 8 gigabyte,
+number of processes to 65536,
+and per process size to 64 megabyte of data and 64 megabyte of stack.
+Note that these are upper bounds;
+the default limits for these quantities are tuned for systems
+with 4-8 megabyte of physical memory.
+.NH 3
+Memory Management
+.PP
+The global clock page replacement algorithm used to have a single
+hand that was used both to mark and to reclaim memory.
+The first time that it encountered a page it would clear its reference bit.
+If the reference bit was still clear on its next pass across the page,
+it would reclaim the page.
+The use of a single hand does not work well with large physical
+memories as the time to complete a single revolution of the hand
+can take up to a minute or more.
+By the time the hand gets around to the marked pages,
+the information is usually no longer pertinent.
+During periods of sudden shortages,
+the page daemon will not be able to find any reclaimable pages until
+it has completed a full revolution.
+To alleviate this problem,
+the clock hand has been split into two separate hands.
+The front hand clears the reference bits,
+the back hand follows a constant number of pages behind
+reclaiming pages that still have cleared reference bits.
+While the code has been written to allow the distance between
+the hands to be varied, we have not found any algorithms
+suitable for determining how to dynamically adjust this distance.
+.PP
+The configuration of the virtual memory system used to require
+a significant understanding of its operation to do such
+simple tasks as increasing the maximum process size.
+This process has been significantly improved so that the most
+common configuration parameters, such as the virtual memory sizes,
+can be specified using a single option in the configuration file.
+Standard configurations support data and stack segments
+of 17, 33 and 64 megabytes.
+.NH 3
+Signals
+.PP
+The 4.2BSD signal implementation would push several words
+onto the normal run-time stack before switching to an
+alternate signal stack.
+The 4.3BSD implementation has been corrected so that
+the entire signal handler's state is now pushed onto the signal stack.
+Another limitation in the original signal implementation was
+that it used an undocumented system call to return from signals.
+Users could not write their own return from exceptions;
+4.3BSD formally specifies the \fIsigreturn\fP system call.
+.PP
+Many existing programs depend on interrupted system calls.
+The restartable system call semantics of 4.2BSD signals caused
+many of these programs to break.
+To simplify porting of programs from inferior versions of
+.UX
+the \fIsigvec\fP system call has been extended so that
+programmers may specify that system calls are not to be
+restarted after particular signals.
+.NH 3
+System Logging
+.PP
+A system logging facility has been added
+that sends kernel messages to the
+syslog daemon for logging in /usr/adm/messages and possibly for
+printing on the system console.
+The revised scheme for logging messages
+eliminates the time lag in updating the messages file,
+unifies the format of kernel messages,
+provides a finer granularity of control over the messages
+that get printed on the console,
+and eliminates the degradation in response during the printing of
+low-priority kernel messages.
+Recoverable system errors and common resource limitations are logged
+using this facility.
+Most system utilities such as init and login,
+have been modified to log errors to syslog
+rather than writing directly on the console.
+.NH 3
+Windows
+.PP
+The tty structure has been augmented to hold
+information about the size
+of an associated window or terminal.
+These sizes can be obtained by programs such as editors that want
+to know the size of the screen they are manipulating.
+When these sizes are changed,
+a new signal, SIGWINCH, is sent to the current process group.
+The editors have been modified to catch this signal and reshape
+their view of the world, and the remote login program and server
+now cooperate to propagate window sizes and window size changes
+across a network.
+Other programs and libraries such as curses that need the width
+or height of the screen have been modified to use this facility as well.
+.NH 3
+Configuration of UNIBUS Devices
+.PP
+The UNIBUS configuration routines have been extended to allow auto-configuration
+of dedicated UNIBUS memory held by devices.
+The new routines simplify the configuration of memory-mapped devices
+and correct problems occurring on reset of the UNIBUS.
+.NH 3
+Disk Recovery from Errors
+.PP
+The MASSBUS disk driver's error recovery routines have been fixed to
+retry before correcting ECC errors, support ECC on bad-sector replacements,
+and correctly attempt retries after earlier
+corrective actions in the same transfer.
+The error messages are more accurate.
+.NH 2
+Functional Extensions to Libraries and Utilities
+.PP
+Most of the changes to the utilities and libraries have been to
+allow them to handle a more general set of problems,
+or to handle the same set of problems more quickly.
+.NH 3
+Name Server
+.PP
+In 4.2BSD the name resolution routines (\fIgethostbyname\fP,
+\fIgetservbyname\fP,
+etc.) were implemented by a set of database files maintained on the
+local machine.
+Inconsistencies or obsolescence in these files resulted in inaccessibility of
+hosts or services.
+In 4.3BSD these files may be replaced by a network name server that can
+insure a consistent view of the name space in a multimachine environment.
+This name server operates in accordance with Internet standards
+for service on the ARPANET [Mockapetris83].
+.NH 3
+System Management
+.PP
+A new utility, \fIrdist\fP,
+has been provided to assist system managers in keeping
+all their machines up to date with a consistent set of sources and binaries.
+A master set of sources may reside on a single central machine,
+or be distributed at (known) locations throughout the environment.
+New versions of \fIgetty\fP, \fIinit\fP, and \fIlogin\fP
+merge the functions of several
+files into a single place, and allow more flexibility in the
+startup of processes such as window managers.
+.PP
+The new utility \fItimed\fP keeps the time on a group of cooperating machines
+(within a single LAN) synchronized to within 30 milliseconds.
+It does its corrections using a new system call that changes
+the rate of time advance without stopping or reversing the system clock.
+It normally selects one machine to act as a master.
+If the master dies or is partitioned, a new master is elected.
+Other machines may participate in a purely slave role.
+.NH 3
+Routing
+.PP
+Many bugs in the routing daemon have been fixed;
+it is considerably more robust,
+and now understands how to properly deal with
+subnets and point-to-point networks.
+Its operation has been made more efficient by tuning with the use
+of execution profiles, along with inline expansion of common operations
+using the kernel's \fIinline\fP optimizer.
+.NH 3
+Compilers
+.PP
+The symbolic debugger \fIdbx\fP has had many new features added,
+and all the known bugs fixed. In addition \fIdbx\fP
+has been extended to work with the Pascal compiler.
+The fortran compiler \fIf77\fP has had numerous bugs fixed.
+The C compiler has been modified so that it can, optionally,
+generate single precision floating point instructions when operating
+on single precision variables.
diff --git a/share/doc/papers/sysperf/6.t b/share/doc/papers/sysperf/6.t
new file mode 100644
index 000000000000..a31cd63b20a3
--- /dev/null
+++ b/share/doc/papers/sysperf/6.t
@@ -0,0 +1,64 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Security Tightening
+.NH
+Security Tightening
+.PP
+Since we do not wish to encourage rampant system cracking,
+we describe only briefly the changes made to enhance security.
+.NH 2
+Generic Kernel
+.PP
+Several loopholes in the process tracing facility have been corrected.
+Programs being traced may not be executed;
+executing programs may not be traced.
+Programs may not provide input to terminals to which they do not
+have read permission.
+The handling of process groups has been tightened to eliminate
+some problems.
+When a program attempts to change its process group,
+the system checks to see if the process with the pid of the process
+group was started by the same user.
+If it exists and was started by a different user the process group
+number change is denied.
+.NH 2
+Security Problems in Utilities
+.PP
+Setuid utilities no longer use the \fIpopen\fP or \fIsystem\fP library routines.
+Access to the kernel's data structures through the kmem device
+is now restricted to programs that are set group id ``kmem''.
+Thus many programs that used to run with root privileges
+no longer need to do so.
+Access to disk devices is now controlled by an ``operator'' group id;
+this permission allows operators to function without being the super-user.
+Only users in group wheel can do ``su root''; this restriction
+allows administrators to define a super-user access list.
+Numerous holes have been closed in the shell to prevent
+users from gaining privileges from set user id shell scripts,
+although use of such scripts is still highly discouraged on systems
+that are concerned about security.
diff --git a/share/doc/papers/sysperf/7.t b/share/doc/papers/sysperf/7.t
new file mode 100644
index 000000000000..c23bcad04bb3
--- /dev/null
+++ b/share/doc/papers/sysperf/7.t
@@ -0,0 +1,158 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Conclusions
+.NH
+Conclusions
+.PP
+4.2BSD, while functionally superior to 4.1BSD, lacked much of the
+performance tuning required of a good system. We found that
+the distributed system spent 10-20% more time in the kernel than
+4.1BSD. This added overhead combined with problems with several
+user programs severely limited the overall performance of the
+system in a general timesharing environment.
+.PP
+Changes made to the system since the 4.2BSD distribution have
+eliminated most of the
+added system overhead by replacing old algorithms
+or introducing additional cacheing schemes.
+The combined caches added to the name translation process
+reduce the average cost of translating a pathname to an inode by more than 50%.
+These changes reduce the percentage of time spent running
+in the system by nearly 9%.
+.PP
+The use of silo input on terminal ports only when necessary
+has allowed the system to avoid a large amount of software interrupt
+processing. Observations show that the system is forced to
+field about 25% fewer interrupts than before.
+.PP
+The kernel
+changes, combined with many bug fixes, make the system much more
+responsive in a general timesharing environment.
+The 4.3BSD Berkeley UNIX system now appears
+capable of supporting loads at least as large as those supported under
+4.1BSD while providing all the new interprocess communication, networking,
+and file system facilities.
+.nr H2 1
+.ds RH Acknowledgements
+.SH
+\s+2Acknowledgements\s0
+.PP
+We would like to thank Robert Elz for sharing his ideas and
+his code for cacheing system wide names and searching the process table.
+We thank Alan Smith for initially suggesting the use of a
+capability based cache.
+We also acknowledge
+George Goble who dropped many of our changes
+into his production system and reported back fixes to the
+disasters that they caused.
+The buffer cache read-ahead trace package was based
+on a program written by Jim Lawson. Ralph Campbell
+implemented several of the C library changes. The original
+version of the Internet daemon was written by Bill Joy.
+In addition,
+we would like to thank the many other people that contributed
+ideas, information, and work while the system was undergoing change.
+.ds RH References
+.nr H2 1
+.sp 2
+.SH
+\s+2References\s-2
+.LP
+.IP [Cabrera84] 20
+Luis Felipe Cabrera, Eduard Hunter, Michael J. Karels, and David Mosher,
+``A User-Process Oriented Performance Study of Ethernet Networking Under
+Berkeley UNIX 4.2BSD,''
+Research Report No. UCB/CSD 84/217, University of California,
+Berkeley, December 1984.
+.IP [Cabrera85] 20
+Luis Felipe Cabrera, Michael J. Karels, and David Mosher,
+``The Impact of Buffer Management on Networking Software Performance
+in Berkeley UNIX 4.2BSD: A Case Study,''
+Proceedings of the Summer Usenix Conference, Portland, Oregon,
+June 1985, pp. 507-517.
+.IP [GADS85] 20
+GADS (Gateway Algorithms and Data Structures Task Force),
+``Toward an Internet Standard for Subnetting,'' RFC-940,
+Network Information Center, SRI International,
+April 1985.
+.IP [Joy80] 20
+Joy, William,
+``Comments on the performance of UNIX on the VAX'',
+Computer System Research Group, U.C. Berkeley.
+April 1980.
+.IP [Kashtan80] 20
+Kashtan, David L.,
+``UNIX and VMS, Some Performance Comparisons'',
+SRI International. February 1980.
+.IP [Lankford84] 20
+Jeffrey Lankford,
+``UNIX System V and 4BSD Performance,''
+\fIProceedings of the Salt Lake City Usenix Conference\fP,
+pp 228-236, June 1984.
+.IP [Leffler84] 20
+Sam Leffler, Mike Karels, and M. Kirk McKusick,
+``Measuring and Improving the Performance of 4.2BSD,''
+\fIProceedings of the Salt Lake City Usenix Conference\fP,
+pp 237-252, June 1984.
+.IP [McKusick85] 20
+M. Kirk McKusick, Mike Karels, and Samuel Leffler,
+``Performance Improvements and Functional Enhancements in 4.3BSD,''
+\fIProceedings of the Portland Usenix Conference\fP,
+pp 519-531, June 1985.
+.IP [Mockapetris83] 20
+Paul Mockapetris, ``Domain Names \- Implementation and Schedule,''
+Network Information Center, SRI International,
+RFC-883,
+November 1983.
+.IP [Mogul84] 20
+Jeffrey Mogul, ``Broadcasting Internet Datagrams,'' RFC-919,
+Network Information Center, SRI International,
+October 1984.
+.IP [Mosher80] 20
+Mosher, David,
+``UNIX Performance, an Introspection'',
+Presented at the Boulder, Colorado Usenix Conference, January 1980.
+Copies of the paper are available from
+Computer System Research Group, U.C. Berkeley.
+.IP [Nagle84] 20
+John Nagle, ``Congestion Control in IP/TCP Internetworks,'' RFC-896,
+Network Information Center, SRI International,
+January 1984.
+.IP [Ritchie74] 20
+Ritchie, D. M. and Thompson, K.,
+``The UNIX Time-Sharing System'',
+CACM 17, 7. July 1974. pp 365-375
+.IP [Shannon83] 20
+Shannon, W.,
+private communication,
+July 1983
+.IP [Walsh84] 20
+Robert Walsh and Robert Gurwitz,
+``Converting BBN TCP/IP to 4.2BSD,''
+\fIProceedings of the Salt Lake City Usenix Conference\fP,
+pp 52-61, June 1984.
diff --git a/share/doc/papers/sysperf/Makefile b/share/doc/papers/sysperf/Makefile
new file mode 100644
index 000000000000..d38189a350dc
--- /dev/null
+++ b/share/doc/papers/sysperf/Makefile
@@ -0,0 +1,9 @@
+VOLUME= papers
+DOC= sysperf
+SRCS= 0.t 1.t 2.t 3.t 4.t 5.t 6.t 7.t appendix.ms
+EXTRA= a1.t a2.t
+MACROS= -ms
+USE_EQN=
+USE_TBL=
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/sysperf/a1.t b/share/doc/papers/sysperf/a1.t
new file mode 100644
index 000000000000..677a6dc5143a
--- /dev/null
+++ b/share/doc/papers/sysperf/a1.t
@@ -0,0 +1,662 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Appendix A \- Benchmark sources
+.nr H2 1
+.sp 2
+.de vS
+.nf
+..
+.de vE
+.fi
+..
+.bp
+.SH
+\s+2Appendix A \- Benchmark sources\s-2
+.LP
+The programs shown here run under 4.2 with only routines
+from the standard libraries. When run under 4.1 they were augmented
+with a \fIgetpagesize\fP routine and a copy of the \fIrandom\fP
+function from the C library. The \fIvforks\fP and \fIvexecs\fP
+programs are constructed from the \fIforks\fP and \fIexecs\fP programs,
+respectively, by substituting calls to \fIfork\fP with calls to
+\fIvfork\fP.
+.SH
+syscall
+.LP
+.vS
+/*
+ * System call overhead benchmark.
+ */
+main(argc, argv)
+ char *argv[];
+{
+ register int ncalls;
+
+ if (argc < 2) {
+ printf("usage: %s #syscalls\n", argv[0]);
+ exit(1);
+ }
+ ncalls = atoi(argv[1]);
+ while (ncalls-- > 0)
+ (void) getpid();
+}
+.vE
+.SH
+csw
+.LP
+.vS
+/*
+ * Context switching benchmark.
+ *
+ * Force system to context switch 2*nsigs
+ * times by forking and exchanging signals.
+ * To calculate system overhead for a context
+ * switch, the signocsw program must be run
+ * with nsigs. Overhead is then estimated by
+ * t1 = time csw <n>
+ * t2 = time signocsw <n>
+ * overhead = t1 - 2 * t2;
+ */
+#include <signal.h>
+
+int sigsub();
+int otherpid;
+int nsigs;
+
+main(argc, argv)
+ char *argv[];
+{
+ int pid;
+
+ if (argc < 2) {
+ printf("usage: %s nsignals\n", argv[0]);
+ exit(1);
+ }
+ nsigs = atoi(argv[1]);
+ signal(SIGALRM, sigsub);
+ otherpid = getpid();
+ pid = fork();
+ if (pid != 0) {
+ otherpid = pid;
+ kill(otherpid, SIGALRM);
+ }
+ for (;;)
+ sigpause(0);
+}
+
+sigsub()
+{
+
+ signal(SIGALRM, sigsub);
+ kill(otherpid, SIGALRM);
+ if (--nsigs <= 0)
+ exit(0);
+}
+.vE
+.SH
+signocsw
+.LP
+.vS
+/*
+ * Signal without context switch benchmark.
+ */
+#include <signal.h>
+
+int pid;
+int nsigs;
+int sigsub();
+
+main(argc, argv)
+ char *argv[];
+{
+ register int i;
+
+ if (argc < 2) {
+ printf("usage: %s nsignals\n", argv[0]);
+ exit(1);
+ }
+ nsigs = atoi(argv[1]);
+ signal(SIGALRM, sigsub);
+ pid = getpid();
+ for (i = 0; i < nsigs; i++)
+ kill(pid, SIGALRM);
+}
+
+sigsub()
+{
+
+ signal(SIGALRM, sigsub);
+}
+.vE
+.SH
+pipeself
+.LP
+.vS
+/*
+ * IPC benchmark,
+ * write to self using pipes.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ char buf[512];
+ int fd[2], msgsize;
+ register int i, iter;
+
+ if (argc < 3) {
+ printf("usage: %s iterations message-size\n", argv[0]);
+ exit(1);
+ }
+ argc--, argv++;
+ iter = atoi(*argv);
+ argc--, argv++;
+ msgsize = atoi(*argv);
+ if (msgsize > sizeof (buf) || msgsize <= 0) {
+ printf("%s: Bad message size.\n", *argv);
+ exit(2);
+ }
+ if (pipe(fd) < 0) {
+ perror("pipe");
+ exit(3);
+ }
+ for (i = 0; i < iter; i++) {
+ write(fd[1], buf, msgsize);
+ read(fd[0], buf, msgsize);
+ }
+}
+.vE
+.SH
+pipediscard
+.LP
+.vS
+/*
+ * IPC benchmark,
+ * write and discard using pipes.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ char buf[512];
+ int fd[2], msgsize;
+ register int i, iter;
+
+ if (argc < 3) {
+ printf("usage: %s iterations message-size\n", argv[0]);
+ exit(1);
+ }
+ argc--, argv++;
+ iter = atoi(*argv);
+ argc--, argv++;
+ msgsize = atoi(*argv);
+ if (msgsize > sizeof (buf) || msgsize <= 0) {
+ printf("%s: Bad message size.\n", *argv);
+ exit(2);
+ }
+ if (pipe(fd) < 0) {
+ perror("pipe");
+ exit(3);
+ }
+ if (fork() == 0)
+ for (i = 0; i < iter; i++)
+ read(fd[0], buf, msgsize);
+ else
+ for (i = 0; i < iter; i++)
+ write(fd[1], buf, msgsize);
+}
+.vE
+.SH
+pipeback
+.LP
+.vS
+/*
+ * IPC benchmark,
+ * read and reply using pipes.
+ *
+ * Process forks and exchanges messages
+ * over a pipe in a request-response fashion.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ char buf[512];
+ int fd[2], fd2[2], msgsize;
+ register int i, iter;
+
+ if (argc < 3) {
+ printf("usage: %s iterations message-size\n", argv[0]);
+ exit(1);
+ }
+ argc--, argv++;
+ iter = atoi(*argv);
+ argc--, argv++;
+ msgsize = atoi(*argv);
+ if (msgsize > sizeof (buf) || msgsize <= 0) {
+ printf("%s: Bad message size.\n", *argv);
+ exit(2);
+ }
+ if (pipe(fd) < 0) {
+ perror("pipe");
+ exit(3);
+ }
+ if (pipe(fd2) < 0) {
+ perror("pipe");
+ exit(3);
+ }
+ if (fork() == 0)
+ for (i = 0; i < iter; i++) {
+ read(fd[0], buf, msgsize);
+ write(fd2[1], buf, msgsize);
+ }
+ else
+ for (i = 0; i < iter; i++) {
+ write(fd[1], buf, msgsize);
+ read(fd2[0], buf, msgsize);
+ }
+}
+.vE
+.SH
+forks
+.LP
+.vS
+/*
+ * Benchmark program to calculate fork+wait
+ * overhead (approximately). Process
+ * forks and exits while parent waits.
+ * The time to run this program is used
+ * in calculating exec overhead.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ register int nforks, i;
+ char *cp;
+ int pid, child, status, brksize;
+
+ if (argc < 2) {
+ printf("usage: %s number-of-forks sbrk-size\n", argv[0]);
+ exit(1);
+ }
+ nforks = atoi(argv[1]);
+ if (nforks < 0) {
+ printf("%s: bad number of forks\n", argv[1]);
+ exit(2);
+ }
+ brksize = atoi(argv[2]);
+ if (brksize < 0) {
+ printf("%s: bad size to sbrk\n", argv[2]);
+ exit(3);
+ }
+ cp = (char *)sbrk(brksize);
+ if ((int)cp == -1) {
+ perror("sbrk");
+ exit(4);
+ }
+ for (i = 0; i < brksize; i += 1024)
+ cp[i] = i;
+ while (nforks-- > 0) {
+ child = fork();
+ if (child == -1) {
+ perror("fork");
+ exit(-1);
+ }
+ if (child == 0)
+ _exit(-1);
+ while ((pid = wait(&status)) != -1 && pid != child)
+ ;
+ }
+ exit(0);
+}
+.vE
+.SH
+execs
+.LP
+.vS
+/*
+ * Benchmark program to calculate exec
+ * overhead (approximately). Process
+ * forks and execs "null" test program.
+ * The time to run the fork program should
+ * then be deducted from this one to
+ * estimate the overhead for the exec.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ register int nexecs, i;
+ char *cp, *sbrk();
+ int pid, child, status, brksize;
+
+ if (argc < 3) {
+ printf("usage: %s number-of-execs sbrk-size job-name\n",
+ argv[0]);
+ exit(1);
+ }
+ nexecs = atoi(argv[1]);
+ if (nexecs < 0) {
+ printf("%s: bad number of execs\n", argv[1]);
+ exit(2);
+ }
+ brksize = atoi(argv[2]);
+ if (brksize < 0) {
+ printf("%s: bad size to sbrk\n", argv[2]);
+ exit(3);
+ }
+ cp = sbrk(brksize);
+ if ((int)cp == -1) {
+ perror("sbrk");
+ exit(4);
+ }
+ for (i = 0; i < brksize; i += 1024)
+ cp[i] = i;
+ while (nexecs-- > 0) {
+ child = fork();
+ if (child == -1) {
+ perror("fork");
+ exit(-1);
+ }
+ if (child == 0) {
+ execv(argv[3], argv);
+ perror("execv");
+ _exit(-1);
+ }
+ while ((pid = wait(&status)) != -1 && pid != child)
+ ;
+ }
+ exit(0);
+}
+.vE
+.SH
+nulljob
+.LP
+.vS
+/*
+ * Benchmark "null job" program.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+
+ exit(0);
+}
+.vE
+.SH
+bigjob
+.LP
+.vS
+/*
+ * Benchmark "null big job" program.
+ */
+/* 250 here is intended to approximate vi's text+data size */
+char space[1024 * 250] = "force into data segment";
+
+main(argc, argv)
+ char *argv[];
+{
+
+ exit(0);
+}
+.vE
+.bp
+.SH
+seqpage
+.LP
+.vS
+/*
+ * Sequential page access benchmark.
+ */
+#include <sys/vadvise.h>
+
+char *valloc();
+
+main(argc, argv)
+ char *argv[];
+{
+ register i, niter;
+ register char *pf, *lastpage;
+ int npages = 4096, pagesize, vflag = 0;
+ char *pages, *name;
+
+ name = argv[0];
+ argc--, argv++;
+again:
+ if (argc < 1) {
+usage:
+ printf("usage: %s [ -v ] [ -p #pages ] niter\n", name);
+ exit(1);
+ }
+ if (strcmp(*argv, "-p") == 0) {
+ argc--, argv++;
+ if (argc < 1)
+ goto usage;
+ npages = atoi(*argv);
+ if (npages <= 0) {
+ printf("%s: Bad page count.\n", *argv);
+ exit(2);
+ }
+ argc--, argv++;
+ goto again;
+ }
+ if (strcmp(*argv, "-v") == 0) {
+ argc--, argv++;
+ vflag++;
+ goto again;
+ }
+ niter = atoi(*argv);
+ pagesize = getpagesize();
+ pages = valloc(npages * pagesize);
+ if (pages == (char *)0) {
+ printf("Can't allocate %d pages (%2.1f megabytes).\n",
+ npages, (npages * pagesize) / (1024. * 1024.));
+ exit(3);
+ }
+ lastpage = pages + (npages * pagesize);
+ if (vflag)
+ vadvise(VA_SEQL);
+ for (i = 0; i < niter; i++)
+ for (pf = pages; pf < lastpage; pf += pagesize)
+ *pf = 1;
+}
+.vE
+.SH
+randpage
+.LP
+.vS
+/*
+ * Random page access benchmark.
+ */
+#include <sys/vadvise.h>
+
+char *valloc();
+int rand();
+
+main(argc, argv)
+ char *argv[];
+{
+ register int npages = 4096, pagesize, pn, i, niter;
+ int vflag = 0, debug = 0;
+ char *pages, *name;
+
+ name = argv[0];
+ argc--, argv++;
+again:
+ if (argc < 1) {
+usage:
+ printf("usage: %s [ -d ] [ -v ] [ -p #pages ] niter\n", name);
+ exit(1);
+ }
+ if (strcmp(*argv, "-p") == 0) {
+ argc--, argv++;
+ if (argc < 1)
+ goto usage;
+ npages = atoi(*argv);
+ if (npages <= 0) {
+ printf("%s: Bad page count.\n", *argv);
+ exit(2);
+ }
+ argc--, argv++;
+ goto again;
+ }
+ if (strcmp(*argv, "-v") == 0) {
+ argc--, argv++;
+ vflag++;
+ goto again;
+ }
+ if (strcmp(*argv, "-d") == 0) {
+ argc--, argv++;
+ debug++;
+ goto again;
+ }
+ niter = atoi(*argv);
+ pagesize = getpagesize();
+ pages = valloc(npages * pagesize);
+ if (pages == (char *)0) {
+ printf("Can't allocate %d pages (%2.1f megabytes).\n",
+ npages, (npages * pagesize) / (1024. * 1024.));
+ exit(3);
+ }
+ if (vflag)
+ vadvise(VA_ANOM);
+ for (i = 0; i < niter; i++) {
+ pn = random() % npages;
+ if (debug)
+ printf("touch page %d\n", pn);
+ pages[pagesize * pn] = 1;
+ }
+}
+.vE
+.SH
+gausspage
+.LP
+.vS
+/*
+ * Random page access with
+ * a gaussian distribution.
+ *
+ * Allocate a large (zero fill on demand) address
+ * space and fault the pages in a random gaussian
+ * order.
+ */
+
+float sqrt(), log(), rnd(), cos(), gauss();
+char *valloc();
+int rand();
+
+main(argc, argv)
+ char *argv[];
+{
+ register int pn, i, niter, delta;
+ register char *pages;
+ float sd = 10.0;
+ int npages = 4096, pagesize, debug = 0;
+ char *name;
+
+ name = argv[0];
+ argc--, argv++;
+again:
+ if (argc < 1) {
+usage:
+ printf(
+"usage: %s [ -d ] [ -p #pages ] [ -s standard-deviation ] iterations\n", name);
+ exit(1);
+ }
+ if (strcmp(*argv, "-s") == 0) {
+ argc--, argv++;
+ if (argc < 1)
+ goto usage;
+ sscanf(*argv, "%f", &sd);
+ if (sd <= 0) {
+ printf("%s: Bad standard deviation.\n", *argv);
+ exit(2);
+ }
+ argc--, argv++;
+ goto again;
+ }
+ if (strcmp(*argv, "-p") == 0) {
+ argc--, argv++;
+ if (argc < 1)
+ goto usage;
+ npages = atoi(*argv);
+ if (npages <= 0) {
+ printf("%s: Bad page count.\n", *argv);
+ exit(2);
+ }
+ argc--, argv++;
+ goto again;
+ }
+ if (strcmp(*argv, "-d") == 0) {
+ argc--, argv++;
+ debug++;
+ goto again;
+ }
+ niter = atoi(*argv);
+ pagesize = getpagesize();
+ pages = valloc(npages*pagesize);
+ if (pages == (char *)0) {
+ printf("Can't allocate %d pages (%2.1f megabytes).\n",
+ npages, (npages*pagesize) / (1024. * 1024.));
+ exit(3);
+ }
+ pn = 0;
+ for (i = 0; i < niter; i++) {
+ delta = gauss(sd, 0.0);
+ while (pn + delta < 0 || pn + delta > npages)
+ delta = gauss(sd, 0.0);
+ pn += delta;
+ if (debug)
+ printf("touch page %d\n", pn);
+ else
+ pages[pn * pagesize] = 1;
+ }
+}
+
+float
+gauss(sd, mean)
+ float sd, mean;
+{
+ register float qa, qb;
+
+ qa = sqrt(log(rnd()) * -2.0);
+ qb = 3.14159 * rnd();
+ return (qa * cos(qb) * sd + mean);
+}
+
+float
+rnd()
+{
+ static int seed = 1;
+ static int biggest = 0x7fffffff;
+
+ return ((float)rand(seed) / (float)biggest);
+}
+.vE
diff --git a/share/doc/papers/sysperf/a2.t b/share/doc/papers/sysperf/a2.t
new file mode 100644
index 000000000000..cc2fff5a6ade
--- /dev/null
+++ b/share/doc/papers/sysperf/a2.t
@@ -0,0 +1,111 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.SH
+run (shell script)
+.LP
+.vS
+#! /bin/csh -fx
+# Script to run benchmark programs.
+#
+date
+make clean; time make
+time syscall 100000
+time seqpage -p 7500 10
+time seqpage -v -p 7500 10
+time randpage -p 7500 30000
+time randpage -v -p 7500 30000
+time gausspage -p 7500 -s 1 30000
+time gausspage -p 7500 -s 10 30000
+time gausspage -p 7500 -s 30 30000
+time gausspage -p 7500 -s 40 30000
+time gausspage -p 7500 -s 50 30000
+time gausspage -p 7500 -s 60 30000
+time gausspage -p 7500 -s 80 30000
+time gausspage -p 7500 -s 10000 30000
+time csw 10000
+time signocsw 10000
+time pipeself 10000 512
+time pipeself 10000 4
+time udgself 10000 512
+time udgself 10000 4
+time pipediscard 10000 512
+time pipediscard 10000 4
+time udgdiscard 10000 512
+time udgdiscard 10000 4
+time pipeback 10000 512
+time pipeback 10000 4
+time udgback 10000 512
+time udgback 10000 4
+size forks
+time forks 1000 0
+time forks 1000 1024
+time forks 1000 102400
+size vforks
+time vforks 1000 0
+time vforks 1000 1024
+time vforks 1000 102400
+countenv
+size nulljob
+time execs 1000 0 nulljob
+time execs 1000 1024 nulljob
+time execs 1000 102400 nulljob
+time vexecs 1000 0 nulljob
+time vexecs 1000 1024 nulljob
+time vexecs 1000 102400 nulljob
+size bigjob
+time execs 1000 0 bigjob
+time execs 1000 1024 bigjob
+time execs 1000 102400 bigjob
+time vexecs 1000 0 bigjob
+time vexecs 1000 1024 bigjob
+time vexecs 1000 102400 bigjob
+# fill environment with ~1024 bytes
+setenv a 012345678901234567890123456789012345678901234567890123456780123456789
+setenv b 012345678901234567890123456789012345678901234567890123456780123456789
+setenv c 012345678901234567890123456789012345678901234567890123456780123456789
+setenv d 012345678901234567890123456789012345678901234567890123456780123456789
+setenv e 012345678901234567890123456789012345678901234567890123456780123456789
+setenv f 012345678901234567890123456789012345678901234567890123456780123456789
+setenv g 012345678901234567890123456789012345678901234567890123456780123456789
+setenv h 012345678901234567890123456789012345678901234567890123456780123456789
+setenv i 012345678901234567890123456789012345678901234567890123456780123456789
+setenv j 012345678901234567890123456789012345678901234567890123456780123456789
+setenv k 012345678901234567890123456789012345678901234567890123456780123456789
+setenv l 012345678901234567890123456789012345678901234567890123456780123456789
+setenv m 012345678901234567890123456789012345678901234567890123456780123456789
+setenv n 012345678901234567890123456789012345678901234567890123456780123456789
+setenv o 012345678901234567890123456789012345678901234567890123456780123456789
+countenv
+time execs 1000 0 nulljob
+time execs 1000 1024 nulljob
+time execs 1000 102400 nulljob
+time execs 1000 0 bigjob
+time execs 1000 1024 bigjob
+time execs 1000 102400 bigjob
+.vE
+.bp
diff --git a/share/doc/papers/sysperf/appendix.ms b/share/doc/papers/sysperf/appendix.ms
new file mode 100644
index 000000000000..354c9287a6a6
--- /dev/null
+++ b/share/doc/papers/sysperf/appendix.ms
@@ -0,0 +1,1026 @@
+.am vS
+..
+.am vE
+..
+'ss 23
+'ds _ \d\(mi\u
+'ps 9z
+'vs 10p
+'ds - \(mi
+'ds / \\h'\\w' 'u-\\w'/'u'/
+'ds /* \\h'\\w' 'u-\\w'/'u'/*
+'bd B 3
+'bd S B 3
+'nr cm 0
+'nf
+'de vH
+'ev 2
+'ft 1
+'sp .35i
+'tl '\s14\f3\\*(=F\fP\s0'\\*(=H'\f3\s14\\*(=F\fP\s0'
+'sp .25i
+'ft 1
+\f2\s12\h'\\n(.lu-\w'\\*(=f'u'\\*(=f\fP\s0\h'|0u'
+.sp .05i
+'ev
+'ds =G \\*(=F
+..
+'de vF
+'ev 2
+'sp .35i
+'ie o 'tl '\f2\\*(=M''Page % of \\*(=G\fP'
+'el 'tl '\f2Page % of \\*(=G''\\*(=M\fP'
+'bp
+'ev
+'ft 1
+'if \\n(cm=1 'ft 2
+..
+'de ()
+'pn 1
+..
+'de +C
+'nr cm 1
+'ft 2
+'ds +K
+'ds -K
+..
+'de -C
+'nr cm 0
+'ft 1
+'ds +K \f3
+'ds -K \fP
+..
+'+C
+'-C
+'am +C
+'ne 3
+..
+'de FN
+\f2\s14\h'\\n(.lu-\w'\\$1'u'\\$1\fP\s0\h'|0u'\c
+.if r x .if \\nx .if d =F .tm \\$1 \\*(=F \\n%
+'ds =f \&...\\$1
+..
+'de FC
+.if r x .if \\nx .if d =F .tm \\$1 \\*(=F \\n%
+'ds =f \&...\\$1
+..
+'de -F
+'rm =f
+..
+'ft 1
+'lg 0
+'-F
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.ds RH Appendix A \- Benchmark sources
+.nr H2 1
+.sp 2
+.de vS
+.nf
+..
+.de vE
+.fi
+..
+.bp
+.SH
+\s+2Appendix A \- Benchmark sources\s-2
+.LP
+The programs shown here run under 4.2 with only routines
+from the standard libraries. When run under 4.1 they were augmented
+with a \fIgetpagesize\fP routine and a copy of the \fIrandom\fP
+function from the C library. The \fIvforks\fP and \fIvexecs\fP
+programs are constructed from the \fIforks\fP and \fIexecs\fP programs,
+respectively, by substituting calls to \fIfork\fP with calls to
+\fIvfork\fP.
+.SH
+syscall
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP System call overhead benchmark\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K ncalls;
+
+\h'|11n'\*(+Kif\*(-K (argc < 2) \*(+K{\*(-K
+\h'|21n'printf("usage: %s #syscalls\en", argv[0]);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'ncalls = atoi(argv[1]);
+\h'|11n'\*(+Kwhile\*(-K (ncalls\*-\*- > 0)
+\h'|21n'(\*(+Kvoid\*(-K) getpid();
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+csw
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Context switching benchmark\&.
+ \fI*\fP
+ \fI*\fP Force system to context switch 2\fI*\fPnsigs
+ \fI*\fP times by forking and exchanging signals\&.
+ \fI*\fP To calculate system overhead for a context
+ \fI*\fP switch, the signocsw program must be run
+ \fI*\fP with nsigs\&. Overhead is then estimated by
+ \fI*\fP\h'|11n't1 = time csw <n>
+ \fI*\fP\h'|11n't2 = time signocsw <n>
+ \fI*\fP\h'|11n'overhead = t1 \*- 2 \fI*\fP t2;
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+K#include\*(-K <signal\&.h>
+
+\*(+Kint\*(-K\h'|11n'sigsub();
+\*(+Kint\*(-K\h'|11n'otherpid;
+\*(+Kint\*(-K\h'|11n'nsigs;
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kint\*(-K pid;
+
+\h'|11n'\*(+Kif\*(-K (argc < 2) \*(+K{\*(-K
+\h'|21n'printf("usage: %s nsignals\en", argv[0]);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'nsigs = atoi(argv[1]);
+\h'|11n'signal(SIGALRM, sigsub);
+\h'|11n'otherpid = getpid();
+\h'|11n'pid = fork();
+\h'|11n'\*(+Kif\*(-K (pid != 0) \*(+K{\*(-K
+\h'|21n'otherpid = pid;
+\h'|21n'kill(otherpid, SIGALRM);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kfor\*(-K (;;)
+\h'|21n'sigpause(0);
+\*(+K}\*(-K\c\c
+'-F
+
+
+'FN sigsub
+sigsub()
+\*(+K{\*(-K
+
+\h'|11n'signal(SIGALRM, sigsub);
+\h'|11n'kill(otherpid, SIGALRM);
+\h'|11n'\*(+Kif\*(-K (\*-\*-nsigs <= 0)
+\h'|21n'exit(0);
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+signocsw
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Signal without context switch benchmark\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+K#include\*(-K <signal\&.h>
+
+\*(+Kint\*(-K\h'|11n'pid;
+\*(+Kint\*(-K\h'|11n'nsigs;
+\*(+Kint\*(-K\h'|11n'sigsub();
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K i;
+
+\h'|11n'\*(+Kif\*(-K (argc < 2) \*(+K{\*(-K
+\h'|21n'printf("usage: %s nsignals\en", argv[0]);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'nsigs = atoi(argv[1]);
+\h'|11n'signal(SIGALRM, sigsub);
+\h'|11n'pid = getpid();
+\h'|11n'\*(+Kfor\*(-K (i = 0; i < nsigs; i++)
+\h'|21n'kill(pid, SIGALRM);
+\*(+K}\*(-K\c\c
+'-F
+
+
+'FN sigsub
+sigsub()
+\*(+K{\*(-K
+
+\h'|11n'signal(SIGALRM, sigsub);
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+pipeself
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP IPC benchmark,
+ \fI*\fP write to self using pipes\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kchar\*(-K buf[512];
+\h'|11n'\*(+Kint\*(-K fd[2], msgsize;
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K i, iter;
+
+\h'|11n'\*(+Kif\*(-K (argc < 3) \*(+K{\*(-K
+\h'|21n'printf("usage: %s iterations message\*-size\en", argv[0]);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'argc\*-\*-, argv++;
+\h'|11n'iter = atoi(\fI*\fPargv);
+\h'|11n'argc\*-\*-, argv++;
+\h'|11n'msgsize = atoi(\fI*\fPargv);
+\h'|11n'\*(+Kif\*(-K (msgsize > \*(+Ksizeof\*(-K (buf) || msgsize <= 0) \*(+K{\*(-K
+\h'|21n'printf("%s: Bad message size\&.\en", \fI*\fPargv);
+\h'|21n'exit(2);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (pipe(fd) < 0) \*(+K{\*(-K
+\h'|21n'perror("pipe");
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kfor\*(-K (i = 0; i < iter; i++) \*(+K{\*(-K
+\h'|21n'write(fd[1], buf, msgsize);
+\h'|21n'read(fd[0], buf, msgsize);
+\h'|11n'\*(+K}\*(-K
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+pipediscard
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP IPC benchmark,
+ \fI*\fP write and discard using pipes\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kchar\*(-K buf[512];
+\h'|11n'\*(+Kint\*(-K fd[2], msgsize;
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K i, iter;
+
+\h'|11n'\*(+Kif\*(-K (argc < 3) \*(+K{\*(-K
+\h'|21n'printf("usage: %s iterations message\*-size\en", argv[0]);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'argc\*-\*-, argv++;
+\h'|11n'iter = atoi(\fI*\fPargv);
+\h'|11n'argc\*-\*-, argv++;
+\h'|11n'msgsize = atoi(\fI*\fPargv);
+\h'|11n'\*(+Kif\*(-K (msgsize > \*(+Ksizeof\*(-K (buf) || msgsize <= 0) \*(+K{\*(-K
+\h'|21n'printf("%s: Bad message size\&.\en", \fI*\fPargv);
+\h'|21n'exit(2);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (pipe(fd) < 0) \*(+K{\*(-K
+\h'|21n'perror("pipe");
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (fork() == 0)
+\h'|21n'\*(+Kfor\*(-K (i = 0; i < iter; i++)
+\h'|31n'read(fd[0], buf, msgsize);
+\h'|11n'\*(+Kelse\*(-K
+\h'|21n'\*(+Kfor\*(-K (i = 0; i < iter; i++)
+\h'|31n'write(fd[1], buf, msgsize);
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+pipeback
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP IPC benchmark,
+ \fI*\fP read and reply using pipes\&.
+ \fI*\fP
+ \fI*\fP Process forks and exchanges messages
+ \fI*\fP over a pipe in a request\*-response fashion\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kchar\*(-K buf[512];
+\h'|11n'\*(+Kint\*(-K fd[2], fd2[2], msgsize;
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K i, iter;
+
+\h'|11n'\*(+Kif\*(-K (argc < 3) \*(+K{\*(-K
+\h'|21n'printf("usage: %s iterations message\*-size\en", argv[0]);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'argc\*-\*-, argv++;
+\h'|11n'iter = atoi(\fI*\fPargv);
+\h'|11n'argc\*-\*-, argv++;
+\h'|11n'msgsize = atoi(\fI*\fPargv);
+\h'|11n'\*(+Kif\*(-K (msgsize > \*(+Ksizeof\*(-K (buf) || msgsize <= 0) \*(+K{\*(-K
+\h'|21n'printf("%s: Bad message size\&.\en", \fI*\fPargv);
+\h'|21n'exit(2);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (pipe(fd) < 0) \*(+K{\*(-K
+\h'|21n'perror("pipe");
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (pipe(fd2) < 0) \*(+K{\*(-K
+\h'|21n'perror("pipe");
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (fork() == 0)
+\h'|21n'\*(+Kfor\*(-K (i = 0; i < iter; i++) \*(+K{\*(-K
+\h'|31n'read(fd[0], buf, msgsize);
+\h'|31n'write(fd2[1], buf, msgsize);
+\h'|21n'\*(+K}\*(-K
+\h'|11n'\*(+Kelse\*(-K
+\h'|21n'\*(+Kfor\*(-K (i = 0; i < iter; i++) \*(+K{\*(-K
+\h'|31n'write(fd[1], buf, msgsize);
+\h'|31n'read(fd2[0], buf, msgsize);
+\h'|21n'\*(+K}\*(-K
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+forks
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Benchmark program to calculate fork+wait
+ \fI*\fP overhead (approximately)\&. Process
+ \fI*\fP forks and exits while parent waits\&.
+ \fI*\fP The time to run this program is used
+ \fI*\fP in calculating exec overhead\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K nforks, i;
+\h'|11n'\*(+Kchar\*(-K \fI*\fPcp;
+\h'|11n'\*(+Kint\*(-K pid, child, status, brksize;
+
+\h'|11n'\*(+Kif\*(-K (argc < 2) \*(+K{\*(-K
+\h'|21n'printf("usage: %s number\*-of\*-forks sbrk\*-size\en", argv[0]);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'nforks = atoi(argv[1]);
+\h'|11n'\*(+Kif\*(-K (nforks < 0) \*(+K{\*(-K
+\h'|21n'printf("%s: bad number of forks\en", argv[1]);
+\h'|21n'exit(2);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'brksize = atoi(argv[2]);
+\h'|11n'\*(+Kif\*(-K (brksize < 0) \*(+K{\*(-K
+\h'|21n'printf("%s: bad size to sbrk\en", argv[2]);
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'cp = (\*(+Kchar\*(-K \fI*\fP)sbrk(brksize);
+\h'|11n'\*(+Kif\*(-K ((\*(+Kint\*(-K)cp == \*-1) \*(+K{\*(-K
+\h'|21n'perror("sbrk");
+\h'|21n'exit(4);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kfor\*(-K (i = 0; i < brksize; i += 1024)
+\h'|21n'cp[i] = i;
+\h'|11n'\*(+Kwhile\*(-K (nforks\*-\*- > 0) \*(+K{\*(-K
+\h'|21n'child = fork();
+\h'|21n'\*(+Kif\*(-K (child == \*-1) \*(+K{\*(-K
+\h'|31n'perror("fork");
+\h'|31n'exit(\*-1);
+\h'|21n'\*(+K}\*(-K
+\h'|21n'\*(+Kif\*(-K (child == 0)
+\h'|31n'\*_exit(\*-1);
+\h'|21n'\*(+Kwhile\*(-K ((pid = wait(&status)) != \*-1 && pid != child)
+\h'|31n';
+\h'|11n'\*(+K}\*(-K
+\h'|11n'exit(0);
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+execs
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Benchmark program to calculate exec
+ \fI*\fP overhead (approximately)\&. Process
+ \fI*\fP forks and execs "null" test program\&.
+ \fI*\fP The time to run the fork program should
+ \fI*\fP then be deducted from this one to
+ \fI*\fP estimate the overhead for the exec\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K nexecs, i;
+\h'|11n'\*(+Kchar\*(-K \fI*\fPcp, \fI*\fPsbrk();
+\h'|11n'\*(+Kint\*(-K pid, child, status, brksize;
+
+\h'|11n'\*(+Kif\*(-K (argc < 3) \*(+K{\*(-K
+\h'|21n'printf("usage: %s number\*-of\*-execs sbrk\*-size job\*-name\en",
+\h'|21n' argv[0]);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'nexecs = atoi(argv[1]);
+\h'|11n'\*(+Kif\*(-K (nexecs < 0) \*(+K{\*(-K
+\h'|21n'printf("%s: bad number of execs\en", argv[1]);
+\h'|21n'exit(2);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'brksize = atoi(argv[2]);
+\h'|11n'\*(+Kif\*(-K (brksize < 0) \*(+K{\*(-K
+\h'|21n'printf("%s: bad size to sbrk\en", argv[2]);
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'cp = sbrk(brksize);
+\h'|11n'\*(+Kif\*(-K ((\*(+Kint\*(-K)cp == \*-1) \*(+K{\*(-K
+\h'|21n'perror("sbrk");
+\h'|21n'exit(4);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kfor\*(-K (i = 0; i < brksize; i += 1024)
+\h'|21n'cp[i] = i;
+\h'|11n'\*(+Kwhile\*(-K (nexecs\*-\*- > 0) \*(+K{\*(-K
+\h'|21n'child = fork();
+\h'|21n'\*(+Kif\*(-K (child == \*-1) \*(+K{\*(-K
+\h'|31n'perror("fork");
+\h'|31n'exit(\*-1);
+\h'|21n'\*(+K}\*(-K
+\h'|21n'\*(+Kif\*(-K (child == 0) \*(+K{\*(-K
+\h'|31n'execv(argv[3], argv);
+\h'|31n'perror("execv");
+\h'|31n'\*_exit(\*-1);
+\h'|21n'\*(+K}\*(-K
+\h'|21n'\*(+Kwhile\*(-K ((pid = wait(&status)) != \*-1 && pid != child)
+\h'|31n';
+\h'|11n'\*(+K}\*(-K
+\h'|11n'exit(0);
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+nulljob
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Benchmark "null job" program\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+
+\h'|11n'exit(0);
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+bigjob
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Benchmark "null big job" program\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+ 250 here is intended to approximate vi\'s text+data size \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+Kchar\*(-K\h'|11n'space[1024 \fI*\fP 250] = "force into data segment";
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+
+\h'|11n'exit(0);
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.bp
+.SH
+seqpage
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Sequential page access benchmark\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+K#include\*(-K <sys\fI\h'\w' 'u-\w'/'u'/\fPvadvise\&.h>
+
+\*(+Kchar\*(-K\h'|11n'\fI*\fPvalloc();
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kregister\*(-K i, niter;
+\h'|11n'\*(+Kregister\*(-K \*(+Kchar\*(-K \fI*\fPpf, \fI*\fPlastpage;
+\h'|11n'\*(+Kint\*(-K npages = 4096, pagesize, vflag = 0;
+\h'|11n'\*(+Kchar\*(-K \fI*\fPpages, \fI*\fPname;
+
+\h'|11n'name = argv[0];
+\h'|11n'argc\*-\*-, argv++;
+again:
+\h'|11n'\*(+Kif\*(-K (argc < 1) \*(+K{\*(-K
+usage:
+\h'|21n'printf("usage: %s [ \*-v ] [ \*-p #pages ] niter\en", name);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (strcmp(\fI*\fPargv, "\*-p") == 0) \*(+K{\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'\*(+Kif\*(-K (argc < 1)
+\h'|31n'\*(+Kgoto\*(-K usage;
+\h'|21n'npages = atoi(\fI*\fPargv);
+\h'|21n'\*(+Kif\*(-K (npages <= 0) \*(+K{\*(-K
+\h'|31n'printf("%s: Bad page count\&.\en", \fI*\fPargv);
+\h'|31n'exit(2);
+\h'|21n'\*(+K}\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'\*(+Kgoto\*(-K again;
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (strcmp(\fI*\fPargv, "\*-v") == 0) \*(+K{\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'vflag++;
+\h'|21n'\*(+Kgoto\*(-K again;
+\h'|11n'\*(+K}\*(-K
+\h'|11n'niter = atoi(\fI*\fPargv);
+\h'|11n'pagesize = getpagesize();
+\h'|11n'pages = valloc(npages \fI*\fP pagesize);
+\h'|11n'\*(+Kif\*(-K (pages == (\*(+Kchar\*(-K \fI*\fP)0) \*(+K{\*(-K
+\h'|21n'printf("Can\'t allocate %d pages (%2\&.1f megabytes)\&.\en",
+\h'|21n' npages, (npages \fI*\fP pagesize) \fI\h'\w' 'u-\w'/'u'/\fP (1024\&. \fI*\fP 1024\&.));
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'lastpage = pages + (npages \fI*\fP pagesize);
+\h'|11n'\*(+Kif\*(-K (vflag)
+\h'|21n'vadvise(VA\*_SEQL);
+\h'|11n'\*(+Kfor\*(-K (i = 0; i < niter; i++)
+\h'|21n'\*(+Kfor\*(-K (pf = pages; pf < lastpage; pf += pagesize)
+\h'|31n'\fI*\fPpf = 1;
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+randpage
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Random page access benchmark\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+\*(+K#include\*(-K <sys\fI\h'\w' 'u-\w'/'u'/\fPvadvise\&.h>
+
+\*(+Kchar\*(-K\h'|11n'\fI*\fPvalloc();
+\*(+Kint\*(-K\h'|11n'rand();
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K npages = 4096, pagesize, pn, i, niter;
+\h'|11n'\*(+Kint\*(-K vflag = 0, debug = 0;
+\h'|11n'\*(+Kchar\*(-K \fI*\fPpages, \fI*\fPname;
+
+\h'|11n'name = argv[0];
+\h'|11n'argc\*-\*-, argv++;
+again:
+\h'|11n'\*(+Kif\*(-K (argc < 1) \*(+K{\*(-K
+usage:
+\h'|21n'printf("usage: %s [ \*-d ] [ \*-v ] [ \*-p #pages ] niter\en", name);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (strcmp(\fI*\fPargv, "\*-p") == 0) \*(+K{\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'\*(+Kif\*(-K (argc < 1)
+\h'|31n'\*(+Kgoto\*(-K usage;
+\h'|21n'npages = atoi(\fI*\fPargv);
+\h'|21n'\*(+Kif\*(-K (npages <= 0) \*(+K{\*(-K
+\h'|31n'printf("%s: Bad page count\&.\en", \fI*\fPargv);
+\h'|31n'exit(2);
+\h'|21n'\*(+K}\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'\*(+Kgoto\*(-K again;
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (strcmp(\fI*\fPargv, "\*-v") == 0) \*(+K{\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'vflag++;
+\h'|21n'\*(+Kgoto\*(-K again;
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (strcmp(\fI*\fPargv, "\*-d") == 0) \*(+K{\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'debug++;
+\h'|21n'\*(+Kgoto\*(-K again;
+\h'|11n'\*(+K}\*(-K
+\h'|11n'niter = atoi(\fI*\fPargv);
+\h'|11n'pagesize = getpagesize();
+\h'|11n'pages = valloc(npages \fI*\fP pagesize);
+\h'|11n'\*(+Kif\*(-K (pages == (\*(+Kchar\*(-K \fI*\fP)0) \*(+K{\*(-K
+\h'|21n'printf("Can\'t allocate %d pages (%2\&.1f megabytes)\&.\en",
+\h'|21n' npages, (npages \fI*\fP pagesize) \fI\h'\w' 'u-\w'/'u'/\fP (1024\&. \fI*\fP 1024\&.));
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (vflag)
+\h'|21n'vadvise(VA\*_ANOM);
+\h'|11n'\*(+Kfor\*(-K (i = 0; i < niter; i++) \*(+K{\*(-K
+\h'|21n'pn = random() % npages;
+\h'|21n'\*(+Kif\*(-K (debug)
+\h'|31n'printf("touch page %d\en", pn);
+\h'|21n'pages[pagesize \fI*\fP pn] = 1;
+\h'|11n'\*(+K}\*(-K
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+.SH
+gausspage
+.LP
+.vS
+\fI\h'\w' 'u-\w'/'u'/\fP\fI*\fP\c\c
+'+C
+
+ \fI*\fP Random page access with
+ \fI*\fP a gaussian distribution\&.
+ \fI*\fP
+ \fI*\fP Allocate a large (zero fill on demand) address
+ \fI*\fP space and fault the pages in a random gaussian
+ \fI*\fP order\&.
+ \fI*\fP\fI\h'\w' 'u-\w'/'u'/\fP\c
+'-C
+
+
+\*(+Kfloat\*(-K\h'|11n'sqrt(), log(), rnd(), cos(), gauss();
+\*(+Kchar\*(-K\h'|11n'\fI*\fPvalloc();
+\*(+Kint\*(-K\h'|11n'rand();
+
+'FN main
+main(argc, argv)
+\h'|11n'\*(+Kchar\*(-K \fI*\fPargv[];
+\*(+K{\*(-K
+\h'|11n'\*(+Kregister\*(-K \*(+Kint\*(-K pn, i, niter, delta;
+\h'|11n'\*(+Kregister\*(-K \*(+Kchar\*(-K \fI*\fPpages;
+\h'|11n'\*(+Kfloat\*(-K sd = 10\&.0;
+\h'|11n'\*(+Kint\*(-K npages = 4096, pagesize, debug = 0;
+\h'|11n'\*(+Kchar\*(-K \fI*\fPname;
+
+\h'|11n'name = argv[0];
+\h'|11n'argc\*-\*-, argv++;
+again:
+\h'|11n'\*(+Kif\*(-K (argc < 1) \*(+K{\*(-K
+usage:
+\h'|21n'printf(
+"usage: %s [ \*-d ] [ \*-p #pages ] [ \*-s standard\*-deviation ] iterations\en", name);
+\h'|21n'exit(1);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (strcmp(\fI*\fPargv, "\*-s") == 0) \*(+K{\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'\*(+Kif\*(-K (argc < 1)
+\h'|31n'\*(+Kgoto\*(-K usage;
+\h'|21n'sscanf(\fI*\fPargv, "%f", &sd);
+\h'|21n'\*(+Kif\*(-K (sd <= 0) \*(+K{\*(-K
+\h'|31n'printf("%s: Bad standard deviation\&.\en", \fI*\fPargv);
+\h'|31n'exit(2);
+\h'|21n'\*(+K}\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'\*(+Kgoto\*(-K again;
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (strcmp(\fI*\fPargv, "\*-p") == 0) \*(+K{\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'\*(+Kif\*(-K (argc < 1)
+\h'|31n'\*(+Kgoto\*(-K usage;
+\h'|21n'npages = atoi(\fI*\fPargv);
+\h'|21n'\*(+Kif\*(-K (npages <= 0) \*(+K{\*(-K
+\h'|31n'printf("%s: Bad page count\&.\en", \fI*\fPargv);
+\h'|31n'exit(2);
+\h'|21n'\*(+K}\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'\*(+Kgoto\*(-K again;
+\h'|11n'\*(+K}\*(-K
+\h'|11n'\*(+Kif\*(-K (strcmp(\fI*\fPargv, "\*-d") == 0) \*(+K{\*(-K
+\h'|21n'argc\*-\*-, argv++;
+\h'|21n'debug++;
+\h'|21n'\*(+Kgoto\*(-K again;
+\h'|11n'\*(+K}\*(-K
+\h'|11n'niter = atoi(\fI*\fPargv);
+\h'|11n'pagesize = getpagesize();
+\h'|11n'pages = valloc(npages\fI*\fPpagesize);
+\h'|11n'\*(+Kif\*(-K (pages == (\*(+Kchar\*(-K \fI*\fP)0) \*(+K{\*(-K
+\h'|21n'printf("Can\'t allocate %d pages (%2\&.1f megabytes)\&.\en",
+\h'|21n' npages, (npages\fI*\fPpagesize) \fI\h'\w' 'u-\w'/'u'/\fP (1024\&. \fI*\fP 1024\&.));
+\h'|21n'exit(3);
+\h'|11n'\*(+K}\*(-K
+\h'|11n'pn = 0;
+\h'|11n'\*(+Kfor\*(-K (i = 0; i < niter; i++) \*(+K{\*(-K
+\h'|21n'delta = gauss(sd, 0\&.0);
+\h'|21n'\*(+Kwhile\*(-K (pn + delta < 0 || pn + delta > npages)
+\h'|31n'delta = gauss(sd, 0\&.0);
+\h'|21n'pn += delta;
+\h'|21n'\*(+Kif\*(-K (debug)
+\h'|31n'printf("touch page %d\en", pn);
+\h'|21n'\*(+Kelse\*(-K
+\h'|31n'pages[pn \fI*\fP pagesize] = 1;
+\h'|11n'\*(+K}\*(-K
+\*(+K}\*(-K\c\c
+'-F
+
+
+\*(+Kfloat\*(-K
+'FN gauss
+gauss(sd, mean)
+\h'|11n'\*(+Kfloat\*(-K sd, mean;
+\*(+K{\*(-K
+\h'|11n'\*(+Kregister\*(-K \*(+Kfloat\*(-K qa, qb;
+
+\h'|11n'qa = sqrt(log(rnd()) \fI*\fP \*-2\&.0);
+\h'|11n'qb = 3\&.14159 \fI*\fP rnd();
+\h'|11n'\*(+Kreturn\*(-K (qa \fI*\fP cos(qb) \fI*\fP sd + mean);
+\*(+K}\*(-K\c\c
+'-F
+
+
+\*(+Kfloat\*(-K
+'FN rnd
+rnd()
+\*(+K{\*(-K
+\h'|11n'\*(+Kstatic\*(-K \*(+Kint\*(-K seed = 1;
+\h'|11n'\*(+Kstatic\*(-K \*(+Kint\*(-K biggest = 0x7fffffff;
+
+\h'|11n'\*(+Kreturn\*(-K ((\*(+Kfloat\*(-K)rand(seed) \fI\h'\w' 'u-\w'/'u'/\fP (\*(+Kfloat\*(-K)biggest);
+\*(+K}\*(-K\c\c
+'-F
+
+.vE
+'-F
+.am vS
+..
+.am vE
+..
+'ss 23
+'ds _ \d\(mi\u
+'ps 9z
+'vs 10p
+'ds - \(mi
+'ds / \\h'\\w' 'u-\\w'/'u'/
+'ds /* \\h'\\w' 'u-\\w'/'u'/*
+'bd B 3
+'bd S B 3
+'nr cm 0
+'nf
+'de vH
+'ev 2
+'ft 1
+'sp .35i
+'tl '\s14\f3\\*(=F\fP\s0'\\*(=H'\f3\s14\\*(=F\fP\s0'
+'sp .25i
+'ft 1
+\f2\s12\h'\\n(.lu-\w'\\*(=f'u'\\*(=f\fP\s0\h'|0u'
+.sp .05i
+'ev
+'ds =G \\*(=F
+..
+'de vF
+'ev 2
+'sp .35i
+'ie o 'tl '\f2\\*(=M''Page % of \\*(=G\fP'
+'el 'tl '\f2Page % of \\*(=G''\\*(=M\fP'
+'bp
+'ev
+'ft 1
+'if \\n(cm=1 'ft 2
+..
+'de ()
+'pn 1
+..
+'de +C
+'nr cm 1
+'ft 2
+'ds +K
+'ds -K
+..
+'de -C
+'nr cm 0
+'ft 1
+'ds +K \f3
+'ds -K \fP
+..
+'+C
+'-C
+'am +C
+'ne 3
+..
+'de FN
+\f2\s14\h'\\n(.lu-\w'\\$1'u'\\$1\fP\s0\h'|0u'\c
+.if r x .if \\nx .if d =F .tm \\$1 \\*(=F \\n%
+'ds =f \&...\\$1
+..
+'de FC
+.if r x .if \\nx .if d =F .tm \\$1 \\*(=F \\n%
+'ds =f \&...\\$1
+..
+'de -F
+'rm =f
+..
+'ft 1
+'lg 0
+'-F
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.SH
+run (shell script)
+.LP
+.vS
+\*(+K#\*(-K! \fI\h'\w' 'u-\w'/'u'/\fPbin\fI\h'\w' 'u-\w'/'u'/\fPcsh \*-fx
+\*(+K#\*(-K Script to run benchmark programs\&.
+\*(+K#\*(-K
+date
+make clean; time make
+time syscall 100000
+time seqpage \*-p 7500 10
+time seqpage \*-v \*-p 7500 10
+time randpage \*-p 7500 30000
+time randpage \*-v \*-p 7500 30000
+time gausspage \*-p 7500 \*-s 1 30000
+time gausspage \*-p 7500 \*-s 10 30000
+time gausspage \*-p 7500 \*-s 30 30000
+time gausspage \*-p 7500 \*-s 40 30000
+time gausspage \*-p 7500 \*-s 50 30000
+time gausspage \*-p 7500 \*-s 60 30000
+time gausspage \*-p 7500 \*-s 80 30000
+time gausspage \*-p 7500 \*-s 10000 30000
+time csw 10000
+time signocsw 10000
+time pipeself 10000 512
+time pipeself 10000 4
+time udgself 10000 512
+time udgself 10000 4
+time pipediscard 10000 512
+time pipediscard 10000 4
+time udgdiscard 10000 512
+time udgdiscard 10000 4
+time pipeback 10000 512
+time pipeback 10000 4
+time udgback 10000 512
+time udgback 10000 4
+size forks
+time forks 1000 0
+time forks 1000 1024
+time forks 1000 102400
+size vforks
+time vforks 1000 0
+time vforks 1000 1024
+time vforks 1000 102400
+countenv
+size nulljob
+time execs 1000 0 nulljob
+time execs 1000 1024 nulljob
+time execs 1000 102400 nulljob
+time vexecs 1000 0 nulljob
+time vexecs 1000 1024 nulljob
+time vexecs 1000 102400 nulljob
+size bigjob
+time execs 1000 0 bigjob
+time execs 1000 1024 bigjob
+time execs 1000 102400 bigjob
+time vexecs 1000 0 bigjob
+time vexecs 1000 1024 bigjob
+time vexecs 1000 102400 bigjob
+\*(+K#\*(-K fill environment with ~1024 bytes
+setenv a 012345678901234567890123456789012345678901234567890123456780123456789
+setenv b 012345678901234567890123456789012345678901234567890123456780123456789
+setenv c 012345678901234567890123456789012345678901234567890123456780123456789
+setenv d 012345678901234567890123456789012345678901234567890123456780123456789
+setenv e 012345678901234567890123456789012345678901234567890123456780123456789
+setenv f 012345678901234567890123456789012345678901234567890123456780123456789
+setenv g 012345678901234567890123456789012345678901234567890123456780123456789
+setenv h 012345678901234567890123456789012345678901234567890123456780123456789
+setenv i 012345678901234567890123456789012345678901234567890123456780123456789
+setenv j 012345678901234567890123456789012345678901234567890123456780123456789
+setenv k 012345678901234567890123456789012345678901234567890123456780123456789
+setenv l 012345678901234567890123456789012345678901234567890123456780123456789
+setenv m 012345678901234567890123456789012345678901234567890123456780123456789
+setenv n 012345678901234567890123456789012345678901234567890123456780123456789
+setenv o 012345678901234567890123456789012345678901234567890123456780123456789
+countenv
+time execs 1000 0 nulljob
+time execs 1000 1024 nulljob
+time execs 1000 102400 nulljob
+time execs 1000 0 bigjob
+time execs 1000 1024 bigjob
+time execs 1000 102400 bigjob
+.vE
+.bp
+'-F
diff --git a/share/doc/papers/timecounter/Makefile b/share/doc/papers/timecounter/Makefile
new file mode 100644
index 000000000000..a2e7479bbd2a
--- /dev/null
+++ b/share/doc/papers/timecounter/Makefile
@@ -0,0 +1,18 @@
+# You really want:
+# PRINTERDEVICE=ps
+# or you will not get the illustration.
+VOLUME= papers
+DOC= timecounter
+SRCS= tmac.usenix timecounter.ms-patched
+EXTRA= fig1.eps fig2.eps fig3.eps fig4.eps fig5.eps gps.ps intr.ps
+MACROS= -ms
+CLEANFILES= timecounter.ms-patched
+USE_PIC=
+USE_EQN=
+USE_TBL=
+
+timecounter.ms-patched: timecounter.ms
+ sed -E -e 's;(gps|intr).ps;${.CURDIR}/&;' -e 's;fig[0-9].eps;${.CURDIR}/&;' \
+ ${.ALLSRC} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/timecounter/fig1.eps b/share/doc/papers/timecounter/fig1.eps
new file mode 100644
index 000000000000..012fed28d80f
--- /dev/null
+++ b/share/doc/papers/timecounter/fig1.eps
@@ -0,0 +1,227 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: fig1.eps
+%%Creator: fig2dev Version 3.2 Patchlevel 3d
+%%CreationDate: $FreeBSD$
+%%For: phk@critter.freebsd.dk (Poul-Henning Kamp)
+%%BoundingBox: 0 0 191 194
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+newpath 0 194 moveto 0 0 lineto 191 0 lineto 191 194 lineto closepath clip newpath
+-7.6 201.2 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+ /DrawEllipse {
+ /endangle exch def
+ /startangle exch def
+ /yrad exch def
+ /xrad exch def
+ /y exch def
+ /x exch def
+ /savematrix mtrx currentmatrix def
+ x y tr xrad yrad sc 0 0 1 startangle endangle arc
+ closepath
+ savematrix setmatrix
+ } def
+
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+
+$F2psBegin
+10 setmiterlimit
+ 0.06000 0.06000 sc
+%
+% Fig objects follow
+%
+/Times-Roman ff 180.00 scf sf
+750 3300 m
+gs 1 -1 sc (Imprecise) dup sw pop 2 div neg 0 rm col0 sh gr
+15.000 slw
+% Ellipse
+n 750 750 300 300 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 750 750 450 450 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 750 750 600 600 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 750 2250 150 150 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 750 2250 300 300 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 750 2250 450 450 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 750 2250 600 600 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2250 2250 150 150 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2250 2250 300 300 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2250 2250 450 450 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2250 2250 600 600 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2250 750 150 150 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2250 750 300 300 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2250 750 450 450 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2250 750 600 600 0 360 DrawEllipse gs col0 s gr
+
+% Ellipse
+n 2280 2197 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 2152 2212 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 2145 2332 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 2265 2325 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 2370 2295 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 292 2002 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 367 1905 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 390 2040 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 180 1950 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 1965 472 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 2355 517 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 2505 870 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 907 1170 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 1282 1305 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 975 825 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 2071 1074 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 2550 600 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 1350 675 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 1350 1050 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+% Ellipse
+n 225 2100 38 38 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr
+
+/Times-Roman ff 180.00 scf sf
+3300 750 m
+gs 1 -1 sc 90.0 rot (Unstable) dup sw pop 2 div neg 0 rm col0 sh gr
+/Times-Roman ff 180.00 scf sf
+3300 2250 m
+gs 1 -1 sc 90.0 rot (Stable) dup sw pop 2 div neg 0 rm col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2250 3300 m
+gs 1 -1 sc (Precise) dup sw pop 2 div neg 0 rm col0 sh gr
+% Ellipse
+n 750 750 150 150 0 360 DrawEllipse gs col0 s gr
+
+$F2psEnd
+rs
diff --git a/share/doc/papers/timecounter/fig2.eps b/share/doc/papers/timecounter/fig2.eps
new file mode 100644
index 000000000000..67714356dccd
--- /dev/null
+++ b/share/doc/papers/timecounter/fig2.eps
@@ -0,0 +1,150 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: fig2.eps
+%%Creator: fig2dev Version 3.2 Patchlevel 3d
+%%CreationDate: $FreeBSD$
+%%For: phk@critter.freebsd.dk (Poul-Henning Kamp)
+%%BoundingBox: 0 0 191 194
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+newpath 0 194 moveto 0 0 lineto 191 0 lineto 191 194 lineto closepath clip newpath
+-7.7 201.2 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+
+$F2psBegin
+10 setmiterlimit
+ 0.06000 0.06000 sc
+%
+% Fig objects follow
+%
+/Times-Roman ff 180.00 scf sf
+750 3300 m
+gs 1 -1 sc (Imprecise) dup sw pop 2 div neg 0 rm col0 sh gr
+% Polyline
+15.000 slw
+n 150 750 m
+ 1350 750 l gs 0.00 setgray ef gr gs col0 s gr
+% Polyline
+n 1650 150 m
+ 1650 1350 l gs 0.00 setgray ef gr gs col0 s gr
+% Polyline
+n 1650 750 m
+ 2850 750 l gs 0.00 setgray ef gr gs col0 s gr
+% Polyline
+n 1650 1650 m
+ 1650 2850 l gs 0.00 setgray ef gr gs col0 s gr
+% Polyline
+n 1650 2250 m
+ 2850 2250 l gs 0.00 setgray ef gr gs col0 s gr
+% Polyline
+n 150 1650 m
+ 150 2850 l gs 0.00 setgray ef gr gs col0 s gr
+% Polyline
+n 150 2250 m
+ 1350 2250 l gs 0.00 setgray ef gr gs col0 s gr
+% Polyline
+n 1665 2205 m 1792 2182 l 1942 2220 l 2100 2295 l 2257 2212 l 2392 2205 l
+ 2460 2280 l 2520 2295 l 2617 2197 l
+ 2850 2212 l gs col0 s gr
+% Polyline
+n 165 2565 m 360 2490 l 487 2362 l 615 2347 l 705 2250 l 825 2212 l
+ 915 2130 l 1057 2085 l 1155 1980 l 1237 1972 l 1297 1920 l
+
+ 1342 1897 l gs col0 s gr
+% Polyline
+n 1657 465 m 1770 637 l 1927 705 l 2002 1020 l 2107 862 l 2190 525 l
+ 2227 652 l 2272 555 l 2362 982 l 2475 1147 l 2512 832 l
+ 2557 427 l 2587 502 l 2647 277 l 2677 630 l 2775 967 l
+
+ 2850 525 l gs col0 s gr
+% Polyline
+n 150 232 m 352 307 l 375 637 l 562 577 l 622 982 l 690 622 l
+ 780 870 l 885 622 l 945 1207 l 1035 952 l 1080 1140 l
+ 1140 1080 l 1192 1372 l
+ 1350 1185 l gs col0 s gr
+/Times-Roman ff 180.00 scf sf
+3300 750 m
+gs 1 -1 sc 90.0 rot (Unstable) dup sw pop 2 div neg 0 rm col0 sh gr
+/Times-Roman ff 180.00 scf sf
+3300 2250 m
+gs 1 -1 sc 90.0 rot (Stable) dup sw pop 2 div neg 0 rm col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2250 3300 m
+gs 1 -1 sc (Precise) dup sw pop 2 div neg 0 rm col0 sh gr
+% Polyline
+n 150 150 m
+ 150 1350 l gs 0.00 setgray ef gr gs col0 s gr
+$F2psEnd
+rs
diff --git a/share/doc/papers/timecounter/fig3.eps b/share/doc/papers/timecounter/fig3.eps
new file mode 100644
index 000000000000..997282374dcf
--- /dev/null
+++ b/share/doc/papers/timecounter/fig3.eps
@@ -0,0 +1,126 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: fig3.eps
+%%Creator: fig2dev Version 3.2 Patchlevel 3d
+%%CreationDate: $FreeBSD$
+%%For: phk@critter.freebsd.dk (Poul-Henning Kamp)
+%%BoundingBox: 0 0 181 56
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+newpath 0 56 moveto 0 0 lineto 181 0 lineto 181 56 lineto closepath clip newpath
+-16.7 81.0 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+
+$F2psBegin
+10 setmiterlimit
+ 0.06000 0.06000 sc
+%
+% Fig objects follow
+%
+% Polyline
+7.500 slw
+gs clippath
+1740 780 m 1740 720 l 1588 720 l 1708 750 l 1588 780 l cp
+eoclip
+n 1200 750 m
+ 1725 750 l gs col0 s gr gr
+
+% arrowhead
+n 1588 780 m 1708 750 l 1588 720 l col0 s
+% Arc
+n 900.0 750.0 150.0 180.0 0.0 arcn
+gs col0 s gr
+
+% Polyline
+15.000 slw
+n 300 450 m 1200 450 l 1200 1050 l 300 1050 l
+ cp gs col0 s gr
+% Arc
+7.500 slw
+n 600.0 750.0 150.0 180.0 0.0 arc
+gs col0 s gr
+
+% Polyline
+15.000 slw
+n 1725 600 m 3225 600 l 3225 900 l 1725 900 l
+ cp gs col0 s gr
+/Times-Roman ff 180.00 scf sf
+1725 1350 m
+gs 1 -1 sc (Oscillator + Counter = Clock) dup sw pop 2 div neg 0 rm col0 sh gr
+/Helvetica-Bold ff 180.00 scf sf
+2475 825 m
+gs 1 -1 sc (1 0 3 7 5 4 2 5 0 0) dup sw pop 2 div neg 0 rm col0 sh gr
+$F2psEnd
+rs
diff --git a/share/doc/papers/timecounter/fig4.eps b/share/doc/papers/timecounter/fig4.eps
new file mode 100644
index 000000000000..7a5684f26a71
--- /dev/null
+++ b/share/doc/papers/timecounter/fig4.eps
@@ -0,0 +1,259 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: fig4.eps
+%%Creator: fig2dev Version 3.2 Patchlevel 3d
+%%CreationDate: $FreeBSD$
+%%For: phk@critter.freebsd.dk (Poul-Henning Kamp)
+%%BoundingBox: 0 0 119 203
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+newpath 0 203 moveto 0 0 lineto 119 0 lineto 119 203 lineto closepath clip newpath
+-8.3 207.7 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+
+$F2psBegin
+10 setmiterlimit
+ 0.06000 0.06000 sc
+%
+% Fig objects follow
+%
+/Times-Roman ff 180.00 scf sf
+300 450 m
+gs 1 -1 sc (*volatile timehands;) col0 sh gr
+% Polyline
+7.500 slw
+n 1005 750 m 900 750 900 1095 105 arcto 4 {pop} repeat
+ 900 1200 1245 1200 105 arcto 4 {pop} repeat
+ 1350 1200 1350 855 105 arcto 4 {pop} repeat
+ 1350 750 1005 750 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+n 1755 750 m 1650 750 1650 1095 105 arcto 4 {pop} repeat
+ 1650 1200 1995 1200 105 arcto 4 {pop} repeat
+ 2100 1200 2100 855 105 arcto 4 {pop} repeat
+ 2100 750 1755 750 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+n 1755 1500 m 1650 1500 1650 1845 105 arcto 4 {pop} repeat
+ 1650 1950 1995 1950 105 arcto 4 {pop} repeat
+ 2100 1950 2100 1605 105 arcto 4 {pop} repeat
+ 2100 1500 1755 1500 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+n 1755 2250 m 1650 2250 1650 2595 105 arcto 4 {pop} repeat
+ 1650 2700 1995 2700 105 arcto 4 {pop} repeat
+ 2100 2700 2100 2355 105 arcto 4 {pop} repeat
+ 2100 2250 1755 2250 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+n 1755 3000 m 1650 3000 1650 3345 105 arcto 4 {pop} repeat
+ 1650 3450 1995 3450 105 arcto 4 {pop} repeat
+ 2100 3450 2100 3105 105 arcto 4 {pop} repeat
+ 2100 3000 1755 3000 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+n 1005 3000 m 900 3000 900 3345 105 arcto 4 {pop} repeat
+ 900 3450 1245 3450 105 arcto 4 {pop} repeat
+ 1350 3450 1350 3105 105 arcto 4 {pop} repeat
+ 1350 3000 1005 3000 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+n 255 3000 m 150 3000 150 3345 105 arcto 4 {pop} repeat
+ 150 3450 495 3450 105 arcto 4 {pop} repeat
+ 600 3450 600 3105 105 arcto 4 {pop} repeat
+ 600 3000 255 3000 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+n 255 2250 m 150 2250 150 2595 105 arcto 4 {pop} repeat
+ 150 2700 495 2700 105 arcto 4 {pop} repeat
+ 600 2700 600 2355 105 arcto 4 {pop} repeat
+ 600 2250 255 2250 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+n 255 1500 m 150 1500 150 1845 105 arcto 4 {pop} repeat
+ 150 1950 495 1950 105 arcto 4 {pop} repeat
+ 600 1950 600 1605 105 arcto 4 {pop} repeat
+ 600 1500 255 1500 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+% Polyline
+gs clippath
+915 1005 m 915 945 l 763 945 l 883 975 l 763 1005 l cp
+eoclip
+n 600 975 m
+ 900 975 l gs col0 s gr gr
+
+% arrowhead
+n 763 1005 m 883 975 l 763 945 l col0 s
+% Polyline
+gs clippath
+1665 1005 m 1665 945 l 1513 945 l 1633 975 l 1513 1005 l cp
+eoclip
+n 1350 975 m
+ 1650 975 l gs col0 s gr gr
+
+% arrowhead
+n 1513 1005 m 1633 975 l 1513 945 l col0 s
+% Polyline
+gs clippath
+1845 1515 m 1905 1515 l 1905 1363 l 1875 1483 l 1845 1363 l cp
+eoclip
+n 1875 1200 m
+ 1875 1500 l gs col0 s gr gr
+
+% arrowhead
+n 1845 1363 m 1875 1483 l 1905 1363 l col0 s
+% Polyline
+gs clippath
+1845 2265 m 1905 2265 l 1905 2113 l 1875 2233 l 1845 2113 l cp
+eoclip
+n 1875 1950 m
+ 1875 2250 l gs col0 s gr gr
+
+% arrowhead
+n 1845 2113 m 1875 2233 l 1905 2113 l col0 s
+% Polyline
+gs clippath
+1845 3015 m 1905 3015 l 1905 2863 l 1875 2983 l 1845 2863 l cp
+eoclip
+n 1875 2700 m
+ 1875 3000 l gs col0 s gr gr
+
+% arrowhead
+n 1845 2863 m 1875 2983 l 1905 2863 l col0 s
+% Polyline
+gs clippath
+1335 3195 m 1335 3255 l 1487 3255 l 1367 3225 l 1487 3195 l cp
+eoclip
+n 1650 3225 m
+ 1350 3225 l gs col0 s gr gr
+
+% arrowhead
+n 1487 3195 m 1367 3225 l 1487 3255 l col0 s
+% Polyline
+gs clippath
+585 3195 m 585 3255 l 737 3255 l 617 3225 l 737 3195 l cp
+eoclip
+n 900 3225 m
+ 600 3225 l gs col0 s gr gr
+
+% arrowhead
+n 737 3195 m 617 3225 l 737 3255 l col0 s
+% Polyline
+gs clippath
+405 2685 m 345 2685 l 345 2837 l 375 2717 l 405 2837 l cp
+eoclip
+n 375 3000 m
+ 375 2700 l gs col0 s gr gr
+
+% arrowhead
+n 405 2837 m 375 2717 l 345 2837 l col0 s
+% Polyline
+gs clippath
+405 1935 m 345 1935 l 345 2087 l 375 1967 l 405 2087 l cp
+eoclip
+n 375 2250 m
+ 375 1950 l gs col0 s gr gr
+
+% arrowhead
+n 405 2087 m 375 1967 l 345 2087 l col0 s
+% Polyline
+gs clippath
+405 1185 m 345 1185 l 345 1337 l 375 1217 l 405 1337 l cp
+eoclip
+n 375 1500 m
+ 375 1200 l gs col0 s gr gr
+
+% arrowhead
+n 405 1337 m 375 1217 l 345 1337 l col0 s
+% Polyline
+gs clippath
+1845 765 m 1905 765 l 1905 613 l 1875 733 l 1845 613 l cp
+eoclip
+n 1800 375 m 1875 375 l
+ 1875 750 l gs col0 s gr gr
+
+% arrowhead
+n 1845 613 m 1875 733 l 1905 613 l col0 s
+/Times-Roman ff 180.00 scf sf
+150 225 m
+gs 1 -1 sc (struct timehands) col0 sh gr
+% Polyline
+n 255 750 m 150 750 150 1095 105 arcto 4 {pop} repeat
+ 150 1200 495 1200 105 arcto 4 {pop} repeat
+ 600 1200 600 855 105 arcto 4 {pop} repeat
+ 600 750 255 750 105 arcto 4 {pop} repeat
+ cp gs col0 s gr
+$F2psEnd
+rs
diff --git a/share/doc/papers/timecounter/fig5.eps b/share/doc/papers/timecounter/fig5.eps
new file mode 100644
index 000000000000..b6274c1f6d40
--- /dev/null
+++ b/share/doc/papers/timecounter/fig5.eps
@@ -0,0 +1,211 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: fig5.eps
+%%Creator: fig2dev Version 3.2 Patchlevel 3d
+%%CreationDate: $FreeBSD$
+%%For: phk@critter.freebsd.dk (Poul-Henning Kamp)
+%%BoundingBox: 0 0 140 225
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+newpath 0 225 moveto 0 0 lineto 140 0 lineto 140 225 lineto closepath clip newpath
+-7.7 234.7 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+
+$F2psBegin
+10 setmiterlimit
+ 0.06000 0.06000 sc
+%
+% Fig objects follow
+%
+/Times-Roman ff 180.00 scf sf
+1950 600 m
+gs 1 -1 sc (PPS#1) col0 sh gr
+% Polyline
+7.500 slw
+gs clippath
+915 2130 m 915 2070 l 763 2070 l 883 2100 l 763 2130 l cp
+eoclip
+n 600 2100 m
+ 900 2100 l gs col0 s gr gr
+
+% arrowhead
+n 763 2130 m 883 2100 l 763 2070 l col0 s
+% Polyline
+gs clippath
+1665 2130 m 1665 2070 l 1513 2070 l 1633 2100 l 1513 2130 l cp
+eoclip
+n 1350 2100 m
+ 1650 2100 l gs col0 s gr gr
+
+% arrowhead
+n 1513 2130 m 1633 2100 l 1513 2070 l col0 s
+% Polyline
+15.000 slw
+n 900 1050 m 1350 1050 l 1350 3000 l 900 3000 l
+ cp gs col0 s gr
+% Polyline
+n 1650 1050 m 2100 1050 l 2100 3000 l 1650 3000 l
+ cp gs col0 s gr
+% Polyline
+7.500 slw
+gs clippath
+345 3465 m 405 3465 l 405 3313 l 375 3433 l 345 3313 l cp
+eoclip
+n 375 3000 m
+ 375 3450 l gs col0 s gr gr
+
+% arrowhead
+n 345 3313 m 375 3433 l 405 3313 l col0 s
+% Polyline
+gs clippath
+1095 3465 m 1155 3465 l 1155 3313 l 1125 3433 l 1095 3313 l cp
+eoclip
+n 1125 3000 m
+ 1125 3450 l gs col0 s gr gr
+
+% arrowhead
+n 1095 3313 m 1125 3433 l 1155 3313 l col0 s
+% Polyline
+gs clippath
+1845 3465 m 1905 3465 l 1905 3313 l 1875 3433 l 1845 3313 l cp
+eoclip
+n 1875 3000 m
+ 1875 3450 l gs col0 s gr gr
+
+% arrowhead
+n 1845 3313 m 1875 3433 l 1905 3313 l col0 s
+% Polyline
+gs clippath
+2070 3915 m 2130 3915 l 2130 3763 l 2100 3883 l 2070 3763 l cp
+eoclip
+n 150 3450 m 2100 3450 l
+ 2100 3900 l gs col0 s gr gr
+
+% arrowhead
+n 2070 3763 m 2100 3883 l 2130 3763 l col0 s
+% Polyline
+gs clippath
+1845 1065 m 1905 1065 l 1905 913 l 1875 1033 l 1845 913 l cp
+eoclip
+n 1875 600 m
+ 1875 1050 l gs col0 s gr gr
+
+% arrowhead
+n 1845 913 m 1875 1033 l 1905 913 l col0 s
+% Polyline
+gs clippath
+1095 1065 m 1155 1065 l 1155 913 l 1125 1033 l 1095 913 l cp
+eoclip
+n 1125 450 m
+ 1125 1050 l gs col0 s gr gr
+
+% arrowhead
+n 1095 913 m 1125 1033 l 1155 913 l col0 s
+% Polyline
+gs clippath
+345 1065 m 405 1065 l 405 913 l 375 1033 l 345 913 l cp
+eoclip
+n 375 300 m
+ 375 1050 l gs col0 s gr gr
+
+% arrowhead
+n 345 913 m 375 1033 l 405 913 l col0 s
+/Times-Roman ff 180.00 scf sf
+450 2850 m
+gs 1 -1 sc 90.0 rot (26 bit binary counter.) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2250 2025 m
+gs 1 -1 sc (...) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1200 2850 m
+gs 1 -1 sc 90.0 rot (26 bit latch) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1950 2850 m
+gs 1 -1 sc 90.0 rot (26 bit latch) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+450 3675 m
+gs 1 -1 sc (PCI system bus) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+450 300 m
+gs 1 -1 sc (Clock) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1200 450 m
+gs 1 -1 sc (PPS#0) col0 sh gr
+% Polyline
+15.000 slw
+n 150 1050 m 600 1050 l 600 3000 l 150 3000 l
+ cp gs col0 s gr
+$F2psEnd
+rs
diff --git a/share/doc/papers/timecounter/gps.ps b/share/doc/papers/timecounter/gps.ps
new file mode 100644
index 000000000000..aaaae8173f7f
--- /dev/null
+++ b/share/doc/papers/timecounter/gps.ps
@@ -0,0 +1,1488 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: _.ps
+%%Creator: gnuplot 3.7 patchlevel 1
+%%CreationDate: $FreeBSD$
+%%DocumentFonts: (atend)
+%%BoundingBox: 50 50 266 201
+%%Orientation: Portrait
+%%EndComments
+/gnudict 256 dict def
+gnudict begin
+/Color false def
+/Solid false def
+/gnulinewidth 5.000 def
+/userlinewidth gnulinewidth def
+/vshift -46 def
+/dl {10 mul} def
+/hpt_ 31.5 def
+/vpt_ 31.5 def
+/hpt hpt_ def
+/vpt vpt_ def
+/M {moveto} bind def
+/L {lineto} bind def
+/R {rmoveto} bind def
+/V {rlineto} bind def
+/vpt2 vpt 2 mul def
+/hpt2 hpt 2 mul def
+/Lshow { currentpoint stroke M
+ 0 vshift R show } def
+/Rshow { currentpoint stroke M
+ dup stringwidth pop neg vshift R show } def
+/Cshow { currentpoint stroke M
+ dup stringwidth pop -2 div vshift R show } def
+/UP { dup vpt_ mul /vpt exch def hpt_ mul /hpt exch def
+ /hpt2 hpt 2 mul def /vpt2 vpt 2 mul def } def
+/DL { Color {setrgbcolor Solid {pop []} if 0 setdash }
+ {pop pop pop Solid {pop []} if 0 setdash} ifelse } def
+/BL { stroke userlinewidth 2 mul setlinewidth } def
+/AL { stroke userlinewidth 2 div setlinewidth } def
+/UL { dup gnulinewidth mul /userlinewidth exch def
+ 10 mul /udl exch def } def
+/PL { stroke userlinewidth setlinewidth } def
+/LTb { BL [] 0 0 0 DL } def
+/LTa { AL [1 udl mul 2 udl mul] 0 setdash 0 0 0 setrgbcolor } def
+/LT0 { PL [] 1 0 0 DL } def
+/LT1 { PL [4 dl 2 dl] 0 1 0 DL } def
+/LT2 { PL [2 dl 3 dl] 0 0 1 DL } def
+/LT3 { PL [1 dl 1.5 dl] 1 0 1 DL } def
+/LT4 { PL [5 dl 2 dl 1 dl 2 dl] 0 1 1 DL } def
+/LT5 { PL [4 dl 3 dl 1 dl 3 dl] 1 1 0 DL } def
+/LT6 { PL [2 dl 2 dl 2 dl 4 dl] 0 0 0 DL } def
+/LT7 { PL [2 dl 2 dl 2 dl 2 dl 2 dl 4 dl] 1 0.3 0 DL } def
+/LT8 { PL [2 dl 2 dl 2 dl 2 dl 2 dl 2 dl 2 dl 4 dl] 0.5 0.5 0.5 DL } def
+/Pnt { stroke [] 0 setdash
+ gsave 1 setlinecap M 0 0 V stroke grestore } def
+/Dia { stroke [] 0 setdash 2 copy vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath stroke
+ Pnt } def
+/Pls { stroke [] 0 setdash vpt sub M 0 vpt2 V
+ currentpoint stroke M
+ hpt neg vpt neg R hpt2 0 V stroke
+ } def
+/Box { stroke [] 0 setdash 2 copy exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath stroke
+ Pnt } def
+/Crs { stroke [] 0 setdash exch hpt sub exch vpt add M
+ hpt2 vpt2 neg V currentpoint stroke M
+ hpt2 neg 0 R hpt2 vpt2 V stroke } def
+/TriU { stroke [] 0 setdash 2 copy vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath stroke
+ Pnt } def
+/Star { 2 copy Pls Crs } def
+/BoxF { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath fill } def
+/TriUF { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath fill } def
+/TriD { stroke [] 0 setdash 2 copy vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath stroke
+ Pnt } def
+/TriDF { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath fill} def
+/DiaF { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath fill } def
+/Pent { stroke [] 0 setdash 2 copy gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath stroke grestore Pnt } def
+/PentF { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath fill grestore } def
+/Circle { stroke [] 0 setdash 2 copy
+ hpt 0 360 arc stroke Pnt } def
+/CircleF { stroke [] 0 setdash hpt 0 360 arc fill } def
+/C0 { BL [] 0 setdash 2 copy moveto vpt 90 450 arc } bind def
+/C1 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C2 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C3 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C4 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 180 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C5 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc
+ 2 copy moveto
+ 2 copy vpt 180 270 arc closepath fill
+ vpt 0 360 arc } bind def
+/C6 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C7 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C8 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 270 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C9 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 270 450 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C10 { BL [] 0 setdash 2 copy 2 copy moveto vpt 270 360 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 90 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C11 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 180 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 270 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C12 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 180 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C13 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 180 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C14 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 360 arc closepath fill
+ vpt 0 360 arc } bind def
+/C15 { BL [] 0 setdash 2 copy vpt 0 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/Rec { newpath 4 2 roll moveto 1 index 0 rlineto 0 exch rlineto
+ neg 0 rlineto closepath } bind def
+/Square { dup Rec } bind def
+/Bsquare { vpt sub exch vpt sub exch vpt2 Square } bind def
+/S0 { BL [] 0 setdash 2 copy moveto 0 vpt rlineto BL Bsquare } bind def
+/S1 { BL [] 0 setdash 2 copy vpt Square fill Bsquare } bind def
+/S2 { BL [] 0 setdash 2 copy exch vpt sub exch vpt Square fill Bsquare } bind def
+/S3 { BL [] 0 setdash 2 copy exch vpt sub exch vpt2 vpt Rec fill Bsquare } bind def
+/S4 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt Square fill Bsquare } bind def
+/S5 { BL [] 0 setdash 2 copy 2 copy vpt Square fill
+ exch vpt sub exch vpt sub vpt Square fill Bsquare } bind def
+/S6 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill Bsquare } bind def
+/S7 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill
+ 2 copy vpt Square fill
+ Bsquare } bind def
+/S8 { BL [] 0 setdash 2 copy vpt sub vpt Square fill Bsquare } bind def
+/S9 { BL [] 0 setdash 2 copy vpt sub vpt vpt2 Rec fill Bsquare } bind def
+/S10 { BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt Square fill
+ Bsquare } bind def
+/S11 { BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt2 vpt Rec fill
+ Bsquare } bind def
+/S12 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill Bsquare } bind def
+/S13 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+ 2 copy vpt Square fill Bsquare } bind def
+/S14 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+ 2 copy exch vpt sub exch vpt Square fill Bsquare } bind def
+/S15 { BL [] 0 setdash 2 copy Bsquare fill Bsquare } bind def
+/D0 { gsave translate 45 rotate 0 0 S0 stroke grestore } bind def
+/D1 { gsave translate 45 rotate 0 0 S1 stroke grestore } bind def
+/D2 { gsave translate 45 rotate 0 0 S2 stroke grestore } bind def
+/D3 { gsave translate 45 rotate 0 0 S3 stroke grestore } bind def
+/D4 { gsave translate 45 rotate 0 0 S4 stroke grestore } bind def
+/D5 { gsave translate 45 rotate 0 0 S5 stroke grestore } bind def
+/D6 { gsave translate 45 rotate 0 0 S6 stroke grestore } bind def
+/D7 { gsave translate 45 rotate 0 0 S7 stroke grestore } bind def
+/D8 { gsave translate 45 rotate 0 0 S8 stroke grestore } bind def
+/D9 { gsave translate 45 rotate 0 0 S9 stroke grestore } bind def
+/D10 { gsave translate 45 rotate 0 0 S10 stroke grestore } bind def
+/D11 { gsave translate 45 rotate 0 0 S11 stroke grestore } bind def
+/D12 { gsave translate 45 rotate 0 0 S12 stroke grestore } bind def
+/D13 { gsave translate 45 rotate 0 0 S13 stroke grestore } bind def
+/D14 { gsave translate 45 rotate 0 0 S14 stroke grestore } bind def
+/D15 { gsave translate 45 rotate 0 0 S15 stroke grestore } bind def
+/DiaE { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath stroke } def
+/BoxE { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath stroke } def
+/TriUE { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath stroke } def
+/TriDE { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath stroke } def
+/PentE { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath stroke grestore } def
+/CircE { stroke [] 0 setdash
+ hpt 0 360 arc stroke } def
+/Opaque { gsave closepath 1 setgray fill grestore 0 setgray closepath } def
+/DiaW { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V Opaque stroke } def
+/BoxW { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V Opaque stroke } def
+/TriUW { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V Opaque stroke } def
+/TriDW { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V Opaque stroke } def
+/PentW { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ Opaque stroke grestore } def
+/CircW { stroke [] 0 setdash
+ hpt 0 360 arc Opaque stroke } def
+/BoxFill { gsave Rec 1 setgray fill grestore } def
+end
+%%EndProlog
+gnudict begin
+gsave
+50 50 translate
+0.050 0.050 scale
+0 setgray
+newpath
+(Helvetica) findfont 140 scalefont setfont
+1.000 UL
+LTb
+1.000 UL
+LTa
+630 420 M
+3452 0 V
+1.000 UL
+LTb
+630 420 M
+63 0 V
+3389 0 R
+-63 0 V
+546 420 M
+(-20) Rshow
+1.000 UL
+LTa
+630 826 M
+3452 0 V
+1.000 UL
+LTb
+630 826 M
+63 0 V
+3389 0 R
+-63 0 V
+546 826 M
+(-15) Rshow
+1.000 UL
+LTa
+630 1232 M
+3452 0 V
+1.000 UL
+LTb
+630 1232 M
+63 0 V
+3389 0 R
+-63 0 V
+-3473 0 R
+(-10) Rshow
+1.000 UL
+LTa
+630 1638 M
+3452 0 V
+1.000 UL
+LTb
+630 1638 M
+63 0 V
+3389 0 R
+-63 0 V
+-3473 0 R
+(-5) Rshow
+1.000 UL
+LTa
+630 2044 M
+3452 0 V
+1.000 UL
+LTb
+630 2044 M
+63 0 V
+3389 0 R
+-63 0 V
+-3473 0 R
+(0) Rshow
+1.000 UL
+LTa
+630 2450 M
+3452 0 V
+1.000 UL
+LTb
+630 2450 M
+63 0 V
+3389 0 R
+-63 0 V
+-3473 0 R
+(5) Rshow
+1.000 UL
+LTa
+630 2856 M
+3452 0 V
+1.000 UL
+LTb
+630 2856 M
+63 0 V
+3389 0 R
+-63 0 V
+-3473 0 R
+(10) Rshow
+1.000 UL
+LTa
+630 420 M
+0 2436 V
+1.000 UL
+LTb
+630 420 M
+0 63 V
+0 2373 R
+0 -63 V
+630 280 M
+(0) Cshow
+1.000 UL
+LTa
+975 420 M
+0 2436 V
+1.000 UL
+LTb
+975 420 M
+0 63 V
+0 2373 R
+0 -63 V
+975 280 M
+(100) Cshow
+1.000 UL
+LTa
+1320 420 M
+0 2436 V
+1.000 UL
+LTb
+1320 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(200) Cshow
+1.000 UL
+LTa
+1666 420 M
+0 2436 V
+1.000 UL
+LTb
+1666 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(300) Cshow
+1.000 UL
+LTa
+2011 420 M
+0 2436 V
+1.000 UL
+LTb
+2011 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(400) Cshow
+1.000 UL
+LTa
+2356 420 M
+0 2436 V
+1.000 UL
+LTb
+2356 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(500) Cshow
+1.000 UL
+LTa
+2701 420 M
+0 2436 V
+1.000 UL
+LTb
+2701 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(600) Cshow
+1.000 UL
+LTa
+3046 420 M
+0 2436 V
+1.000 UL
+LTb
+3046 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(700) Cshow
+1.000 UL
+LTa
+3392 420 M
+0 2436 V
+1.000 UL
+LTb
+3392 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(800) Cshow
+1.000 UL
+LTa
+3737 420 M
+0 2373 V
+0 63 V
+1.000 UL
+LTb
+3737 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(900) Cshow
+1.000 UL
+LTa
+4082 420 M
+0 2436 V
+1.000 UL
+LTb
+4082 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(1000) Cshow
+1.000 UL
+LTb
+630 420 M
+3452 0 V
+0 2436 V
+-3452 0 V
+630 420 L
+140 1638 M
+currentpoint gsave translate 90 rotate 0 0 M
+(nanoseconds) Cshow
+grestore
+2356 70 M
+(seconds) Cshow
+1.000 UL
+LT0
+631 2125 M
+3 -81 V
+4 0 V
+3 -162 V
+4 0 V
+3 162 V
+3 0 V
+4 0 V
+3 -244 V
+4 0 V
+3 244 V
+4 -162 V
+3 0 V
+4 -163 V
+3 0 V
+4 163 V
+3 -163 V
+3 0 V
+4 731 V
+3 -812 V
+4 162 V
+3 650 V
+4 -812 V
+3 731 V
+4 81 V
+3 -731 V
+4 650 V
+3 -812 V
+3 731 V
+4 81 V
+3 -731 V
+4 650 V
+3 0 V
+4 -82 V
+3 82 V
+4 81 V
+3 -81 V
+4 0 V
+3 -163 V
+3 81 V
+4 82 V
+3 -163 V
+4 0 V
+3 -81 V
+4 81 V
+3 81 V
+4 -81 V
+3 0 V
+3 -81 V
+4 0 V
+3 81 V
+4 -81 V
+3 0 V
+4 -162 V
+3 162 V
+4 568 V
+3 -568 V
+4 0 V
+3 568 V
+3 -730 V
+4 568 V
+3 -568 V
+4 0 V
+3 649 V
+4 -731 V
+3 569 V
+4 162 V
+3 -812 V
+4 731 V
+3 81 V
+3 -243 V
+4 162 V
+3 -731 V
+4 569 V
+3 81 V
+4 -244 V
+3 163 V
+4 -731 V
+3 568 V
+3 0 V
+4 -243 V
+3 162 V
+4 0 V
+3 -244 V
+4 82 V
+3 -244 V
+4 162 V
+3 82 V
+4 -244 V
+3 81 V
+3 -243 V
+4 243 V
+3 81 V
+4 -243 V
+3 81 V
+4 -162 V
+3 162 V
+4 81 V
+3 -243 V
+4 81 V
+3 -163 V
+3 163 V
+4 81 V
+3 -162 V
+4 81 V
+3 -244 V
+4 163 V
+3 162 V
+4 -244 V
+3 82 V
+3 -163 V
+4 81 V
+3 163 V
+4 -244 V
+3 81 V
+4 -162 V
+3 81 V
+4 81 V
+3 -162 V
+4 0 V
+3 -81 V
+3 81 V
+4 81 V
+3 -81 V
+4 0 V
+3 650 V
+4 -569 V
+3 0 V
+4 -162 V
+3 162 V
+4 569 V
+3 -731 V
+3 162 V
+4 -162 V
+3 81 V
+4 812 V
+3 0 V
+4 -731 V
+3 731 V
+4 -893 V
+3 731 V
+4 0 V
+3 -650 V
+3 812 V
+4 0 V
+3 -244 V
+4 82 V
+3 162 V
+4 -244 V
+3 82 V
+4 -244 V
+3 81 V
+3 163 V
+4 -163 V
+3 81 V
+4 -162 V
+3 0 V
+4 244 V
+3 -244 V
+4 81 V
+3 -162 V
+4 81 V
+3 162 V
+3 -243 V
+4 162 V
+3 -162 V
+4 0 V
+3 162 V
+4 -162 V
+3 81 V
+4 650 V
+3 -812 V
+4 162 V
+3 650 V
+3 -731 V
+4 649 V
+3 -812 V
+4 731 V
+3 163 V
+4 -812 V
+3 649 V
+4 0 V
+3 -81 V
+4 162 V
+3 -812 V
+3 650 V
+4 81 V
+3 -243 V
+4 162 V
+3 -812 V
+4 650 V
+3 162 V
+4 -325 V
+3 81 V
+4 0 V
+3 -81 V
+3 81 V
+4 -243 V
+3 162 V
+4 0 V
+3 -81 V
+4 81 V
+3 -162 V
+4 162 V
+3 81 V
+3 -162 V
+4 81 V
+3 -243 V
+4 243 V
+3 81 V
+4 -243 V
+3 81 V
+4 -162 V
+3 162 V
+4 0 V
+3 -244 V
+3 82 V
+4 -82 V
+3 82 V
+4 81 V
+3 -163 V
+4 0 V
+3 -81 V
+4 81 V
+3 82 V
+4 -163 V
+3 0 V
+3 -162 V
+4 162 V
+3 0 V
+4 -81 V
+3 0 V
+4 650 V
+3 -569 V
+4 0 V
+3 -162 V
+3 81 V
+4 650 V
+3 -731 V
+4 162 V
+3 569 V
+4 -650 V
+3 650 V
+4 0 V
+3 -569 V
+3 569 V
+4 162 V
+3 -162 V
+4 0 V
+3 162 V
+4 0 V
+3 0 V
+4 -162 V
+3 162 V
+4 0 V
+3 0 V
+3 0 V
+4 -162 V
+3 162 V
+4 0 V
+3 0 V
+4 0 V
+3 -244 V
+4 244 V
+3 0 V
+4 -162 V
+3 162 V
+3 -162 V
+4 162 V
+3 -325 V
+4 325 V
+3 0 V
+4 -162 V
+3 162 V
+4 -244 V
+3 244 V
+4 0 V
+3 -162 V
+3 162 V
+4 -244 V
+3 244 V
+4 0 V
+3 -162 V
+4 162 V
+3 -325 V
+4 163 V
+3 162 V
+4 -244 V
+3 82 V
+3 568 V
+4 -650 V
+3 82 V
+4 649 V
+3 -731 V
+4 569 V
+3 162 V
+4 -649 V
+3 568 V
+3 -731 V
+4 569 V
+3 162 V
+4 -650 V
+3 569 V
+4 81 V
+3 -162 V
+4 81 V
+3 -650 V
+4 650 V
+3 81 V
+3 -162 V
+4 0 V
+3 243 V
+4 -243 V
+3 162 V
+4 -162 V
+3 0 V
+4 243 V
+3 -243 V
+4 81 V
+3 -81 V
+3 0 V
+4 162 V
+3 -162 V
+4 81 V
+3 -163 V
+4 0 V
+3 244 V
+4 -244 V
+3 82 V
+3 -82 V
+4 0 V
+3 163 V
+4 -244 V
+3 163 V
+4 -163 V
+3 0 V
+4 -81 V
+3 81 V
+4 81 V
+3 -81 V
+3 0 V
+4 -81 V
+3 0 V
+4 81 V
+3 -81 V
+4 -162 V
+3 -163 V
+4 81 V
+3 82 V
+4 -244 V
+3 0 V
+3 -162 V
+4 81 V
+3 81 V
+4 -81 V
+3 0 V
+4 649 V
+3 -649 V
+4 0 V
+3 -81 V
+3 0 V
+4 649 V
+3 -649 V
+4 0 V
+3 730 V
+4 -812 V
+3 650 V
+4 244 V
+3 -812 V
+4 649 V
+3 -812 V
+3 731 V
+4 81 V
+3 -731 V
+4 650 V
+3 81 V
+4 -243 V
+3 162 V
+4 0 V
+3 -162 V
+4 162 V
+3 -244 V
+3 244 V
+4 0 V
+3 -244 V
+4 82 V
+3 -244 V
+4 162 V
+3 82 V
+4 -163 V
+3 81 V
+4 -243 V
+3 243 V
+3 82 V
+4 -244 V
+3 81 V
+4 -162 V
+3 162 V
+4 81 V
+3 569 V
+currentpoint stroke M
+4 -731 V
+3 731 V
+3 81 V
+4 -731 V
+3 650 V
+4 -731 V
+3 650 V
+4 0 V
+3 -650 V
+4 568 V
+3 163 V
+4 -163 V
+3 0 V
+3 163 V
+4 -81 V
+3 -82 V
+4 -81 V
+3 81 V
+4 82 V
+3 -163 V
+4 0 V
+3 -81 V
+4 81 V
+3 81 V
+3 -162 V
+4 0 V
+3 0 V
+4 0 V
+3 81 V
+4 -81 V
+3 0 V
+4 -162 V
+3 162 V
+4 0 V
+3 -162 V
+3 0 V
+4 -163 V
+3 81 V
+4 82 V
+3 -82 V
+4 0 V
+3 650 V
+4 -731 V
+3 163 V
+3 649 V
+4 -812 V
+3 650 V
+4 81 V
+3 -650 V
+4 650 V
+3 -812 V
+4 650 V
+3 81 V
+4 -731 V
+3 650 V
+3 81 V
+4 -163 V
+3 82 V
+4 81 V
+3 -163 V
+4 82 V
+3 -163 V
+4 81 V
+3 -162 V
+4 81 V
+3 163 V
+3 -244 V
+4 81 V
+3 -81 V
+4 0 V
+3 81 V
+4 -81 V
+3 0 V
+4 -244 V
+3 244 V
+3 0 V
+4 -162 V
+3 0 V
+4 649 V
+3 -731 V
+4 82 V
+3 649 V
+4 -812 V
+3 650 V
+4 81 V
+3 -731 V
+3 650 V
+4 81 V
+3 -162 V
+4 81 V
+3 -731 V
+4 650 V
+3 0 V
+4 -163 V
+3 163 V
+4 81 V
+3 -163 V
+3 0 V
+4 -162 V
+3 81 V
+4 81 V
+3 -81 V
+4 0 V
+3 -81 V
+4 81 V
+3 0 V
+3 -81 V
+4 0 V
+3 -244 V
+4 244 V
+3 0 V
+4 -244 V
+3 0 V
+4 650 V
+3 -650 V
+4 0 V
+3 731 V
+3 -812 V
+4 650 V
+3 162 V
+4 -812 V
+3 650 V
+4 -812 V
+3 731 V
+4 162 V
+3 0 V
+4 -162 V
+3 0 V
+3 -82 V
+4 163 V
+3 0 V
+4 -163 V
+3 0 V
+4 -81 V
+3 163 V
+4 0 V
+3 -163 V
+4 0 V
+3 -81 V
+3 81 V
+4 81 V
+3 -162 V
+4 81 V
+3 -81 V
+4 0 V
+3 81 V
+4 -81 V
+3 0 V
+4 650 V
+3 0 V
+3 -650 V
+4 650 V
+3 -650 V
+4 568 V
+3 0 V
+4 -568 V
+3 650 V
+4 0 V
+3 -163 V
+3 0 V
+4 163 V
+3 -163 V
+4 0 V
+3 -81 V
+4 0 V
+3 162 V
+4 -162 V
+3 0 V
+4 -81 V
+3 0 V
+3 162 V
+4 -325 V
+3 82 V
+4 -163 V
+3 0 V
+4 163 V
+3 -244 V
+4 81 V
+3 -81 V
+4 0 V
+3 81 V
+3 569 V
+4 -650 V
+3 487 V
+4 81 V
+3 -568 V
+4 487 V
+3 -731 V
+4 650 V
+3 0 V
+3 -650 V
+4 569 V
+3 81 V
+4 -162 V
+3 0 V
+4 162 V
+3 -162 V
+4 0 V
+3 -82 V
+4 0 V
+3 163 V
+3 -244 V
+4 81 V
+3 -162 V
+4 0 V
+3 162 V
+4 -162 V
+3 0 V
+4 -162 V
+3 162 V
+4 81 V
+3 -243 V
+3 162 V
+4 487 V
+3 -731 V
+4 82 V
+3 730 V
+4 -730 V
+3 568 V
+4 81 V
+3 -731 V
+3 650 V
+4 81 V
+3 -162 V
+4 81 V
+3 81 V
+4 -162 V
+3 81 V
+4 -244 V
+3 82 V
+4 162 V
+3 -244 V
+3 82 V
+4 -244 V
+3 81 V
+4 163 V
+3 -163 V
+4 81 V
+3 -162 V
+4 0 V
+3 81 V
+4 -81 V
+3 0 V
+3 -244 V
+4 82 V
+3 162 V
+4 -162 V
+3 0 V
+4 730 V
+3 -730 V
+4 568 V
+3 162 V
+3 -812 V
+4 650 V
+3 162 V
+4 -243 V
+3 162 V
+4 -812 V
+3 650 V
+4 81 V
+3 -162 V
+4 81 V
+3 0 V
+3 -81 V
+4 81 V
+3 -244 V
+4 163 V
+3 81 V
+4 -163 V
+3 82 V
+4 -244 V
+3 162 V
+4 0 V
+3 -162 V
+3 0 V
+4 -244 V
+3 244 V
+4 0 V
+3 -325 V
+4 0 V
+3 -243 V
+4 162 V
+3 -81 V
+4 -81 V
+3 -82 V
+3 731 V
+4 -649 V
+3 0 V
+4 649 V
+3 -812 V
+4 731 V
+3 162 V
+4 -812 V
+3 731 V
+3 -812 V
+4 731 V
+3 162 V
+4 -812 V
+3 731 V
+4 0 V
+3 -81 V
+4 162 V
+3 -812 V
+4 650 V
+3 81 V
+3 -81 V
+4 81 V
+3 0 V
+4 -81 V
+3 81 V
+4 -243 V
+3 162 V
+4 162 V
+3 -162 V
+4 0 V
+3 -244 V
+3 244 V
+4 0 V
+3 -162 V
+4 162 V
+3 -244 V
+4 82 V
+3 162 V
+4 -325 V
+3 81 V
+3 -162 V
+4 81 V
+3 163 V
+4 -244 V
+3 162 V
+4 -162 V
+3 0 V
+4 244 V
+3 -244 V
+4 162 V
+3 -162 V
+3 0 V
+4 244 V
+3 -244 V
+4 81 V
+3 -81 V
+4 0 V
+3 162 V
+4 -81 V
+3 0 V
+4 -81 V
+3 81 V
+3 81 V
+4 -81 V
+3 0 V
+4 -81 V
+3 81 V
+4 163 V
+3 -163 V
+4 81 V
+3 -162 V
+3 81 V
+4 650 V
+3 -650 V
+4 81 V
+3 650 V
+4 -731 V
+3 731 V
+4 -731 V
+3 81 V
+4 569 V
+3 -650 V
+3 650 V
+4 -731 V
+3 162 V
+4 650 V
+3 -731 V
+4 650 V
+3 81 V
+4 -731 V
+3 731 V
+4 -812 V
+3 650 V
+3 243 V
+4 -812 V
+3 650 V
+4 0 V
+3 -81 V
+4 81 V
+3 81 V
+4 -162 V
+3 81 V
+4 -244 V
+3 163 V
+3 0 V
+4 -82 V
+3 82 V
+4 -244 V
+3 81 V
+4 0 V
+3 -81 V
+4 -162 V
+3 -244 V
+3 81 V
+4 -81 V
+currentpoint stroke M
+3 -406 V
+4 -325 V
+3 0 V
+4 325 V
+3 0 V
+4 -81 V
+3 0 V
+4 -81 V
+3 81 V
+3 0 V
+4 -163 V
+3 -81 V
+4 -162 V
+3 162 V
+4 0 V
+3 -81 V
+4 81 V
+3 -162 V
+4 162 V
+3 0 V
+3 -162 V
+4 81 V
+3 -162 V
+4 162 V
+3 81 V
+4 -243 V
+3 162 V
+4 -162 V
+3 162 V
+3 81 V
+4 -162 V
+3 81 V
+4 812 V
+3 -812 V
+4 244 V
+3 -163 V
+4 0 V
+3 731 V
+4 -731 V
+3 81 V
+3 731 V
+4 -731 V
+3 731 V
+4 0 V
+3 -649 V
+4 649 V
+3 -731 V
+4 731 V
+3 0 V
+4 -649 V
+3 649 V
+3 81 V
+4 -162 V
+3 162 V
+4 -730 V
+3 730 V
+4 82 V
+3 -163 V
+4 81 V
+3 -162 V
+3 162 V
+4 82 V
+3 -163 V
+4 81 V
+3 -162 V
+4 81 V
+3 81 V
+4 -81 V
+3 0 V
+4 -81 V
+3 162 V
+3 0 V
+4 -81 V
+3 0 V
+4 -162 V
+3 243 V
+4 0 V
+3 -81 V
+4 81 V
+3 -162 V
+4 162 V
+3 82 V
+3 -244 V
+4 81 V
+3 569 V
+4 -650 V
+3 81 V
+4 569 V
+3 -731 V
+4 731 V
+3 81 V
+4 -731 V
+3 650 V
+3 81 V
+4 -81 V
+3 0 V
+4 -650 V
+3 731 V
+4 0 V
+3 -163 V
+4 82 V
+3 162 V
+3 -81 V
+4 0 V
+3 -163 V
+4 82 V
+3 81 V
+4 -81 V
+3 0 V
+4 -163 V
+3 81 V
+4 82 V
+3 -163 V
+3 163 V
+4 -244 V
+3 81 V
+4 81 V
+3 -81 V
+4 81 V
+3 -162 V
+4 0 V
+3 81 V
+4 -81 V
+3 81 V
+3 650 V
+4 -731 V
+3 81 V
+4 -81 V
+3 0 V
+4 650 V
+3 -650 V
+4 0 V
+3 731 V
+4 -731 V
+3 568 V
+3 82 V
+4 -163 V
+3 163 V
+4 81 V
+3 -244 V
+4 81 V
+3 -162 V
+4 81 V
+3 81 V
+3 -243 V
+4 81 V
+3 -162 V
+4 81 V
+3 81 V
+4 -244 V
+3 82 V
+4 -163 V
+3 81 V
+4 82 V
+3 -244 V
+3 81 V
+4 -243 V
+3 243 V
+4 81 V
+3 -162 V
+4 0 V
+3 568 V
+4 -568 V
+3 0 V
+4 568 V
+3 -730 V
+3 649 V
+4 81 V
+3 -812 V
+4 650 V
+3 -731 V
+4 569 V
+3 162 V
+4 0 V
+3 -162 V
+3 81 V
+4 -244 V
+3 163 V
+4 0 V
+3 -82 V
+4 82 V
+3 -244 V
+4 162 V
+3 82 V
+4 -244 V
+3 81 V
+3 -81 V
+4 0 V
+3 81 V
+4 -81 V
+3 0 V
+4 568 V
+3 -568 V
+4 0 V
+3 487 V
+4 81 V
+3 -243 V
+3 162 V
+4 -731 V
+stroke
+grestore
+end
+showpage
+%%Trailer
+%%DocumentFonts: Helvetica
diff --git a/share/doc/papers/timecounter/intr.ps b/share/doc/papers/timecounter/intr.ps
new file mode 100644
index 000000000000..a6bb7ce078a2
--- /dev/null
+++ b/share/doc/papers/timecounter/intr.ps
@@ -0,0 +1,1501 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: _.ps
+%%Creator: gnuplot 3.7 patchlevel 1
+%%CreationDate: $FreeBSD$
+%%DocumentFonts: (atend)
+%%BoundingBox: 50 50 266 201
+%%Orientation: Portrait
+%%EndComments
+/gnudict 256 dict def
+gnudict begin
+/Color false def
+/Solid false def
+/gnulinewidth 5.000 def
+/userlinewidth gnulinewidth def
+/vshift -46 def
+/dl {10 mul} def
+/hpt_ 31.5 def
+/vpt_ 31.5 def
+/hpt hpt_ def
+/vpt vpt_ def
+/M {moveto} bind def
+/L {lineto} bind def
+/R {rmoveto} bind def
+/V {rlineto} bind def
+/vpt2 vpt 2 mul def
+/hpt2 hpt 2 mul def
+/Lshow { currentpoint stroke M
+ 0 vshift R show } def
+/Rshow { currentpoint stroke M
+ dup stringwidth pop neg vshift R show } def
+/Cshow { currentpoint stroke M
+ dup stringwidth pop -2 div vshift R show } def
+/UP { dup vpt_ mul /vpt exch def hpt_ mul /hpt exch def
+ /hpt2 hpt 2 mul def /vpt2 vpt 2 mul def } def
+/DL { Color {setrgbcolor Solid {pop []} if 0 setdash }
+ {pop pop pop Solid {pop []} if 0 setdash} ifelse } def
+/BL { stroke userlinewidth 2 mul setlinewidth } def
+/AL { stroke userlinewidth 2 div setlinewidth } def
+/UL { dup gnulinewidth mul /userlinewidth exch def
+ 10 mul /udl exch def } def
+/PL { stroke userlinewidth setlinewidth } def
+/LTb { BL [] 0 0 0 DL } def
+/LTa { AL [1 udl mul 2 udl mul] 0 setdash 0 0 0 setrgbcolor } def
+/LT0 { PL [] 1 0 0 DL } def
+/LT1 { PL [4 dl 2 dl] 0 1 0 DL } def
+/LT2 { PL [2 dl 3 dl] 0 0 1 DL } def
+/LT3 { PL [1 dl 1.5 dl] 1 0 1 DL } def
+/LT4 { PL [5 dl 2 dl 1 dl 2 dl] 0 1 1 DL } def
+/LT5 { PL [4 dl 3 dl 1 dl 3 dl] 1 1 0 DL } def
+/LT6 { PL [2 dl 2 dl 2 dl 4 dl] 0 0 0 DL } def
+/LT7 { PL [2 dl 2 dl 2 dl 2 dl 2 dl 4 dl] 1 0.3 0 DL } def
+/LT8 { PL [2 dl 2 dl 2 dl 2 dl 2 dl 2 dl 2 dl 4 dl] 0.5 0.5 0.5 DL } def
+/Pnt { stroke [] 0 setdash
+ gsave 1 setlinecap M 0 0 V stroke grestore } def
+/Dia { stroke [] 0 setdash 2 copy vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath stroke
+ Pnt } def
+/Pls { stroke [] 0 setdash vpt sub M 0 vpt2 V
+ currentpoint stroke M
+ hpt neg vpt neg R hpt2 0 V stroke
+ } def
+/Box { stroke [] 0 setdash 2 copy exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath stroke
+ Pnt } def
+/Crs { stroke [] 0 setdash exch hpt sub exch vpt add M
+ hpt2 vpt2 neg V currentpoint stroke M
+ hpt2 neg 0 R hpt2 vpt2 V stroke } def
+/TriU { stroke [] 0 setdash 2 copy vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath stroke
+ Pnt } def
+/Star { 2 copy Pls Crs } def
+/BoxF { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath fill } def
+/TriUF { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath fill } def
+/TriD { stroke [] 0 setdash 2 copy vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath stroke
+ Pnt } def
+/TriDF { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath fill} def
+/DiaF { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath fill } def
+/Pent { stroke [] 0 setdash 2 copy gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath stroke grestore Pnt } def
+/PentF { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath fill grestore } def
+/Circle { stroke [] 0 setdash 2 copy
+ hpt 0 360 arc stroke Pnt } def
+/CircleF { stroke [] 0 setdash hpt 0 360 arc fill } def
+/C0 { BL [] 0 setdash 2 copy moveto vpt 90 450 arc } bind def
+/C1 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C2 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C3 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C4 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 180 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C5 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc
+ 2 copy moveto
+ 2 copy vpt 180 270 arc closepath fill
+ vpt 0 360 arc } bind def
+/C6 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C7 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C8 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 270 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C9 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 270 450 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C10 { BL [] 0 setdash 2 copy 2 copy moveto vpt 270 360 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 90 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C11 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 180 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 270 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C12 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 180 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C13 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 180 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C14 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 360 arc closepath fill
+ vpt 0 360 arc } bind def
+/C15 { BL [] 0 setdash 2 copy vpt 0 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/Rec { newpath 4 2 roll moveto 1 index 0 rlineto 0 exch rlineto
+ neg 0 rlineto closepath } bind def
+/Square { dup Rec } bind def
+/Bsquare { vpt sub exch vpt sub exch vpt2 Square } bind def
+/S0 { BL [] 0 setdash 2 copy moveto 0 vpt rlineto BL Bsquare } bind def
+/S1 { BL [] 0 setdash 2 copy vpt Square fill Bsquare } bind def
+/S2 { BL [] 0 setdash 2 copy exch vpt sub exch vpt Square fill Bsquare } bind def
+/S3 { BL [] 0 setdash 2 copy exch vpt sub exch vpt2 vpt Rec fill Bsquare } bind def
+/S4 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt Square fill Bsquare } bind def
+/S5 { BL [] 0 setdash 2 copy 2 copy vpt Square fill
+ exch vpt sub exch vpt sub vpt Square fill Bsquare } bind def
+/S6 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill Bsquare } bind def
+/S7 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill
+ 2 copy vpt Square fill
+ Bsquare } bind def
+/S8 { BL [] 0 setdash 2 copy vpt sub vpt Square fill Bsquare } bind def
+/S9 { BL [] 0 setdash 2 copy vpt sub vpt vpt2 Rec fill Bsquare } bind def
+/S10 { BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt Square fill
+ Bsquare } bind def
+/S11 { BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt2 vpt Rec fill
+ Bsquare } bind def
+/S12 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill Bsquare } bind def
+/S13 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+ 2 copy vpt Square fill Bsquare } bind def
+/S14 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+ 2 copy exch vpt sub exch vpt Square fill Bsquare } bind def
+/S15 { BL [] 0 setdash 2 copy Bsquare fill Bsquare } bind def
+/D0 { gsave translate 45 rotate 0 0 S0 stroke grestore } bind def
+/D1 { gsave translate 45 rotate 0 0 S1 stroke grestore } bind def
+/D2 { gsave translate 45 rotate 0 0 S2 stroke grestore } bind def
+/D3 { gsave translate 45 rotate 0 0 S3 stroke grestore } bind def
+/D4 { gsave translate 45 rotate 0 0 S4 stroke grestore } bind def
+/D5 { gsave translate 45 rotate 0 0 S5 stroke grestore } bind def
+/D6 { gsave translate 45 rotate 0 0 S6 stroke grestore } bind def
+/D7 { gsave translate 45 rotate 0 0 S7 stroke grestore } bind def
+/D8 { gsave translate 45 rotate 0 0 S8 stroke grestore } bind def
+/D9 { gsave translate 45 rotate 0 0 S9 stroke grestore } bind def
+/D10 { gsave translate 45 rotate 0 0 S10 stroke grestore } bind def
+/D11 { gsave translate 45 rotate 0 0 S11 stroke grestore } bind def
+/D12 { gsave translate 45 rotate 0 0 S12 stroke grestore } bind def
+/D13 { gsave translate 45 rotate 0 0 S13 stroke grestore } bind def
+/D14 { gsave translate 45 rotate 0 0 S14 stroke grestore } bind def
+/D15 { gsave translate 45 rotate 0 0 S15 stroke grestore } bind def
+/DiaE { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath stroke } def
+/BoxE { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath stroke } def
+/TriUE { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath stroke } def
+/TriDE { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath stroke } def
+/PentE { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath stroke grestore } def
+/CircE { stroke [] 0 setdash
+ hpt 0 360 arc stroke } def
+/Opaque { gsave closepath 1 setgray fill grestore 0 setgray closepath } def
+/DiaW { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V Opaque stroke } def
+/BoxW { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V Opaque stroke } def
+/TriUW { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V Opaque stroke } def
+/TriDW { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V Opaque stroke } def
+/PentW { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ Opaque stroke grestore } def
+/CircW { stroke [] 0 setdash
+ hpt 0 360 arc Opaque stroke } def
+/BoxFill { gsave Rec 1 setgray fill grestore } def
+end
+%%EndProlog
+gnudict begin
+gsave
+50 50 translate
+0.050 0.050 scale
+0 setgray
+newpath
+(Helvetica) findfont 140 scalefont setfont
+1.000 UL
+LTb
+1.000 UL
+LTa
+882 420 M
+3200 0 V
+1.000 UL
+LTb
+882 420 M
+63 0 V
+3137 0 R
+-63 0 V
+798 420 M
+(0) Rshow
+1.000 UL
+LTa
+882 768 M
+3200 0 V
+1.000 UL
+LTb
+882 768 M
+63 0 V
+3137 0 R
+-63 0 V
+798 768 M
+(20000) Rshow
+1.000 UL
+LTa
+882 1116 M
+3200 0 V
+1.000 UL
+LTb
+882 1116 M
+63 0 V
+3137 0 R
+-63 0 V
+-3221 0 R
+(40000) Rshow
+1.000 UL
+LTa
+882 1464 M
+3200 0 V
+1.000 UL
+LTb
+882 1464 M
+63 0 V
+3137 0 R
+-63 0 V
+-3221 0 R
+(60000) Rshow
+1.000 UL
+LTa
+882 1812 M
+3200 0 V
+1.000 UL
+LTb
+882 1812 M
+63 0 V
+3137 0 R
+-63 0 V
+-3221 0 R
+(80000) Rshow
+1.000 UL
+LTa
+882 2160 M
+3200 0 V
+1.000 UL
+LTb
+882 2160 M
+63 0 V
+3137 0 R
+-63 0 V
+-3221 0 R
+(100000) Rshow
+1.000 UL
+LTa
+882 2508 M
+3200 0 V
+1.000 UL
+LTb
+882 2508 M
+63 0 V
+3137 0 R
+-63 0 V
+-3221 0 R
+(120000) Rshow
+1.000 UL
+LTa
+882 2856 M
+3200 0 V
+1.000 UL
+LTb
+882 2856 M
+63 0 V
+3137 0 R
+-63 0 V
+-3221 0 R
+(140000) Rshow
+1.000 UL
+LTa
+882 420 M
+0 2436 V
+1.000 UL
+LTb
+882 420 M
+0 63 V
+0 2373 R
+0 -63 V
+882 280 M
+(0) Cshow
+1.000 UL
+LTa
+1202 420 M
+0 2436 V
+1.000 UL
+LTb
+1202 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(100) Cshow
+1.000 UL
+LTa
+1522 420 M
+0 2436 V
+1.000 UL
+LTb
+1522 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(200) Cshow
+1.000 UL
+LTa
+1842 420 M
+0 2436 V
+1.000 UL
+LTb
+1842 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(300) Cshow
+1.000 UL
+LTa
+2162 420 M
+0 2436 V
+1.000 UL
+LTb
+2162 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(400) Cshow
+1.000 UL
+LTa
+2482 420 M
+0 2436 V
+1.000 UL
+LTb
+2482 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(500) Cshow
+1.000 UL
+LTa
+2802 420 M
+0 2436 V
+1.000 UL
+LTb
+2802 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(600) Cshow
+1.000 UL
+LTa
+3122 420 M
+0 2436 V
+1.000 UL
+LTb
+3122 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(700) Cshow
+1.000 UL
+LTa
+3442 420 M
+0 2373 V
+0 63 V
+1.000 UL
+LTb
+3442 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(800) Cshow
+1.000 UL
+LTa
+3762 420 M
+0 2373 V
+0 63 V
+1.000 UL
+LTb
+3762 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(900) Cshow
+1.000 UL
+LTa
+4082 420 M
+0 2436 V
+1.000 UL
+LTb
+4082 420 M
+0 63 V
+0 2373 R
+0 -63 V
+0 -2513 R
+(1000) Cshow
+1.000 UL
+LTb
+882 420 M
+3200 0 V
+0 2436 V
+-3200 0 V
+882 420 L
+140 1638 M
+currentpoint gsave translate 90 rotate 0 0 M
+(nanoseconds) Cshow
+grestore
+2482 70 M
+(seconds) Cshow
+1.000 UL
+LT0
+883 628 M
+3 589 V
+3 -538 V
+3 -63 V
+4 51 V
+3 -42 V
+3 458 V
+3 -424 V
+3 19 V
+4 -31 V
+3 30 V
+3 10 V
+3 65 V
+3 -89 V
+4 -39 V
+3 33 V
+3 13 V
+3 -10 V
+3 23 V
+3 -18 V
+4 25 V
+3 -17 V
+3 232 V
+3 -248 V
+4 19 V
+3 -35 V
+3 8 V
+3 -46 V
+3 9 V
+4 19 V
+3 -17 V
+3 9 V
+3 243 V
+3 -226 V
+4 -41 V
+3 38 V
+3 0 V
+3 5 V
+3 -40 V
+4 571 V
+3 823 V
+3 -1311 V
+3 -44 V
+3 7 V
+4 4 V
+3 9 V
+3 11 V
+3 13 V
+3 -81 V
+3 54 V
+4 -27 V
+3 30 V
+3 -8 V
+3 -21 V
+4 -2 V
+3 19 V
+3 -14 V
+3 13 V
+3 -25 V
+4 27 V
+3 15 V
+3 1888 V
+3 -1558 V
+3 -413 V
+4 38 V
+3 2 V
+3 -6 V
+3 15 V
+3 -42 V
+4 24 V
+3 -27 V
+3 19 V
+3 -8 V
+3 1236 V
+3 -1232 V
+4 3 V
+3 -15 V
+3 9 V
+3 10 V
+3 -27 V
+4 -31 V
+3 40 V
+3 17 V
+3 -20 V
+3 2 V
+4 3 V
+3 15 V
+3 -27 V
+3 15 V
+4 -37 V
+3 42 V
+3 1 V
+3 -10 V
+3 -10 V
+4 -22 V
+3 11 V
+3 34 V
+3 -33 V
+3 -6 V
+4 3 V
+3 13 V
+3 17 V
+3 -30 V
+3 13 V
+4 15 V
+3 -26 V
+3 341 V
+3 -321 V
+3 26 V
+4 -25 V
+3 39 V
+3 -26 V
+3 -10 V
+3 -23 V
+4 14 V
+3 24 V
+3 -14 V
+3 3 V
+3 -32 V
+3 41 V
+4 -24 V
+3 18 V
+3 -28 V
+3 38 V
+4 -24 V
+3 28 V
+3 -30 V
+3 5 V
+3 -7 V
+4 -7 V
+3 9 V
+3 37 V
+3 -43 V
+3 19 V
+4 -9 V
+3 -16 V
+3 39 V
+3 -22 V
+3 -29 V
+4 126 V
+3 1183 V
+3 -1283 V
+3 -30 V
+3 -6 V
+4 20 V
+3 -18 V
+3 28 V
+3 -15 V
+3 3 V
+3 34 V
+4 -27 V
+3 18 V
+3 -48 V
+3 55 V
+3 -23 V
+4 14 V
+3 10 V
+3 -21 V
+3 22 V
+4 -23 V
+3 12 V
+3 -15 V
+3 25 V
+3 -49 V
+4 54 V
+3 -30 V
+3 27 V
+3 -18 V
+3 10 V
+4 -24 V
+3 58 V
+3 -42 V
+3 12 V
+3 784 V
+4 -765 V
+3 -15 V
+3 56 V
+3 -64 V
+3 2 V
+4 39 V
+3 -32 V
+3 -11 V
+3 26 V
+3 -15 V
+4 7 V
+3 -30 V
+3 46 V
+3 -7 V
+3 16 V
+4 -39 V
+3 -9 V
+3 -7 V
+3 22 V
+3 -32 V
+4 16 V
+3 0 V
+3 1 V
+3 -13 V
+3 14 V
+4 10 V
+3 -39 V
+3 10 V
+3 -7 V
+3 29 V
+4 -28 V
+3 35 V
+3 686 V
+3 -531 V
+3 -97 V
+4 26 V
+3 -31 V
+3 -58 V
+3 45 V
+3 27 V
+3 -38 V
+4 30 V
+3 -80 V
+3 83 V
+3 -7 V
+3 20 V
+4 -26 V
+3 108 V
+3 -101 V
+3 27 V
+4 -49 V
+3 41 V
+3 -24 V
+3 4 V
+3 7 V
+4 -3 V
+3 -7 V
+3 26 V
+3 -9 V
+3 33 V
+4 132 V
+3 204 V
+3 -416 V
+3 26 V
+3 -37 V
+4 23 V
+3 1264 V
+3 -1299 V
+3 7 V
+3 -11 V
+3 21 V
+4 -40 V
+3 41 V
+3 -28 V
+3 28 V
+3 -24 V
+4 18 V
+3 -12 V
+3 6 V
+3 -3 V
+4 5 V
+3 -22 V
+3 24 V
+3 3 V
+3 10 V
+4 -26 V
+3 26 V
+3 -16 V
+3 12 V
+3 -16 V
+4 8 V
+3 -30 V
+3 3 V
+3 -1 V
+3 28 V
+4 0 V
+3 -1 V
+3 -3 V
+3 -28 V
+3 341 V
+3 -287 V
+4 -42 V
+3 38 V
+3 -22 V
+3 20 V
+3 -19 V
+4 39 V
+3 -24 V
+3 16 V
+3 -9 V
+3 20 V
+4 -18 V
+3 -21 V
+3 14 V
+3 -37 V
+3 47 V
+4 -21 V
+3 20 V
+3 -36 V
+3 38 V
+4 8 V
+3 -20 V
+3 -6 V
+3 5 V
+3 -21 V
+4 30 V
+3 -29 V
+3 5 V
+3 -8 V
+3 15 V
+4 -18 V
+3 17 V
+3 138 V
+3 949 V
+3 -1087 V
+4 -14 V
+3 -4 V
+3 17 V
+3 -16 V
+3 5 V
+3 -16 V
+4 15 V
+3 -7 V
+3 23 V
+3 -37 V
+4 43 V
+3 -14 V
+3 -2 V
+3 -3 V
+3 36 V
+4 -66 V
+3 41 V
+3 8 V
+3 -17 V
+3 16 V
+4 2 V
+3 9 V
+3 -34 V
+3 50 V
+3 -48 V
+4 18 V
+3 -10 V
+3 6 V
+3 -2 V
+3 12 V
+4 -23 V
+3 782 V
+3 -758 V
+3 50 V
+3 -31 V
+4 27 V
+3 -4 V
+3 29 V
+3 -32 V
+3 -2 V
+4 0 V
+3 2 V
+3 4 V
+3 -7 V
+3 -48 V
+4 15 V
+3 -15 V
+3 6 V
+3 -52 V
+3 71 V
+4 -15 V
+3 12 V
+3 -4 V
+3 30 V
+3 -28 V
+4 19 V
+3 -21 V
+3 15 V
+3 -17 V
+3 11 V
+4 1 V
+3 -21 V
+3 34 V
+3 -27 V
+3 140 V
+4 1432 V
+3 -1456 V
+3 -52 V
+3 59 V
+3 -36 V
+3 15 V
+4 -9 V
+3 15 V
+3 -17 V
+3 4 V
+3 -2 V
+4 4 V
+3 -4 V
+3 -2 V
+3 -29 V
+3 31 V
+4 2 V
+3 -2 V
+3 21 V
+3 -11 V
+4 -6 V
+3 16 V
+3 13 V
+3 -34 V
+3 39 V
+4 -13 V
+currentpoint stroke M
+3 -21 V
+3 54 V
+3 -33 V
+3 24 V
+4 -29 V
+3 64 V
+3 447 V
+3 757 V
+3 -1299 V
+4 549 V
+3 -428 V
+3 -128 V
+3 0 V
+3 14 V
+3 -40 V
+4 1 V
+3 23 V
+3 -41 V
+3 50 V
+4 -21 V
+3 20 V
+3 -15 V
+3 -1 V
+3 -6 V
+4 -1 V
+3 -19 V
+3 26 V
+3 -9 V
+3 21 V
+4 -28 V
+3 29 V
+3 1 V
+3 -8 V
+3 1 V
+4 0 V
+3 -7 V
+3 12 V
+3 -21 V
+3 19 V
+3 -15 V
+4 256 V
+3 -228 V
+3 2 V
+3 -3 V
+3 2 V
+4 -25 V
+3 38 V
+3 -3 V
+3 21 V
+3 -28 V
+4 42 V
+3 -24 V
+3 -10 V
+3 -10 V
+3 -7 V
+4 19 V
+3 -14 V
+3 7 V
+3 -29 V
+4 -3 V
+3 -4 V
+3 28 V
+3 -18 V
+3 10 V
+4 3 V
+3 23 V
+3 -51 V
+3 15 V
+3 -29 V
+4 36 V
+3 -19 V
+3 18 V
+3 4 V
+3 220 V
+4 835 V
+3 -1079 V
+3 -18 V
+3 7 V
+3 17 V
+3 -11 V
+4 3 V
+3 8 V
+3 -11 V
+3 24 V
+3 -9 V
+4 -1 V
+3 4 V
+3 5 V
+3 -15 V
+4 6 V
+3 8 V
+3 26 V
+3 -26 V
+3 0 V
+4 5 V
+3 27 V
+3 -24 V
+3 -2 V
+3 3 V
+4 -2 V
+3 22 V
+3 -38 V
+3 28 V
+3 -36 V
+4 29 V
+3 -15 V
+3 28 V
+3 601 V
+3 -596 V
+4 -36 V
+3 6 V
+3 27 V
+3 -1 V
+3 -41 V
+3 56 V
+4 -67 V
+3 16 V
+3 -30 V
+3 -7 V
+3 2108 V
+4 -2072 V
+3 -28 V
+3 10 V
+3 20 V
+4 1 V
+3 -125 V
+3 130 V
+3 -46 V
+3 5 V
+4 33 V
+3 8 V
+3 -21 V
+3 -9 V
+3 56 V
+4 -5 V
+3 -38 V
+3 36 V
+3 -49 V
+3 31 V
+4 4 V
+3 640 V
+3 564 V
+3 -1162 V
+3 46 V
+3 -53 V
+4 40 V
+3 -32 V
+3 16 V
+3 2 V
+3 7 V
+4 17 V
+3 -26 V
+3 4 V
+3 -1 V
+3 -19 V
+4 18 V
+3 -13 V
+3 35 V
+3 -15 V
+4 -19 V
+3 3 V
+3 25 V
+3 -34 V
+3 37 V
+4 -28 V
+3 31 V
+3 -18 V
+3 31 V
+3 -56 V
+4 -4 V
+3 -24 V
+3 -30 V
+3 20 V
+3 33 V
+4 1300 V
+3 -1283 V
+3 1 V
+3 6 V
+3 -18 V
+3 20 V
+4 -21 V
+3 -10 V
+3 35 V
+3 -20 V
+3 498 V
+4 -391 V
+3 -145 V
+3 3 V
+3 8 V
+3 -58 V
+4 50 V
+3 -12 V
+3 6 V
+3 -1 V
+4 -9 V
+3 -1 V
+3 5 V
+3 10 V
+3 14 V
+4 -23 V
+3 20 V
+3 -2 V
+3 -9 V
+3 4 V
+4 -1 V
+3 -7 V
+3 2 V
+3 246 V
+3 -249 V
+4 10 V
+3 9 V
+3 -28 V
+3 47 V
+3 -16 V
+3 -9 V
+4 3 V
+3 22 V
+3 -26 V
+3 21 V
+3 -5 V
+4 33 V
+3 -50 V
+3 -21 V
+3 40 V
+4 -18 V
+3 14 V
+3 -29 V
+3 29 V
+3 -24 V
+4 29 V
+3 -38 V
+3 41 V
+3 -24 V
+3 41 V
+4 -49 V
+3 19 V
+3 -14 V
+3 10 V
+3 -2 V
+4 9 V
+3 192 V
+3 903 V
+3 -1091 V
+3 8 V
+4 -21 V
+3 16 V
+3 -13 V
+3 12 V
+3 -9 V
+3 18 V
+4 -9 V
+3 -7 V
+3 4 V
+3 12 V
+4 -5 V
+3 0 V
+3 -5 V
+3 14 V
+3 -11 V
+4 26 V
+3 -34 V
+3 -53 V
+3 66 V
+3 35 V
+4 -39 V
+3 16 V
+3 4 V
+3 -43 V
+3 52 V
+4 -36 V
+3 22 V
+3 -26 V
+3 28 V
+3 -15 V
+4 199 V
+3 -180 V
+3 32 V
+3 -19 V
+3 -10 V
+4 -11 V
+3 -39 V
+3 48 V
+3 -26 V
+3 20 V
+3 24 V
+4 -43 V
+3 26 V
+3 -36 V
+3 22 V
+3 3 V
+4 -7 V
+3 -12 V
+3 14 V
+3 -24 V
+4 16 V
+3 -6 V
+3 4 V
+3 13 V
+3 -26 V
+4 29 V
+3 3 V
+3 9 V
+3 -16 V
+3 4 V
+4 22 V
+3 -45 V
+3 45 V
+3 -31 V
+3 1300 V
+4 -1245 V
+3 -27 V
+3 29 V
+3 -29 V
+3 20 V
+4 -20 V
+3 37 V
+3 -32 V
+3 42 V
+3 -40 V
+4 23 V
+3 -5 V
+3 -70 V
+3 32 V
+3 24 V
+4 -11 V
+3 36 V
+3 -12 V
+3 -17 V
+3 1 V
+4 7 V
+3 -1 V
+3 -33 V
+3 -3 V
+3 -45 V
+4 30 V
+3 -10 V
+3 16 V
+3 -1 V
+3 -1 V
+4 -1 V
+3 7 V
+3 790 V
+3 -631 V
+3 -117 V
+4 -9 V
+3 28 V
+3 -15 V
+3 28 V
+3 -45 V
+3 36 V
+4 -20 V
+3 -22 V
+3 47 V
+3 -40 V
+3 24 V
+4 -33 V
+3 50 V
+3 -20 V
+3 453 V
+4 46 V
+3 -544 V
+3 -2 V
+3 1 V
+3 -6 V
+4 -42 V
+3 24 V
+3 21 V
+3 -8 V
+3 -2 V
+4 -22 V
+3 30 V
+3 -2 V
+3 1 V
+3 -20 V
+4 -3 V
+3 1284 V
+3 -1273 V
+3 18 V
+3 -39 V
+3 40 V
+4 -3 V
+3 -10 V
+3 -3 V
+3 -16 V
+3 15 V
+4 5 V
+3 11 V
+3 -18 V
+3 -8 V
+4 3 V
+3 38 V
+3 -43 V
+3 24 V
+3 -8 V
+4 -16 V
+3 21 V
+3 -16 V
+3 -10 V
+3 -22 V
+4 51 V
+currentpoint stroke M
+3 -29 V
+3 22 V
+3 -31 V
+3 15 V
+4 -10 V
+3 30 V
+3 -53 V
+3 19 V
+3 400 V
+4 -415 V
+3 3 V
+3 15 V
+3 -14 V
+3 -1 V
+3 3 V
+4 -12 V
+3 20 V
+3 -2 V
+3 -7 V
+3 -22 V
+4 19 V
+3 -4 V
+3 22 V
+3 -3 V
+3 -10 V
+4 -1 V
+3 -10 V
+3 15 V
+3 5 V
+4 -18 V
+3 7 V
+3 30 V
+3 -18 V
+3 19 V
+4 -34 V
+3 11 V
+3 5 V
+3 -20 V
+3 15 V
+4 -31 V
+3 34 V
+3 224 V
+3 1103 V
+3 -1388 V
+4 47 V
+3 -14 V
+3 5 V
+3 5 V
+3 -6 V
+4 -14 V
+3 22 V
+3 -8 V
+3 -22 V
+3 13 V
+4 17 V
+3 3 V
+3 -18 V
+3 -12 V
+3 7 V
+4 2 V
+3 -14 V
+3 12 V
+3 -2 V
+3 -10 V
+4 12 V
+3 2 V
+3 -9 V
+3 15 V
+3 -6 V
+4 -3 V
+3 4 V
+3 3 V
+3 12 V
+3 -31 V
+4 30 V
+3 780 V
+3 -707 V
+3 -49 V
+3 32 V
+4 -1 V
+3 -18 V
+3 1 V
+3 -14 V
+3 22 V
+4 7 V
+3 -43 V
+3 18 V
+3 32 V
+3 -17 V
+3 1 V
+4 -29 V
+3 7 V
+3 33 V
+3 -14 V
+4 -35 V
+3 -6 V
+3 -24 V
+3 23 V
+3 -6 V
+4 -4 V
+3 15 V
+3 -37 V
+3 2 V
+3 5 V
+4 -9 V
+3 29 V
+3 -26 V
+3 11 V
+3 728 V
+4 -606 V
+3 -32 V
+3 -16 V
+3 10 V
+3 -12 V
+3 -2 V
+4 28 V
+3 -13 V
+3 -26 V
+3 9 V
+4 5 V
+3 14 V
+3 -18 V
+3 32 V
+3 -16 V
+4 -7 V
+3 -20 V
+3 24 V
+3 -24 V
+3 30 V
+4 -51 V
+3 45 V
+3 516 V
+3 -101 V
+3 -450 V
+4 20 V
+3 -10 V
+3 -20 V
+3 7 V
+3 1 V
+4 3 V
+3 13 V
+3 45 V
+3 1128 V
+3 -1227 V
+4 -52 V
+3 -5 V
+3 42 V
+3 -11 V
+3 15 V
+4 -12 V
+3 26 V
+3 -18 V
+3 3 V
+3 -10 V
+4 -49 V
+3 45 V
+3 -1 V
+3 28 V
+3 -22 V
+4 1 V
+3 1 V
+3 -12 V
+3 26 V
+3 -20 V
+4 3 V
+3 22 V
+3 -20 V
+3 9 V
+3 -32 V
+4 28 V
+3 -27 V
+3 18 V
+3 -9 V
+3 14 V
+4 -30 V
+3 1081 V
+3 268 V
+3 -1320 V
+3 -15 V
+4 10 V
+3 -20 V
+3 44 V
+3 -18 V
+3 -5 V
+3 -17 V
+4 10 V
+3 5 V
+3 -4 V
+3 -3 V
+3 -9 V
+4 4 V
+3 9 V
+3 12 V
+3 -15 V
+4 12 V
+3 -28 V
+3 16 V
+3 46 V
+3 -45 V
+stroke
+grestore
+end
+showpage
+%%Trailer
+%%DocumentFonts: Helvetica
diff --git a/share/doc/papers/timecounter/timecounter.ms b/share/doc/papers/timecounter/timecounter.ms
new file mode 100644
index 000000000000..a0a2e668ee9d
--- /dev/null
+++ b/share/doc/papers/timecounter/timecounter.ms
@@ -0,0 +1,1074 @@
+.EQ
+delim øø
+.EN
+.\"
+.\" ----------------------------------------------------------------------------
+.\" "THE BEER-WARE LICENSE" (Revision 42):
+.\" <phk@login.dknet.dk> wrote this file. As long as you retain this notice you
+.\" can do whatever you want with this stuff. If we meet some day, and you think
+.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+.\" ----------------------------------------------------------------------------
+.\"
+.if n .ND
+.TI
+Timecounters: Efficient and precise timekeeping in SMP kernels.
+.AA
+.A "Poul-Henning Kamp" "The FreeBSD Project"
+.AB
+The FreeBSD timecounters are an architecture-independent implementation
+of a binary timescale using whatever hardware support is at hand
+for tracking time. The binary timescale converts using simple
+multiplication to canonical timescales based on micro- or nano-seconds
+and can interface seamlessly to the NTP PLL/FLL facilities for clock
+synchronisation. Timecounters are implemented using lock-less
+stable-storage based primitives which scale efficiently in SMP
+systems. The math and implementation behind timecounters will
+be detailed as well as the mechanisms used for synchronisation. \**
+.AE
+.FS
+This paper was presented at the EuroBSDcon 2002 conference in Amsterdam.
+.FE
+.1C
+.SH
+Introduction
+.PP
+Despite digging around for it, I have not been able to positively
+identify the first computer which knew the time of day.
+The feature probably arrived either from the commercial side
+so service centres could bill computer cycles to customers or from
+the technical side so computers could timestamp external events,
+but I have not been able to conclusively nail the first implementation down.
+.LP
+But there is no doubt that it happened very early in the development
+of computers
+and if systems like the ``SAGE'' [SAGE] did not know what time
+it was I would be amazed.
+.LP
+On the other hand, it took a long time for a real time clock to
+become a standard feature:
+.LP
+The ``Apple ]['' computer
+had neither in hardware nor software any notion what time it was.
+.LP
+The original ``IBM PC'' did know what time it was, provided you typed
+it in when you booted it, but it forgot when you turned it off.
+.LP
+One of the ``advanced technologies'' in the ``IBM PC/AT'' was a battery
+backed CMOS chip which kept track of time even when the computer
+was powered off.
+.LP
+Today we expect our computers to know the time, and with network
+protocols like NTP we will usually find that they do, give and
+take some milliseconds.
+.LP
+This article is about the code in the FreeBSD kernel which keeps
+track of time.
+.SH
+Time and timescale basics
+.PP
+Despite the fact that time is the physical quantity (or maybe entity
+?) about which we know the least, it is at the same time [sic!] what we
+can measure with the highest precision of all physical quantities.
+.LP
+The current crop of atomic clocks will neither gain nor lose a
+second in the next couple hundred million years, provided we
+stick to the preventative maintenance schedules. This is a feat
+roughly in line with knowing the circumference of the Earth
+with one micrometer precision, in real time.
+.LP
+While it is possible to measure time by means other than oscillations,
+for instance transport or consumption of a substance at a well-known
+rate, such designs play no practical role in time measurement because
+their performance is significantly inferior to oscillation based
+designs.
+.LP
+In other words, it is pretty fair to say that all relevant
+timekeeping is based on oscillating phenomena:
+.TS
+center;
+l l.
+sun-dial Earths rotation about its axis.
+calendar Ditto + Earths orbit around the sun.
+clockwork Mechanical oscillation of pendulum.
+crystals Mechanical resonance in quartz.
+atomic Quantum-state transitions in atoms.
+.TE
+.LP
+We can therefore with good fidelity define ``a clock'' to be the
+combination of an oscillator and a counting mechanism:
+.LP
+.if t .PSPIC fig3.eps
+.LP
+The standard second is currently defined as
+.QP
+The duration of 9,192,631,770 periods of the radiation corresponding to the transition between the two hyperfine levels of the ground state of the caesium 133 atom.
+.LP
+and we have frequency standards which are able to mark a sequence of
+such seconds
+with an error less than ø2 cdot 10 sup{-15}ø [DMK2001] with commercially
+available products doing better than ø1 cdot 10 sup{-14}ø [AG2002].
+.LP
+Unlike other physical units with a conventionally defined origin,
+longitude for instance, the ephemeral nature of time prevents us
+from putting a stake in the ground, so to speak, and measure from
+there. For measuring time we have to rely on ``dead reckoning'',
+just like the navigators before Harrison built his clocks [RGO2002]:
+We have to tally how far we went from our reference point, keeping a
+running total at all times, and use that as our estimated position.
+.LP
+The upshot of this is, that we cannot define a timescale by any
+other means than some other timescale(s).
+.LP
+``Relative time'' is a time interval between two events, and for this
+we only need to agree on the rate of the oscillator.
+.LP
+``Absolute time'' consists of a well defined point in time and the
+time interval since then, this is a bit more tricky.
+.LP
+The Internationally agreed upon TAI and the UTC timescales
+starts at (from a physics point of view) arbitrary points in time
+and progresses in integral intervals of the standard second, with the
+difference being that UTC does tricks to the counting to stay roughly
+in sync with Earths rotation \**.
+.FS
+The first atomic based definition actually operated in a different way:
+each year would have its own value determined for the frequency of the
+caesium resonance, selected so as to match the revolution rate of the
+Earth. This resulted in time-intervals being very unwieldy business,
+and more and more scientists realized that the caesium resonance
+was many times more stable than the angular momentum of the Earth.
+Eventually the new leap-second method was introduced in 1972.
+It is interesting to note that the autumn leaves falling on the
+northern hemisphere affects the angular momentum enough to change
+the Earths rotational rate measurably.
+.FE
+.LP
+TAI is defined as a sequence of standard seconds (the first timescale),
+counted from January 1st 1958 (the second timescale).
+.LP
+UTC is defined basically the same way, but every so often a leap-second
+is inserted (or theoretically deleted) to keep UTC synchronised
+with Earths rotation.
+.LP
+Both the implementation of these two, and a few others speciality
+timescales are the result of the
+combined efforts of several hundred atomic frequency standards in
+various laboratories and institutions throughout the world, all
+reporting to the BIPM in Paris who calculate the ``paper clock'' which
+TAI and UTC really are using a carefully designed weighting algorithm \**.
+.FS
+The majority of these clocks are model 5071A from Agilent (the test
+and measurement company formerly known as ``Hewlett-Packard'') which
+count for as much as 85% of the combined weight.
+A fact the company deservedly is proud of.
+The majority of the remaining weight is assigned to a handful of big
+custom-design units like the PTB2 and NIST7.
+.FE
+.LP
+Leap seconds are typically announced six to nine months in advance,
+based on precise observations of median transit times of stars and VLBI
+radio astronomy of very distant quasars.
+.LP
+The perceived wisdom of leap-seconds has been gradually decreasing
+in recent years, as devices and products with built-in calendar
+functionality becomes more and more common and people realize that
+user input or software upgrades are necessary to instruct the
+calendar functionality about upcoming leap seconds.
+.SH
+UNIX timescales
+.PP
+UNIX systems use a timescale which pretends to be UTC, but defined
+as the count of standard seconds since 00:00:00 01-01-1970 UTC,
+ignoring the leap-seconds. This definition has never been perceived
+as wise.
+.LP
+Ignoring leap seconds means that unless some trickery is performed
+when a leap second happens on the UTC scale, UNIX clocks would be
+one second off. Another implication is that the length of a
+time interval calculated on UNIX time_t variables, can be up to 22
+(and counting) seconds wrong relative to the same time interval
+measured on the UTC timescale.
+.LP
+Recent efforts have tried to make the NTP protocol make up for this
+deficiency by transmitting the UTC-TAI offset as part of the protocol.
+[MILLS2000A]
+.LP
+Fractional seconds are represented two ways in UNIX, ``timeval'' and
+``timespec''. Both of these formats are two-component structures
+which record the number of seconds, and the number of microseconds
+or nanoseconds respectively.
+.LP
+This unfortunate definition makes arithmetic on these two formats
+quite expensive to perform in terms of computer instructions:
+.DS
+.ps -1
+/* Subtract timeval from timespec */
+t3.tv_sec = t1.tv_sec - t2.tv_sec;
+t3.tv_nsec = t1.tv_nsec -
+ t2.tv_usec * 1000;
+if (t3.tv_nsec >= 1000000000) {
+ t3.tv_sec++;
+ t3.tv_nsec -= 1000000000;
+} else if (t3.tv_nsec < 0) {
+ t3.tv_sec--;
+ t3.tv_nsec += 1000000000;
+}
+.ps +1
+.DE
+.LP
+While nanoseconds will probably be enough for most timestamping
+tasks faced by UNIX computers for a number of years, it is an
+increasingly uncomfortable situation that CPU clock periods and
+instruction timings are already not representable in the standard
+time formats available on UNIX for consumer grade hardware,
+and the first POSIX mandated API, \fCclock_getres(3)\fP has
+already effectively reached end of life as a result of this.
+.LP
+Hopefully the various standards bodies will address this issue
+better in the future.
+.SH
+Precision, Stability and Resolution
+.PP
+Three very important terms in timekeeping are ``precision'',
+``stability'' and ``resolution''.
+While the three words may seem to describe somewhat the
+same property in most uses, their use in timekeeping covers three
+very distinct and well defined properties of a clock.
+.LP
+Resolution in clocks is simply a matter of the step-size of the
+counter or in other words: the rate at which it steps.
+A counter running on a 1 MHz frequency will have a resolution
+of 1 microsecond.
+.LP
+Precision talks about how close to the intended rate the clock runs,
+stability about how much the rate varies and resolution about the
+size of the smallest time interval we can measure.
+.LP
+From a quality point of view, Stability is a much more
+valuable property than precision, this is probably best explained
+using a graphic illustration of the difference between the two
+concepts:
+.LP
+.if t .PSPIC fig1.eps
+.LP
+In the top row we have instability, the bullet holes are spread over
+a large fraction of the target area.
+In the bottom row, the bullets all hit in a very small area.
+.LP
+On the left side, we have lack of precision, the holes obviously are
+not centred on the target, a systematic offset exists.
+In the right side we have precision, the bullets are centred on
+the target \**.
+.FS
+We cannot easily get resolution into this analogy, the obvious
+representation as the diameter of the bullet-hole is not correct,
+it would have to be the grid or other pattern of locations where
+the bullet could possibly penetrate the target material, but this
+gets too quantum-mechanical-oid to serve the instructional purpose.
+.FE
+.LP
+Transposing these four targets to actual clocks, the situation
+could look like the following plots:
+.LP
+.if t .PSPIC fig2.eps
+.LP
+On the x-axis we have time and on the y-axis how wrong the clock
+was at a given point in time.
+.LP
+The reason atomic standards are such a big deal in timekeeping is
+that they are incredibly stable: they are able to generate an oscillation
+where the period varies by roughly a millionth of a billionth of a
+second in long term measurements.
+.LP
+They are in fact not nearly as precise as they are stable, but as
+one can see from the graphic above, a stable clock which is not
+precise can be easily corrected for the offset and thus calibrated
+is as good as any clock.
+.LP
+This lack of precision is not necessarily a flaw in these kinds of
+devices, once you get into the ø10 cdot 10 sup{-15}ø territory
+things like the blackbody spectrum at the particular absolute
+temperature of the clocks hardware and general relativistic
+effects mostly dependent on the altitude above earths center
+has to be corrected for \**.
+.FS
+This particularly becomes an issue with space-based atomic standards
+as those found on the ``Navstar'' GPS satellites.
+.FE
+.SH
+Design goals of timecounters
+.PP
+After this brief description of the major features of the local
+landscape, we can look at the design goals of timecounters in detail:
+.LP
+.I "Provide timestamps in timeval and timespec formats,"
+.IP
+This is obviously the basic task we have to solve, but as was noted
+earlier, this is in no way the performance requirement.
+.LP
+.I "on both the ``uptime'' and the POSIX timescales,"
+.IP
+The ``uptime'' timescale is convenient for time intervals which are
+not anchored in UTC time: the run time of processes, the access
+time of disks and similar.
+.IP
+The uptime timescale counts seconds starting from when the system
+is booted. The POSIX/UTC timescale is implemented by adding an
+estimate of the POSIX time when the system booted to the uptime
+timescale.
+.LP
+.I "using whatever hardware we have available at the time,"
+.IP
+Which in a subtle way also implies ``be able to switch from one
+piece of hardware to another on the fly'' since we may not know
+right up front what hardware we have access to and which is
+preferable to use.
+.LP
+.I "while supporting the NTP PLL/FLL discipline code,"
+.IP
+The NTP kernel PLL/FLL code allows the local clock and timescale
+to be synchronised or syntonised to an external timescale either
+via network packets or hardware connection. This also implies
+that the rate and phase of the timescale must be manoeuvrable
+with sufficient resolution.
+.LP
+.I "and providing support for the RFC 2783 PPS API,"
+.IP
+This is mainly for the benefit of the NTPD daemons communication
+with external clock or frequency hardware, but it has many other
+interesting uses as well [PHK2001].
+.LP
+.I "in a SMP efficient way."
+.IP
+Timestamps are used many places in the kernel and often at pretty
+high rate so it is important that the timekeeping facility
+does not become a point of CPU or lock contention.
+.SH
+Timecounter timestamp format.
+.PP
+Choosing the fundamental timestamp format for the timecounters is
+mostly a question of the resolution and steer-ability requirements.
+.LP
+There are two basic options on contemporary hardware: use a 32 bit
+integer for the fractional part of seconds, or use a 64 bit which
+is computationally more expensive.
+.LP
+The question therefore reduces to the somewhat simpler: can we get
+away with using only 32 bit ?
+.LP
+Since 32 bits fractional seconds have a resolution of slightly
+better than quarter of a nanosecond (.2328 nsec) it can obviously
+be converted to nanosecond resolution struct timespec timestamps
+with no loss of precision, but unfortunately not with pure 32 bit
+arithmetic as that would result in unacceptable rounding errors.
+.LP
+But timecounters also need to represent the clock period of the
+chosen hardware and this hardware might be the GHz range CPU-clock.
+The list of clock frequencies we could support with 32 bits are:
+.TS
+center;
+l l n l.
+ø2 sup{32} / 1ø ø=ø 4.294 GHz
+ø2 sup{32} / 2ø ø=ø 2.147 GHz
+ø2 sup{32} / 3ø ø=ø 1.432 GHz
+\&...
+ø2 sup{32} / (2 sup{32}-1)ø ø=ø 1.000 Hz
+.TE
+We can immediately see that 32 bit is insufficient to faithfully
+represent clock frequencies even in the low GHz area, much less in
+the range of frequencies which have already been vapourwared by
+both IBM, Intel and AMD.
+QED: 32 bit fractions are not enough.
+.LP
+With 64 bit fractions the same table looks like:
+.TS
+center;
+l l r l.
+ø2 sup{64} / 1ø ø=ø ø 18.45 cdot 10 sup{9}ø GHz
+ø2 sup{64} / 2ø ø=ø ø 9.223 cdot 10 sup{9}ø GHz
+\&...
+ø2 sup{64} / 2 sup{32}ø ø=ø 4.294 GHz
+\&...
+ø2 sup{64} / (2 sup{64}-1)ø ø=ø 1.000 Hz
+.TE
+And the resolution in the 4 GHz frequency range is approximately one Hz.
+.LP
+The following format has therefore been chosen as the basic format
+for timecounters operations:
+.DS
+.ps -1
+struct bintime {
+ time_t sec;
+ uint64_t frac;
+};
+.ps +1
+.DE
+Notice that the format will adapt to any size of time_t variable,
+keeping timecounters safely out of the ``We SHALL prepare for the
+Y2.038K problem'' war zone.
+.LP
+One beauty of the bintime format, compared to the timeval and
+timespec formats is that it is a binary number, not a pseudo-decimal
+number. If compilers and standards allowed, the representation
+would have been ``int128_t'' or at least ``int96_t'', but since this
+is currently not possible, we have to express the simple concept
+of multiword addition in the C language which has no concept of a
+``carry bit''.
+.LP
+To add two bintime values, the code therefore looks like this \**:
+.FS
+If the reader suspects the '>' is a typo, further study is suggested.
+.FE
+.LP
+.DS
+.ps -1
+uint64_t u;
+
+u = bt1->frac;
+bt3->frac = bt1->frac + bt2->frac;
+bt3->sec = bt1->sec + bt2->sec;
+if (u > bt3->frac)
+ bt3->sec += 1;
+.ps +1
+.DE
+.LP
+An important property of the bintime format is that it can be
+converted to and from timeval and timespec formats with simple
+multiplication and shift operations as shown in these two
+actual code fragments:
+.DS
+.ps -1
+void
+bintime2timespec(struct bintime *bt,
+ struct timespec *ts)
+{
+
+ ts->tv_sec = bt->sec;
+ ts->tv_nsec =
+ ((uint64_t)1000000000 *
+ (uint32_t)(bt->frac >> 32)) >> 32;
+}
+.ps +1
+.DE
+.DS
+.ps -1
+void
+timespec2bintime(struct timespec *ts,
+ struct bintime *bt)
+{
+
+ bt->sec = ts->tv_sec;
+ /* 18446744073 =
+ int(2^64 / 1000000000) */
+ bt->frac = ts->tv_nsec *
+ (uint64_t)18446744073LL;
+}
+.ps +1
+.DE
+.LP
+.SH
+How timecounters work
+.PP
+To produce a current timestamp the timecounter code
+reads the hardware counter, subtracts a reference
+count to find the number of steps the counter has
+progressed since the reference timestamp.
+This number of steps is multiplied with a factor
+derived from the counters frequency, taking into account
+any corrections from the NTP PLL/FLL and this product
+is added to the reference timestamp to get a timestamp.
+.LP
+This timestamp is on the ``uptime'' time scale, so if
+UNIX/UTC time is requested, the estimated time of boot is
+added to the timestamp and finally it is scaled to the
+timeval or timespec if that is the desired format.
+.LP
+A fairly large number of functions are provided to produce
+timestamps, depending on the desired timescale and output
+format:
+.TS
+center;
+l r r.
+Desired uptime UTC/POSIX
+Format timescale timescale
+_
+bintime binuptime() bintime()
+timespec nanouptime() nanotime()
+timeval microuptime() microtime()
+.TE
+.LP
+Some applications need to timestamp events, but are not
+particularly picky about the precision.
+In many cases a precision of tenths or hundredths of
+a second is sufficient.
+.LP
+A very typical case is UNIX file timestamps:
+There is little point in spending computational resources getting an
+exact nanosecond timestamp, when the data is written to
+a mechanical device which has several milliseconds of unpredictable
+delay before the operation is completed.
+.LP
+Therefore a complementary shadow family of timestamping functions
+with the prefix ``get'' have been added.
+.LP
+These functions return the reference
+timestamp from the current timehands structure without going to the
+hardware to determine how much time has elapsed since then.
+These timestamps are known to be correct to within the rate at which
+the periodic update runs, which in practice means 1 to 10 milliseconds.
+.SH
+Timecounter math
+.LP
+The delta-count operation is straightforward subtraction, but we
+need to logically AND the result with a bit-mask with the same number
+(or less) bits as the counter implements,
+to prevent higher order bits from getting set when the counter rolls over:
+.DS
+.ce
+.EQ
+Delta Count = (Count sub{now} - Count sub{ref}) ~ BITAND ~ mask
+.EN
+.DE
+The scaling step is straightforward.
+.DS
+.ce
+.EQ
+T sub{now} = Delta Count cdot R sub{counter} + T sub{ref}
+.EN
+.DE
+The scaling factor øR sub{counter}ø will be described below.
+.LP
+At regular intervals, scheduled by \fChardclock()\fP, a housekeeping
+routine is run which does the following:
+.LP
+A timestamp with associated hardware counter reading is elevated
+to be the new reference timecount:
+.DS
+
+.ce
+.EQ
+Delta Count = (Count sub{now} - Count sub{ref}) ~ BITAND ~ mask
+.EN
+
+.ce
+.EQ
+T sub{now} = Delta Count cdot R sub{counter}
+.EN
+
+.ce
+.EQ
+Count sub{ref} = Count sub{now}
+.EN
+
+.ce
+.EQ
+T sub{ref} = T sub{now}
+.EN
+.DE
+.LP
+If a new second has started, the NTP processing routines are called
+and the correction they return and the counters frequency is used
+to calculate the new scaling factor øR sub{counter}ø:
+.DS
+.ce
+.EQ
+R sub{counter} = {2 sup{64} over Freq sub{counter}} cdot ( 1 + R sub{NTP} )
+.EN
+.DE
+Since we only have access to 64 bit arithmetic, dividing something
+into ø2 sup{64}ø is a problem, so in the name of code clarity
+and efficiency, we sacrifice the low order bit and instead calculate:
+.DS
+.ce
+.EQ
+R sub{counter} = 2 cdot {2 sup{63} over Freq sub{counter}} cdot ( 1 + R sub{NTP} )
+.EN
+.DE
+The øR sub{NTP}ø correction factor arrives as the signed number of
+nanoseconds (with 32 bit binary fractions) to adjust per second.
+This quasi-decimal number is a bit of a square peg in our round binary
+hole, and a conversion factor is needed.
+Ideally we want to multiply this factor by:
+.DS
+.ce
+.EQ
+2 sup {64} over {10 sup{9} cdot 2 sup{32}} = 4.294967296
+.EN
+.DE
+This is not a nice number to work with.
+Fortunately, the precision of this correction is not critical, we are
+within a factor of a million of the ø10 sup{-15}ø performance level
+of state of the art atomic clocks, so we can use an approximation
+on this term without anybody noticing.
+.LP
+Deciding which fraction to use as approximation needs to carefully
+consider any possible overflows that could happen.
+In this case the correction may be as large as \(+- 5000 PPM which
+leaves us room to multiply with about 850 in a multiply-before-divide
+setting.
+Unfortunately, there are no good fractions which multiply with less
+than 850 and at the same time divide by a power of two, which is
+desirable since it can be implemented as a binary shift instead of
+an expensive full division.
+.LP
+A divide-before-multiply approximation necessarily results in a loss
+of lower order bits, but in this case dividing by 512 and multiplying
+by 2199 gives a good approximation where the lower order bit loss is
+not a concern:
+.DS
+.EQ
+2199 over 512 = 4.294921875
+.EN
+.DE
+The resulting error is a systematic undercompensation of 10.6PPM
+of the requested change, or ø1.06 cdot 10 sup -14ø per nanosecond
+of correction.
+This is perfectly acceptable.
+.LP
+Putting it all together, including the one bit we put on the altar for the
+Goddess of code clarity, the formula looks like this:
+.DS
+.ce
+.EQ
+R sub{counter} = 2 cdot {{2 sup{63} + 2199 cdot {R sub{NTP}} over 1024} over Freq sub{counter}}
+.EN
+.DE
+Presented here in slightly unorthodox format to show the component arithmetic
+operations as they are carried out in the code.
+.SH
+Frequency of the periodic update
+.PP
+The hardware counter should have a long enough
+period, ie, number of distinct counter values divided by
+frequency, to not roll over before our periodic update function
+has had a chance to update the reference timestamp data.
+.LP
+The periodic update function is called from \fChardclock()\fP which
+runs at a rate which is controlled by the kernel parameter
+.I HZ .
+.LP
+By default HZ is 100 which means that only hardware with a period
+longer than 10 msec is usable.
+If HZ is configured higher than 1000, an internal divider is
+activated to keep the timecounter periodic update running
+no more often than 2000 times per second.
+.LP
+Let us take an example:
+At HZ=100 a 16 bit counter can run no faster than:
+.DS
+.ce
+.EQ
+2 sup{16} cdot {100 Hz} = 6.5536 MHz
+.EN
+.DE
+Similarly, if the counter runs at 10MHz, the minimum HZ is
+.DS
+.ce
+.EQ
+{10 MHz} over {2 sup{16}} = 152.6 Hz
+.EN
+.DE
+.LP
+Some amount of margin is of course always advisable,
+and a factor two is considered prudent.
+.LP
+.SH
+Locking, lack of ...
+.PP
+Provided our hardware can be read atomically, that our arithmetic
+has enough bits to not roll over and that our clock frequency is
+perfectly, or at least sufficiently, stable, we could avoid the
+periodic update function, and consequently disregard the entire
+issue of locking.
+We are seldom that lucky in practice.
+.LP
+The straightforward way of dealing with meta data updates is to
+put a lock of some kind on the data and grab hold of that before
+doing anything.
+This would however be a very heavy-handed approach. First of
+all, the updates are infrequent compared to simple references,
+second it is not important which particular state of meta data
+a consumer gets hold of, as long as it is consistent: as long
+as the øCount sub{ref}ø and øT sub{ref}ø are a matching pair,
+and not old enough to cause an ambiguity with hardware counter
+rollover, a valid timestamp can be derived from them.
+.LP
+A pseudo-stable-storage with generation count method has been
+chosen instead.
+A ring of ten ``timehands'' data structures are used to hold the
+state of the timecounter system, the periodic update function
+updates the next structure with the new reference data and
+scaling factor and makes it the current timehands.
+.LP
+The beauty of this arrangement lies in the fact that even though
+a particular ``timehands'' data structure has been bumped from being
+the ``current state'' by its successor, it still contains valid data
+for some amount of time into the future.
+.LP
+Therefore, a process which has started the timestamping process but
+suffered an interrupt which resulted in the above periodic processing
+can continue unaware of this afterwards and not suffer corruption
+or miscalculation even though it holds no locks on the shared
+meta-data.
+.if t .PSPIC fig4.eps
+.LP
+This scheme has an inherent risk that a process may be de-scheduled for
+so long time that it will not manage to complete the timestamping
+process before the entire ring of timehands has been recycled.
+This case is covered by each timehand having a private generation number
+which is temporarily set to zero during the periodic processing, to
+mark inconsistent data, and incremented to one more than the
+previous value when the update has finished and the timehands
+is again consistent.
+.LP
+The timestamping code will grab a copy of this generation number and
+compare this copy to the generation in the timehands after completion
+and if they differ it will restart the timestamping calculation.
+.DS
+.ps -1
+do {
+ th = timehands;
+ gen = th->th_generation;
+ /* calculate timestamp */
+} while (gen == 0 ||
+ gen != th->th_generation);
+.ps +1
+.DE
+.LP
+Each hardware device supporting timecounting is represented by a
+small data structure called a timecounter, which documents the
+frequency, the number of bits implemented by the counter and a method
+function to read the counter.
+.LP
+Part of the state in the timehands structure is a pointer to the
+relevant timecounter structure, this makes it possible to change
+from one piece of hardware to another ``on the fly'' by updating
+the current timehands pointer in a manner similar to the periodic
+update function.
+.LP
+In practice this can be done with sysctl(8):
+.DS
+.ps -1
+sysctl kern.timecounter.hardware=TSC
+.ps +1
+.DE
+.LP
+at any time while the system is running.
+.SH
+Suitable hardware
+.PP
+A closer look on ``suitable hardware'' is warranted
+at this point.
+It is obvious from the above description that the ideal hardware
+for timecounting is a wide binary counter running at a constant
+high frequency
+and atomically readable by all CPUs in the system with a fast
+instruction(-sequence).
+.LP
+When looking at the hardware support on the PC platform, one
+is somewhat tempted to sigh deeply and mutter ``so much for theory'',
+because none of the above parameters seems to have been on the
+drawing board together yet.
+.LP
+All IBM PC derivatives contain a device more or less compatible
+with the venerable Intel i8254 chip.
+This device contains 3 counters of 16 bits each,
+one of which is wired so it can interrupt the CPU when the
+programmable terminal count is reached.
+.LP
+The problem with this device is that it only has 8bit bus-width,
+so reading a 16 bit timestamp takes 3 I/O operations: one to latch
+the count in an internal register, and two to read the high and
+low parts of that register respectively.
+.LP
+Obviously, on multi-CPU systems this cannot be done without some
+kind of locking mechanism preventing the other CPUs from trying
+to do the same thing at the same time.
+.LP
+Less obviously we find it is even worse than that:
+Since a low priority kernel thread
+might be reading a timestamp when an interrupt comes in, and since
+the interrupt thread might also attempt to generate a timestamp,
+we need to totally block interrupts out while doing those three
+I/O instructions.
+.LP
+And just to make life even more complicated, FreeBSD uses the same
+counter to provide the periodic interrupts which schedule the
+\fChardclock()\fP routine, so in addition the code has to deal with the
+fact that the counter does not count down from a power of two and
+that an interrupt is generated right after the reloading of the
+counter when it reaches zero.
+.LP
+Ohh, and did I mention that the interrupt rate for hardclock() will
+be set to a higher frequency if profiling is active ? \**
+.FS
+I will not even mention the fact that it can be set also to ridiculous
+high frequencies in order to be able to use the binary driven ``beep''
+speaker in the PC in a PCM fashion to output ``real sounds''.
+.FE
+.LP
+It hopefully doesn't ever get more complicated than that, but it
+shows, in its own bizarre and twisted way, just how little help the
+timecounter code needs from the actual hardware.
+.LP
+The next kind of hardware support to materialise was the ``CPU clock
+counter'' called ``TSC'' in official data-sheets.
+This is basically a on-CPU counter, which counts at the rate
+of the CPU clock.
+.LP
+Unfortunately, the electrical power needed to run a CPU is pretty
+precisely proportional with the clock frequency for the
+prevailing CMOS chip technology, so
+the advent of computers powered by batteries prompted technologies
+like APM, ACPI, SpeedStep and others which varies or throttles the
+CPU clock to match computing demand in order to minimise the power
+consumption \**.
+.FS
+This technology also found ways into stationary computers from
+two different vectors.
+The first vector was technical: Cheaper cooling solutions can be used
+for the CPU if they are employed resulting in cheaper commodity
+hardware.
+The second vector was political: For reasons beyond reason, energy
+conservation became an issue with personal computers, despite the fact
+that practically all North American households contain 4 to 5 household
+items which through inefficient designs waste more power than a
+personal computer use.
+.FE
+.LP
+Another wiggle for the TSC is that it is not usable on multi-CPU
+systems because the counter is implemented inside the CPU and
+not readable from other CPUs in the system.
+.LP
+The counters on different CPUs are not guaranteed
+to run synchronously (ie: show the same count at the same time).
+For some architectures like the DEC/alpha architecture they do not even
+run syntonously (ie: at the same rate) because the CPU clock frequency
+is generated by a small SAW device on the chip which is very sensitive
+to temperature changes.
+.LP
+The ACPI specification finally brings some light:
+it postulates the existence of a 24 or 32 bit
+counter running at a standardised constant frequency and
+specifically notes that this is intended to be used for timekeeping.
+.LP
+The frequency chosen, 3.5795454... MHz\**
+.FS
+The reason for this odd-ball frequency has to be sought in the ghastly
+colours offered by the original IBM PC Color Graphics Adapter: It
+delivered NTSC format output and therefore introduced the NTSC colour
+sync frequency into personal computers.
+.FE
+ is not quite as high as one
+could have wished for, but it is certainly a big improvement over
+the i8254 hardware in terms of access path.
+.LP
+But trust it to Murphy's Law: The majority of implementations so far
+have failed to provide latching suitable to avoid meta-stability
+problems, and several readings from the counter are necessary to
+get a reliable timestamp.
+In contrast to the i8254 mentioned above, we do not need to do
+any locking while doing so, since each individual read is atomic.
+.LP
+An initialization routine tries to test if the ACPI counter is properly
+latched by examining the width of a histogram over read delta-values.
+.LP
+Other architectures are similarly equipped with means for timekeeping,
+but generally more carefully thought out compared to the haphazard
+developments of the IBM PC architecture.
+.LP
+One final important wiggle of all this, is that it may not be possible
+to determine which piece of hardware is best suited for clock
+use until well into or even after the bootstrap process.
+.LP
+One example of this is the Loran-C receiver designed by Prof. Dave Mills
+[MILLS1992]
+which is unsuitable as timecounter until the daemon program which
+implements the software-half of the receiver has properly initialised
+and locked onto a Loran-C signal.
+.SH
+Ideal timecounter hardware
+.LP
+As proof of concept, a sort of an existentialist protest against
+the sorry state described above, the author undertook a project to
+prove that it is possible to do better than that, since none of
+the standard hardware offered a way to fully validate the timecounter
+design.
+.LP
+Using a COTS product, ``HOT1'', from Virtual Computers Corporation
+[VCC2002] containing a FPGA chip on a PCI form factor card, a 26
+bit timecounter running at 100MHz was successfully implemented.
+.LP
+.if t .PSPIC fig5.eps
+.LP
+.LP
+In order to show that timestamping does not necessarily have to
+be done using unpredictable and uncalibratable interrupts, an
+array of latches were implemented as well, which allow up to 10
+external signals to latch the reading of the counter when
+an external PPS signal transitions from logic high to logic
+low or vice versa.
+.LP
+Using this setup, a standard 133 MHz Pentium based PC is able to
+timestamp the PPS output of the Motorola UT+ GPS receiver with
+a precision of \(+- 10 nanoseconds \(+- one count which in practice
+averages out to roughly \(+- 15 nanoseconds\**:
+.FS
+The reason the plot does not show a very distinct 10 nanosecond
+quantization is that the GPS receiver produces the PPS signal from
+a clock with a roughly 55 nanosecond period and then predicts in
+the serial data stream how many nanoseconds this will be offset
+from the ideal time.
+This plot shows the timestamps corrected for this ``negative
+sawtooth correction''.
+.FE
+.LP
+.if t .PSPIC gps.ps
+.LP
+It should be noted that the author is no hardware wizard and
+a number of issues in the implementation result in less than
+ideal noise performance.
+.LP
+Now compare this ``ideal'' timecounter to the normal setup
+where the PPS signal is used
+to trigger an interrupt via the DCD pin on a serial port, and
+the interrupt handler calls \fCnanotime()\fP to timestamp
+the external event \**:
+.FS
+In both cases, the computer's clock frequency is controlled
+with a Rubidium frequency standard.
+The average quality of crystals used for computers would
+totally obscure the curves due to their temperature coefficient.
+.FE
+.LP
+.if t .PSPIC intr.ps
+.LP
+It is painfully obvious that the interrupt latency is the
+dominant noise factor in PPS timestamping in the second case.
+The asymmetric distribution of the noise in the second plot
+also more or less entirely invalidates the design assumption
+in the NTP PLL/FLL kernel code that timestamps are dominated
+by gaussian noise with few spikes.
+.SH
+Status and availability
+.PP
+The timecounter code has been developed and used in FreeBSD
+for a number of years and has now reached maturity.
+The source-code is located almost entirely in the kernel source file
+kern_tc.c, with a few necessary adaptations in code which
+interfaces to it, primarily the NTP PLL/FLL code.
+.LP
+The code runs on all FreeBSD platforms including i386, alpha,
+PC98, sparc64, ia64 and s/390 and contains no wordsize or
+endianness issues not specifically handled in the sourcecode.
+.LP
+The timecounter implementation is distributed under the ``BSD''
+open source license or the even more free ``Beer-ware'' license.
+.LP
+While the ability to accurately model and compensate for
+inaccuracies typical of atomic frequency standards does not
+cater to the larger userbase, this ability and the precision
+of the code guarantee solid support for the widespread deployment
+of NTP as a time synchronization protocol, without rounding
+or accumulative errors.
+.LP
+Adding support for new hardware and platforms has been
+done several times by other developers without any input from the
+author, so this particular aspect of timecounters design
+seems to work very well.
+.SH
+Future work
+.PP
+At this point in time, no specific plans exist for further
+development of the timecounters code.
+.LP
+Various micro-optimizations, mostly to compensate for inadequate
+compiler optimization could be contemplated, but the author
+resists these on the basis that they significantly decrease
+the readability of the source code.
+.SH
+Acknowledgements
+.PP
+.EQ
+delim ññ
+.EN
+The author would like to thank:
+.LP
+Bruce Evans
+for his invaluable assistance
+in taming the evil i8254 timecounter, as well as the enthusiastic
+resistance he has provided throughout.
+.PP
+Professor Dave Mills of University of Delaware for his work on
+NTP, for lending out the neglected twin Loran-C receiver and for
+picking up the glove when timecounters made it clear
+that the old ``microkernel'' NTP timekeeping code was not up to snuff
+[MILLS2000B].
+.PP
+Tom Van Baak for helping out, despite the best efforts of the National
+Danish Posts center for Customs and Dues to prevent it.
+.PP
+Corby Dawson for helping with the care and feeding for caesium standards.
+.PP
+The staff at the NELS Loran-C control station in Bø, Norway for providing
+information about step-changes.
+.PP
+The staff at NELS Loran-C station Eiðe, Faeroe
+Islands for permission to tour their installation.
+.PP
+The FreeBSD users for putting up with ``micro uptime went backwards''.
+.SH
+References
+.LP
+[AG2002]
+Published specifications for Agilent model 5071A Primary Frequency
+Standard on
+.br
+http://www.agilent.com
+.LP
+[DMK2001]
+"Accuracy Evaluation of a Cesium Fountain Primary Frequency Standard at NIST."
+D. M. Meekhof, S. R. Jefferts, M. Stephanovic, and T. E. Parker
+IEEE Transactions on instrumentation and measurement, VOL. 50, NO. 2,
+APRIL 2001.
+.LP
+[PHK2001]
+"Monitoring Natural Gas Usage"
+Poul-Henning Kamp
+http://phk.freebsd.dk/Gasdims/
+.LP
+[MILLS1992]
+"A computer-controlled LORAN-C receiver for precision timekeeping."
+Mills, D.L.
+Electrical Engineering Department Report 92-3-1, University of Delaware, March 1992, 63 pp.
+.LP
+[MILLS2000A]
+Levine, J., and D. Mills. "Using the Network Time Protocol to transmit International Atomic Time (TAI)". Proc. Precision Time and Time Interval (PTTI) Applications and Planning Meeting (Reston VA, November 2000), 431-439.
+.LP
+[MILLS2000B]
+"The nanokernel."
+Mills, D.L., and P.-H. Kamp.
+Proc. Precision Time and Time Interval (PTTI) Applications and Planning Meeting (Reston VA, November 2000), 423-430.
+.LP
+[RGO2002]
+For an introduction to Harrison and his clocks, see for
+instance
+.br
+http://www.rog.nmm.ac.uk/museum/harrison/
+.br
+or for
+a more detailed and possibly better researched account: Dava
+Sobels excellent book, "Longitude: The True Story of a Lone
+Genius Who Solved the Greatest Scientific Problem of His
+Time" Penguin USA (Paper); ISBN: 0140258795.
+.LP
+[SAGE]
+This ``gee-wiz'' kind of article in Dr. Dobbs Journal is a good place to
+start:
+.br
+http://www.ddj.com/documents/s=1493/ddj0001hc/0085a.htm
+.LP
+[VCC2002]
+Please consult Virtual Computer Corporations homepage:
+.br
+http://www.vcc.com
diff --git a/share/doc/papers/timecounter/tmac.usenix b/share/doc/papers/timecounter/tmac.usenix
new file mode 100644
index 000000000000..20fe1d625478
--- /dev/null
+++ b/share/doc/papers/timecounter/tmac.usenix
@@ -0,0 +1,952 @@
+.ds CC "
+.nr PS 10
+.nr FU 0.0i \" printer prints this much too low
+.nr VS 11
+.ds Q `\h'-0.02i'`
+.ds U '\h'-0.02i''
+.ds `` `\h'-0.02i'`
+.ds '' '\h'-0.02i''
+.\" footnote stuff
+.nr * 0 1
+.ds [. \|[
+.ds .] ]
+.if t .ds [, \s-2\v'-.4m'\f2
+.if n .ds [, [
+.if t .ds ,] \v'.4m'\s+2\fP
+.if n .ds ,] ]
+.ds * \*([,\\n+*\*(,]
+.ds [o ``
+.ds [c ''
+.ev 1
+.ps \n(PS
+.vs \n(VS
+.ev
+.de pp
+.PP
+..
+.de PP
+.LP
+.if t .ti 0.3i
+.if n .ti 5
+..
+.de LP
+.if t .sp 0.3
+.if n .sp
+.ne 1
+.in 0
+.nr Ia 0
+.nr Ic 0
+.fi
+..
+.de IP
+.if t .sp 0.3
+.if n .sp
+.\" Ia = total indent for this guy
+.\" Ib = .ti value for this guy
+.\" Ic = auxiliary indent
+.nr Ib 0.0i
+.if \\n(Ia=0 .nr Ia 0.2i
+.if !\\$1 \{\
+. nr Ia \w\\$1\ \ u
+. nr Ib \\n(Ia
+.\}
+.if !\\$2 .nr Ia \\$2n
+.in \\n(Iau
+.in +\\n(Icu
+.ti -\\n(Ibu
+.if !\\$1 \{\
+\&\\$1\ \ \c
+.\}
+..
+.de QP
+.IP
+..
+.de RS
+.nr Ic +0.2i
+..
+.de RE
+.nr Ic -0.2i
+..
+.de PN
+.rs
+'sp |10.4i-\\n(FUu
+.rs
+'sp |10.4i-\\n(FUu \" how many traps could there be?
+.rs
+'sp |10.4i-\\n(FUu
+.PO
+'ie e \{\
+.ev 2
+.\".if t 'tl \s10\f3%\\*(CC\fP\s0
+.ev
+'\}
+'el \{\
+.ev 2
+.\".if t 'tl \s10\f3\\*(CC%\fP\s0
+.ev
+'\}
+.po
+.wh 0 hh
+'bp
+..
+.de ff
+.nr dn 0
+.if \\nx \{\
+. ev 1
+. vs \\n(VVu
+. mk QR
+' nr QS 11i+0.5v-1u+\\nyu
+' if \\n(QS>\\n(QR 'if t 'sp |\\n(QSu
+. nf
+. FN \" print the footnotes
+. vs
+. rm FN
+. if \\n(.zfy .br\" end overflow diversion
+. if \\n(.zfy .di\" end overflow diversion
+. nr x 0 1
+. ev
+.\}
+.nr N +1
+.if \n(dn .fz \" leftover footnote
+.ie \\nN<\\nC \{\
+' if t 'sp |\\nTu
+' ns
+' po +3.12i \" position of 2nd column
+.\}
+.el \{\
+. rF
+. PN
+. PO
+. nr N 0
+.\}
+.nr y 0-\\nb
+.nr QQ 11i-\\nb
+.ch fx
+.ch ff
+.if t .wh \\n(QQu ff
+.if n .wh 66 ff
+.wh 12i fx
+.ch fx \\n(QQu
+.if \\n(dn .fz
+..
+.de fz \" get leftover footnote
+.FS \&
+.nf
+.fy
+.FE
+..
+.de fx \" footnote overflow processing
+.if \\nx .di fy
+..
+.de FS \" start a footnote
+.if \\n(.t<=1.7v .ne 2
+.da FN
+.nr YY \\n(.lu
+.ev 1
+.if t .ll \\n(YYu
+.if n .ll 70
+.if \\n+x=1 .fs
+.fi
+.ie \\$1 \ \ \*([,\\n*\*(,]\c
+.el \ \ \*([,\\$1\*(,]\c
+.ps -1
+.vs -1
+.nr VV \\n(.v
+..
+.de FE
+.br
+.ps +1
+.vs +1
+.ev
+.da
+.nr y -\\n(dn
+.nr QR 11i-1v-1u+\\nyu \" y is negative
+.ie \\n(nlu+1v<\\n(QRu .ch ff \\n(QRu
+.el .ch ff \\n(nlu+1v
+..
+.de fs
+.br
+.vs \\n(VS
+\v'-0.4v'\s16\D'l 1.5i 0'\s0
+.sp -0.4v
+.vs
+..
+.de PO
+.if t \{\
+.ie e .po 1.20i
+.el .po 1.20i
+.\}
+.if n .po 0
+..
+.de NC
+'PO
+.if t 'll \\n(LLu
+.if n 'll 78
+'nr N 0
+..
+.de 2C
+.br
+.nr LL 2.85i
+'NC
+'nr C 2
+'mk T
+'ns
+..
+.de 1C
+.br
+.if t .nr LL 6.5i
+.if n .nr LL 78
+.NC
+'nr C 1
+'mk T
+'ns
+..
+.de rF \" reset footer to nominal
+.nr b 1.0i+\\n(FUu \" nominal footer place
+..
+.rF
+'nr x 0 1 \" init:
+.nr y 0-\nb
+.pl 11i
+.nr QQ 11i+\ny
+.wh \n(QQu ff
+.wh 12i fx
+.ch fx \n(QQu
+.de hh
+'rs
+'if t 'sp |0.5i-\\n(FUu
+.PO
+'ie e \{\
+.ev 2
+'if t 'tl \s10\f3\\*(T2\\*(A2\fP\s0
+.ev
+'\}
+'el \{\
+.ev 2
+'if t 'tl \s10\f3\\*(A2\\*(T2\fP\s0
+.ev
+'\}
+'if t 'sp |1i-\\n(FUu
+'mk T
+'ns
+'nr x 0 1 \" number of footnotes
+.nr y 0-\\nb
+.nr QQ 11i+\\ny
+.ch ff
+.wh \\n(QQu ff
+.ch fx
+.wh 12i fx
+.ch fx \\n(QQu
+..
+.\"-------------------
+.de TI
+.nh
+.rs
+.in 0i
+.nr % \\$1
+.fi
+.nr QS \\n(.lu
+.ll 100i
+.ps 14
+.vs 17
+.ft 3
+.ds TT \\
+..
+.de AA
+.nr DL \w\\*(TT
+.nr NN 1
+.nr NL \\n(QSu-1i \" a nice line length for title
+.if \\n(NLu*\\n(NNu<\\n(DLu .nr NN +1
+.if \\n(NLu*\\n(NNu<\\n(DLu .nr NN +1
+.if \\n(NLu*\\n(NNu<\\n(DLu .nr NN +1
+.if \\n(NLu*\\n(NNu<\\n(DLu .nr NN +1
+.if \\n(NLu*\\n(NNu<\\n(DLu .nr NN +1
+.nr QR (\\n(DLu/\\n(NNu)+0.75i \" +.75 cuz words don't always balance
+.ll \\n(QRu
+.di TU
+.ad l
+\\*(TT
+.br
+.di
+.sp |1.0i-\\n(FUu
+.nr NP 0
+.if \\n(QSu>\\n(QRu .nr NP (\\n(QSu-\\n(QRu)/2u
+.po +\\n(NPu
+.ce 999
+.TU
+.ce 0
+.po
+.ll \\n(QSu
+.sp 0.1i
+.ft 1
+.ps 12
+.vs 14
+.sp 0.5
+..
+.de A \" .A "Brian Author" "Affiliation"
+.in 0
+.ie !\\$2 \{\
+.ce
+\f1\\$1
+.ce
+\f2\\$2
+.\}
+.el \{\
+.ce
+\f1\\$1\f2
+.\}
+..
+.de AB
+.sp 0.20i
+.po +0.5i
+.ll -1.125i
+.ce
+\f3\s12ABSTRACT\s0\f1
+.sp 0.5
+.ps \\n(PS
+.vs \\n(VS
+.ad b
+.fi
+..
+.de EA
+.sp
+.if t .2C
+.if n .1C
+.hy 14
+..
+.de AE
+.EA
+..
+.de SH
+.br
+.in 0
+.di St
+.ft 3
+.it 1 S2
+..
+.de SH
+.NH "\\$1" "\\$2" "\\$3"
+..
+.de S2
+.br
+.di
+.sp 0.75
+.ne 3
+.ce
+.St
+.br
+.ft 1
+.sp 0.5
+.ns
+..
+.de NH
+.br
+.ne 2
+.in 0
+.nr Ia 0
+.nr Ic 0
+.fi
+.nr L 1
+.if !\\$1 .nr L \\$1\" level
+.if \\nL1 .ft 3
+.if \\nL2 .ft 3
+.if \\nL3 .ft 2
+.di Nt
+.in 0.3i
+.ti 0
+.it 1 N2
+..
+.de N2
+.br
+.in 0
+.di
+.if t .if \\nL1 .sp 0.75
+.if t .if \\nL2 .sp 0.25
+.if t .if \\nL3 .sp 0.25
+.if t .if \\nL4 .sp 0.25
+.if n .sp
+.ne 3
+.if \\nL1 .ce
+.Nt
+.br
+.ft 1
+.if t .if \\nL1 .sp 0.50
+.if t .if \\nL2 .sp 0.25
+.if t .if \\nL3 .sp 0.25
+.if t .if \\nL4 .sp 0.25
+.if n .sp
+.ns
+..
+.de XP
+.sp 0.5
+.ne 2
+.in \w[3]\ \ u
+.ti 0
+.ns
+..
+.de I
+.nr PQ \\n(.f
+.ft 2
+.if !"\\$1"" \&\\$1\\f\\n(PQ\\$2
+..
+.de R
+.ft 1
+..
+.de B
+.nr PQ \\n(.f
+.ft 3
+.if !\\$1 \&\\$1\\f\\n(PQ\\$2
+..
+.de T
+.nr PQ \\n(.f
+.if !\\$1 \&\\$3\f(CW\\$1\\f\\n(PQ\\$2
+..
+.de Ds
+'sp 0.4
+'nr DY \\n(.i
+'in 0.1i
+.if !\\$1 .in \\$1
+.ft CW
+.nf
+..
+.de DS
+.br
+.Ds \\$1
+..
+.de DE
+.br
+.De
+..
+.de De
+'sp 0.4
+.in \\n(DYu
+.ft 1
+.fi
+..
+.de np
+.br
+.in \w\(bu\ \ u
+.ti -\w\(bu\ \ u
+\(bu\ \ \c
+..
+.de lp
+.br
+.in 0
+..
+.de TS
+.br
+.ul 0
+.sp 0.5
+..
+.de TE
+.sp 0.5
+..
+.de RT
+.ft 1
+.ce 0
+.ul 0
+.if t 'll \\n(LLu
+.if n 'll \\n(LL
+.ps \\n(PS
+.vs \\n(VS
+.in 0
+.\"bd 1
+.ta 5n 10n 15n 20n 25n 30n 35n 40n 45n 50n 55n 60n 65n 70n 75n 80n
+.fi
+..
+.de KF
+'sp 0.4
+.ev 2
+.nr Zs \\n(.s
+.nr Zv \\n(.v
+.ll \\n(LLu
+.in 0
+..
+.de KE
+.br
+.ps \\n(Zs
+.vs \\n(Zvu
+.ev
+'sp 0.4
+..
+.de UX
+\\$3\s-2UNIX\s0\\$1\\$2
+..
+.de SM
+.ps -2
+..
+. \" LG - larger
+.de LG
+.ps +2
+..
+.de EB
+.nr QQ 11i-\\nb-\\$1
+.nr b +\\n(QQu
+.nr y 0+\\nyu-\\n(QQu
+.nr QQ 11i+\\ny
+.ch ff
+.wh \\n(QQu ff
+.ch fx
+.wh 12i fx
+.ch fx \\n(QQu
+..
+.\"==============================================
+.de Zz
+.if \\nN=1 'ch Zz
+'sp 11i
+..
+.de Z
+.br
+.mk Qz
+.ev 2
+.nr Qy \\n(.l
+.ll 6.5i
+.di J
+.in 0
+.ft 1
+..
+.de ZZ
+.br
+.if !\\$1 \{\
+. if !\\$2 .ll \\$2
+. sp 0.4
+. ce
+. ft 1
+\\$1
+. ft
+. if !\\$2 .ll
+.\}
+.di
+.ev
+.nr QQ \\n(.t-\\n(dn-10u
+.if \\n(QQ<0 .tm oops -- called Z too late on page \\n%!
+.if \\n(QQ<0 .ex
+.sp \\n(QQu
+.mk Q2
+.ev 2
+.in 0
+.nf
+.J
+.fi
+.rm J
+.ll \\n(.lu
+.ev
+.sp |\\n(Qzu
+.nr QQ \\n(Q2-0.8v
+.EB \\n(QQu
+..
+.\"======================================================
+.de KS
+.\".tm KS: Not implemented yet
+..
+.de KE
+.\".tm KE: Not implemented yet
+..
+.de KF
+.\".tm KF: Not implemented yet
+..
+.ds ' \h'\w'e'u*4/10'\z\(aa\h'-\w'e'u*4/10'
+.de BE
+.br
+..
+.lt 6.5i
+.de T1
+.ds T2 \\$1
+..
+.de A1
+.ds A2 \\$1
+..
+.nr P1 1.1i \" picture width
+.nr P2 14u*\n(P1u/10u \" picture depth
+.de BB
+.in 0
+.\".nr QQ \\n(P2+0.1i
+.\".ne \\n(QQu
+.\".rs
+.\".ll -\\n(P1u
+.\".ll -0.1i
+.\".po +\\n(.lu+0.1i
+.\".sp 0.3
+.\"
+.\".sp -0.8
+.\"\!H\\n(.o
+.\".mk QQ
+.\"\!V\\n(QQ
+.\"\!DZ \\n(P1 \\n(P2
+.\".ie \\$1 .tm Picture not yet inserted for .BB
+.\".el \!P \\$1
+.\".sp -0.3
+.\".po
+.\".sp -1
+.\".if \\$1 \{\
+.\"\h'0.1i'\h'\\n(.lu'\D'l \\n(P1u 0'\D'l 0 \\n(P2u'\D'l -\\n(P1u 0'\D'l 0 -\\n(P2u'
+.\".sp -1
+.\".\}
+.\".sp 0.8
+.\".mk QQ
+.\".nr QQ +\\n(P2u
+.\".wh \\n(QQu Bb
+.\"=====
+.\" ::: .sp 1
+.\" ::: .ne 2
+.if \\n(SB=0 \{\
+.NH 1
+Author Information
+.\}
+.nr SB 1
+.PP
+..
+.de Bb
+'ch Bb
+'ll +\\n(P1u
+'ll +0.1i
+..
+.de GS
+.br
+..
+.de GE
+..
+.nr SL 0.3
+.nr LI 0.28i
+.de BL \" begin list
+.br
+.sp \\n(SL
+.in +\\n(LIu
+.ll -0.1i
+.if \\n(Ld \{\
+. ds Z\\n(Ld \\*(LT
+. af LN 1
+. nr N\\n(Ld \\n(LN
+. ds C\\n(Ld \\*(LC
+.\}
+.nr Ld +1
+.ds LT \\$1\" LT is the List Type: 1, a, or a bulletchar
+.if \\$1 .if '\\n(Ld'1'.ds LT \(bu
+.if \\$1 .if '\\n(Ld'2'.ds LT \(ci
+.if \\$1 .if '\\n(Ld'3'.ds LT \(sq
+.if '\\*(LT'1' .af LN \\$1
+.if '\\*(LT'i' .af LN \\$1
+.if '\\*(LT'I' .af LN \\$1
+.if '\\*(LT'a' .af LN \\$1
+.if '\\*(LT'A' .af LN \\$1
+.nr LN 0 \" LN is the list element number
+.ds LC\\$2
+.\" LC is the optional bullet trailer...
+..
+.de LE \" list element
+.br
+.ie '\\$1'' .nr LN +1
+.el \{\
+. nr LN 0
+. nr LN \\$1
+.\}
+.ds LX \\*(LT\\*(LC
+.if \\*(LT1 .ds LX \\n(LN\\*(LC
+.if \\*(LTa .ds LX \\n(LN\\*(LC
+.if \\*(LTA .ds LX \\n(LN\\*(LC
+.if \\*(LTi .ds LX \\n(LN\\*(LC
+.if \\*(LTI .ds LX \\n(LN\\*(LC
+.if \\n(LN=0 \{\
+. if !'\\$1'' .ds LX \\$1\\*(LC
+.\}
+.nr QQ 3u*\w' 'u/2u
+.ti -\\w'\\*(LX\h'\\n(QQu''u
+\\*(LX\h'\\n(QQu'\c
+..
+.de EL \" end list
+.br
+.nr Ld -1
+.if \\n(Ld>=0 \{\
+. ds LT \\*(Z\\n(Ld
+. nr LN \\n(N\\n(Ld
+. ds LC \\*(C\\n(Ld
+.if '\\*(LT'1' .af LN \\*(LT
+.if '\\*(LT'i' .af LN \\*(LT
+.if '\\*(LT'I' .af LN \\*(LT
+.if '\\*(LT'a' .af LN \\*(LT
+.if '\\*(LT'A' .af LN \\*(LT
+. \}
+.in -\\n(LIu
+.ll +0.1i
+..
+.de F1
+.in 0
+\v'-0.4'\D'l \\n(.lu 0'
+.sp -0.7
+.in
+..
+.de F2
+.mk QQ
+.if !'\\nT'\\n(QQ' \{\
+.in 0
+\v'-0.4'\D'l \\n(.lu 0'
+.sp -0.4
+.in
+.\}
+..
+.de EM
+.br
+.if o \{\
+.ds A2
+.ds T2
+.rs
+.bp
+.ch ff
+.ch fx
+.PO
+.rs
+.sp |10.4i-\\n(FUu
+.mk QQ
+'ie e \{\
+. ev 2
+.if t 'tl \s10\f3%\\*(CC\fP\s0
+. ev
+' \}
+'el \{\
+. ev 2
+.if t 'tl \s10\f3\\*(CC%\fP\s0
+. ev
+' \}
+.\}
+..
+.de RF
+.sp 0.1
+.in 0.3i
+.ie !\\$1 \{\
+.nr QQ \w'\\$1\ '
+.ti -\\n(QQu
+\\$1\ \c
+.\}
+.el .ti 0
+..
+.de RZ
+.sp 0.1
+.in 0.3i
+.nr QQ \w'\\$1\ '
+.ti -\\n(QQu
+\\$1\ \c
+..
+.de zz
+.tm note: .zz is not implemented.
+.ex
+.nr Z1 \\$1
+.nr Z2 \\$2
+.if \\n(.t<\\n(Z2 .tm note that figure ``\\$3'' does not fit at column bottom ------------------------ on page \\n%
+.ie '\\n(.z'' \{\
+.sp 0.2
+.ne \\n(Z2u
+\\!H\\n(.o
+.mk QQ
+.nr QQ +0.25v
+\\!V\\n(QQ
+\\!DZ \\n(Z1 \\n(Z2
+\\!P \\$3
+.rs
+.sp \\n(Z2u
+.sp 0.2
+.\}
+.el \{\
+.sp 0.2
+\\!.z3 \\n(Z1 \\n(Z2 "\\$3" \\n(.o
+.sp \\n(Z2u
+.sp 0.2
+.\}
+..
+.de z2
+.nr Z1 \\$1
+.nr Z2 \\$2
+.sp 0.2
+.ne \\n(Z2u
+.nr QQ (\\n(.lu-\\$1)/2u
+.sp \\n(Z2u
+.vs 0
+.po +\\n(QQu
+\X'ps: import \\$3 0 0 1 1 \\n(Z1 \\n(Z2'
+.br
+.po -\\n(QQu
+.vs
+.rs
+.sp 0.2
+..
+.de sz
+.vs \\$1
+.ps \\$1
+..
+.de M
+\f2\\$1\f1\|(\\$2)\\$3
+..
+.de B1
+.br
+.mk Bz
+..
+.de B2
+.br
+.mk By
+.nr D \\n(Byu-\\n(Bzu
+.nr L \\n(.lu+0.2i-\\n(.iu
+\h'-0.1i'\v'-0.7v'\D'l \\nLu 0'\D'l 0 -\\nDu'\D'l -\\nLu 0'\D'l 0 \\nDu'
+.sp -1
+..
+.de []
+.][ \\$1
+..
+.de ][
+.if \\$1>5 .tm Bad arg to []
+.[\\$1
+..
+.de [5 \" tm style
+.FS
+\\*([A, \\f2\\*([T\\f1,
+.ie \\n(TN \\*([M.
+.el Bell Laboratories internal memorandum (\\*([D).
+.RT
+.FE
+..
+.de [0 \" other
+.FS
+.nr [: 0
+.if !\\*([F .FP \\*([F
+.if !\\*([Q \{\
+.nr [: 1
+\\*([Q\c
+.\}
+.if !\\*([A \{\
+.nr [: 1
+\\*([A\c
+.\}
+.if !\\*([T \{\
+.if \\n([:>0 ,
+.nr [: 1
+\f2\\*([T\f1\c
+.\}
+.if !\\*([S , \\*([S\c
+.if !\\*([V , \\*([V\c
+.if !\\*([P \{\
+.ie \\n([P>0 , pp. \\*([P\c
+.el , p. \\*([P\c
+.\}
+.if !\\*([C , \\*([C\c
+.if !\\*([D , \\*([D\c
+.if \\n([:>0 \&.
+.if !\\*([O \\*([O
+.FE
+..
+.de [1
+.FS
+.if !\\*([F .FP \\*([F
+.if !\\*([Q \\*([Q,
+.if !\\*([A \\*([A,
+.if !\\*([T \\*([o\\*([T,\\*([c
+\f2\\*([J\f1\c
+.if !\\*([V , vol. \\*([V\c
+.if !\\*([N , no. \\*([N\c
+.if !\\*([P \{\
+.ie \\n([P>0 , pp. \\*([P\c
+.el , p. \\*([P\c
+.\}
+.if !\\*([I , \\*([I\c
+.if !\\*([C , \\*([C\c
+.if !\\*([D , \\*([D\c
+\&.
+.if !\\*([O \\*([O
+.FE
+..
+.de [2 \" book
+.FS
+.if !\\*([F .FP \\*([F
+.if !\\*([Q \\*([Q,
+.if !\\*([A \\*([A,
+.if !\\*([T \f2\\*([T,\f1
+.if !\\*([S \\*([S,
+.if !\\*([V \\*([V,
+.if !\\*([P \{\
+.ie \\n([P>0 pp. \\*([P,
+.el p. \\*([P,
+.\}
+\\*([I\c
+.if !\\*([C , \\*([C\c
+.if !\\*([D , \\*([D\c
+\&.
+.if !\\*([O \\*([O
+.FE
+..
+.de [4 \" report
+.FS
+.if !\\*([F .FP \\*([F
+.if !\\*([Q \\*([Q,
+.if !\\*([A \\*([A,
+.if !\\*([T \\*([o\\*([T,\\*([c
+.if !\\*([R \\*([R\c
+.if !\\*([G \& (\\*([G)\c
+.if !\\*([P \{\
+.ie \\n([P>0 , pp. \\*([P\c
+.el , p. \\*([P\c
+.\}
+.if !\\*([I , \\*([I\c
+.if !\\*([C , \\*([C\c
+.if !\\*([D , \\*([D\c
+\&.
+.if !\\*([O \\*([O
+.FE
+..
+.de [3 \" article in book
+.FS
+.if !\\*([F .FP \\*([F
+.if !\\*([Q \\*([Q,
+.if !\\*([A \\*([A,
+.if !\\*([T \\*([o\\*([T,\\*([c
+in \f2\\*([B\f1\c
+.if !\\*([E , ed. \\*([E\c
+.if !\\*([S , \\*([S\c
+.if !\\*([V , vol. \\*([V\c
+.if !\\*([P \{\
+.ie \\n([P>0 , pp. \\*([P\c
+.el , p. \\*([P\c
+.\}
+.if !\\*([I , \\*([I\c
+.if !\\*([C , \\*([C\c
+.if !\\*([D , \\*([D\c
+\&.
+.if !\\*([O \\*([O
+.FE
+..
+.de [<
+.]>
+..
+.de ]<
+.SH
+References
+.LP
+.de FP
+.\".IP \\\\$1.
+.RZ \\\\$1.
+\\..
+.rm FS FE
+..
+.de [>
+.]>
+..
+.de ]>
+.sp
+..
+.de [-
+.]-
+..
+.de ]-
+.rm [Q [A [T [J [B [E [S [V
+.rm [N [P [I [C [D [O [R [G
+..
+.de FG
+.ds QQ \fB\\$1\\fP: \\$2
+.ie \w\\*(QQ>\\n(.l \{\
+.in +0.25i
+.ti 0
+\\*(QQ
+.in 0
+.\}
+.el \{\
+.ce
+\\*(QQ
+.\}
+..
+.1C