-rw-r--r-- | share/man/man4/bpf.4   | 255
-rw-r--r-- | sys/conf/files         |   2
-rw-r--r-- | sys/net/bpf.c          | 422
-rw-r--r-- | sys/net/bpf.h          |  66
-rw-r--r-- | sys/net/bpf_buffer.c   | 210
-rw-r--r-- | sys/net/bpf_buffer.h   |  50
-rw-r--r-- | sys/net/bpf_zerocopy.c | 510
-rw-r--r-- | sys/net/bpf_zerocopy.h |  53
-rw-r--r-- | sys/net/bpfdesc.h      |  38
9 files changed, 1469 insertions, 137 deletions
diff --git a/share/man/man4/bpf.4 b/share/man/man4/bpf.4
index bb278586fbb1..9116b2dfa7b8 100644
--- a/share/man/man4/bpf.4
+++ b/share/man/man4/bpf.4
@@ -1,3 +1,30 @@
+.\" Copyright (c) 2007 Seccuris Inc.
+.\" All rights reserved.
+.\"
+.\" This software was developed by Robert N. M. Watson under contract to
+.\" Seccuris Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
 .\" Copyright (c) 1990 The Regents of the University of California.
 .\" All rights reserved.
 .\"
@@ -61,19 +88,6 @@ Whenever a packet is received by an interface,
 all file descriptors listening on that interface apply their filter.
 Each descriptor that accepts the packet receives its own copy.
 .Pp
-Reads from these files return the next group of packets
-that have matched the filter.
-To improve performance, the buffer passed to read must be
-the same size as the buffers used internally by
-.Nm .
-This size is returned by the
-.Dv BIOCGBLEN
-ioctl (see below), and
-can be set with
-.Dv BIOCSBLEN .
-Note that an individual packet larger than this size is necessarily
-truncated.
-.Pp
 The packet filter will support any link level protocol that
 has fixed length headers.
 Currently, only Ethernet,
@@ -94,6 +108,165 @@ The writes are unbuffered,
 meaning only one packet can be processed per write.
 Currently, only writes to Ethernets and
 .Tn SLIP
 links are supported.
+.Sh BUFFER MODES
+.Nm
+devices deliver packet data to the application via memory buffers provided by
+the application.
+The buffer mode is set using the
+.Dv BIOCSETBUFMODE
+ioctl, and read using the
+.Dv BIOCGETBUFMODE
+ioctl.
+.Ss Buffered read mode
+By default,
+.Nm
+devices operate in the
+.Dv BPF_BUFMODE_BUFFER
+mode, in which packet data is copied explicitly from kernel to user memory
+using the
+.Xr read 2
+system call.
+The user process will declare a fixed buffer size that will be used both for
+sizing internal buffers and for all
+.Xr read 2
+operations on the file.
+This size is queried using the
+.Dv BIOCGBLEN
+ioctl, and is set using the
+.Dv BIOCSBLEN
+ioctl.
+Note that an individual packet larger than the buffer size is necessarily
+truncated.
+.Ss Zero-copy buffer mode
+.Nm
+devices may also operate in the
+.Dv BPF_BUFMODE_ZBUF
+mode, in which packet data is written directly into two user memory buffers
+by the kernel, avoiding both system call and copying overhead.
+Buffers are of fixed (and equal) size, page-aligned, and an even multiple of
+the page size.
+The maximum zero-copy buffer size is returned by the
+.Dv BIOCGETZMAX
+ioctl.
+Note that an individual packet larger than the buffer size is necessarily
+truncated.
+.Pp
+The user process registers two memory buffers using the
+.Dv BIOCSETZBUF
+ioctl, which accepts a
+.Vt struct bpf_zbuf
+pointer as an argument:
+.Bd -literal
+struct bpf_zbuf {
+	void *bz_bufa;
+	void *bz_bufb;
+	size_t bz_buflen;
+};
+.Ed
+.Pp
+.Vt bz_bufa
+is a pointer to the userspace address of the first buffer that will be
+filled, and
+.Vt bz_bufb
+is a pointer to the second buffer.
+.Nm
+will then cycle between the two buffers as they fill and are acknowledged.
+.Pp
+Each buffer begins with a fixed-length header to hold synchronization and
+data length information for the buffer:
+.Bd -literal
+struct bpf_zbuf_header {
+	volatile u_int bzh_kernel_gen;	/* Kernel generation number. */
+	volatile u_int bzh_kernel_len;	/* Length of data in the buffer. */
+	volatile u_int bzh_user_gen;	/* User generation number. */
+	/* ...padding for future use... */
+};
+.Ed
+.Pp
+The header structure of each buffer, including all padding, should be zeroed
+before it is configured using
+.Dv BIOCSETZBUF .
+Remaining space in the buffer will be used by the kernel to store packet
+data, laid out in the same format as with buffered read mode.
+.Pp
+The kernel and the user process follow a simple acknowledgement protocol via
+the buffer header to synchronize access to the buffer: when the header
+generation numbers,
+.Vt bzh_kernel_gen
+and
+.Vt bzh_user_gen ,
+hold the same value, the kernel owns the buffer, and when they differ,
+userspace owns the buffer.
+.Pp
+While the kernel owns the buffer, the contents are unstable and may change
+asynchronously; while the user process owns the buffer, its contents are
+stable and will not be changed until the buffer has been acknowledged.
+.Pp
+Initializing the buffer headers to all 0's before registering the buffer has
+the effect of assigning initial ownership of both buffers to the kernel.
+The kernel signals that a buffer has been assigned to userspace by modifying
+.Vt bzh_kernel_gen ,
+and userspace acknowledges the buffer and returns it to the kernel by setting
+the value of
+.Vt bzh_user_gen
+to the value of
+.Vt bzh_kernel_gen .
+.Pp
+In order to avoid caching and memory re-ordering effects, the user process
+must use atomic operations and memory barriers when checking for and
+acknowledging buffers:
+.Bd -literal
+#include <machine/atomic.h>
+
+/*
+ * Return ownership of a buffer to the kernel for reuse.
+ */
+static void
+buffer_acknowledge(struct bpf_zbuf_header *bzh)
+{
+
+	atomic_store_rel_int(&bzh->bzh_user_gen, bzh->bzh_kernel_gen);
+}
+
+/*
+ * Check whether a buffer has been assigned to userspace by the kernel.
+ * Return true if userspace owns the buffer, and false otherwise.
+ */
+static int
+buffer_check(struct bpf_zbuf_header *bzh)
+{
+
+	return (bzh->bzh_user_gen !=
+	    atomic_load_acq_int(&bzh->bzh_kernel_gen));
+}
+.Ed
+.Pp
+The user process may force the assignment of the next buffer, if any data
+is pending, to userspace using the
+.Dv BIOCROTZBUF
+ioctl.
+This allows the user process to retrieve data in a partially filled buffer
+before the buffer is full, such as following a timeout; the process must
+recheck for buffer ownership using the header generation numbers, as the
+buffer will not be assigned to userspace if no data was present.
+.Pp
+As in the buffered read mode,
+.Xr kqueue 2 ,
+.Xr poll 2 ,
+and
+.Xr select 2
+may be used to sleep awaiting the availability of a completed buffer.
+They will return a readable file descriptor when ownership of the next buffer
+is assigned to user space.
+.Pp
+In the current implementation, the kernel will assign ownership of at most
+one buffer at a time to the user process.
+The user process must acknowledge the current buffer in order to be
+notified that the next buffer is ready for processing.
+Programs should not rely on this as an invariant, as it may change in future
+versions; in particular, they must maintain their own notion of which buffer
+is "next" so that if both buffers are owned by userspace, they can be
+processed in the correct order.
 .Sh IOCTLS
 The
 .Xr ioctl 2
@@ -127,7 +300,7 @@ file.
 The (third) argument to
 .Xr ioctl 2
 should be a pointer to the type indicated.
-.Bl -tag -width BIOCGRTIMEOUT
+.Bl -tag -width BIOCGETBUFMODE
 .It Dv BIOCGBLEN
 .Pq Li u_int
 Returns the required buffer length for reads on
@@ -349,10 +522,55 @@ descriptor.
 This prevents the execution of
 ioctl commands which could change the underlying operating parameters of
 the device.
+.It Dv BIOCGETBUFMODE
+.It Dv BIOCSETBUFMODE
+.Pq Li u_int
+Get or set the current
+.Nm
+buffering mode; possible values are
+.Dv BPF_BUFMODE_BUFFER ,
+buffered read mode, and
+.Dv BPF_BUFMODE_ZBUF ,
+zero-copy buffer mode.
+.It Dv BIOCSETZBUF
+.Pq Li struct bpf_zbuf
+Set the current zero-copy buffer locations; buffer locations may be
+set only once zero-copy buffer mode has been selected, and prior to attaching
+to an interface.
+Buffers must be of identical size, page-aligned, and an integer multiple of
+pages in size.
+The three fields
+.Vt bz_bufa ,
+.Vt bz_bufb ,
+and
+.Vt bz_buflen
+must be filled out.
+If buffers have already been set for this device, the ioctl will fail.
+.It Dv BIOCGETZMAX
+.Pq Li size_t
+Get the largest individual zero-copy buffer size allowed.
+As two buffers are used in zero-copy buffer mode, the limit (in practice) is
+twice the returned size.
+As zero-copy buffers consume kernel address space, conservative selection of
+buffer size is suggested, especially when there are multiple
+.Nm
+descriptors in use on 32-bit systems.
+.It Dv BIOCROTZBUF
+Force ownership of the next buffer to be assigned to userspace, if any data
+is present in the buffer.
+If no data is present, the buffer will remain owned by the kernel.
+This allows consumers of zero-copy buffering to implement timeouts and
+retrieve partially filled buffers.
+In order to handle the case where no data is present in the buffer and
+therefore ownership is not assigned, the user process must check
+.Vt bzh_kernel_gen
+against
+.Vt bzh_user_gen .
 .El
 .Sh BPF HEADER
 The following structure is prepended to each packet returned by
-.Xr read 2 :
+.Xr read 2
+or via a zero-copy buffer:
 .Bd -literal
 struct bpf_hdr {
 	struct timeval bh_tstamp;	/* time stamp */
@@ -718,6 +936,9 @@ struct bpf_insn insns[] = {
 .Sh SEE ALSO
 .Xr tcpdump 1 ,
 .Xr ioctl 2 ,
+.Xr kqueue 2 ,
+.Xr poll 2 ,
+.Xr select 2 ,
 .Xr byteorder 3 ,
 .Xr ng_bpf 4 ,
 .Xr bpf 9
@@ -750,6 +971,10 @@ of Lawrence Berkeley Laboratory, implemented BPF in
 Summer 1990.
 Much of the design is due to
 .An Van Jacobson .
+.Pp
+Support for zero-copy buffers was added by
+.An Robert N. M. Watson
+under contract to Seccuris Inc.
 .Sh BUGS
 The read buffer must be of a fixed size (returned by the
 .Dv BIOCGBLEN
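The setup sequence implied by the new man page sections above can be illustrated with a minimal userspace sketch; the helper name is hypothetical, error handling is abbreviated, and a fixed four-page buffer size is assumed rather than negotiated against BIOCGETZMAX:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <net/if.h>
#include <net/bpf.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/*
 * Open a BPF device in zero-copy mode and register two page-aligned
 * buffers.  Buffer mode and buffers must be configured before BIOCSETIF.
 */
static int
zbuf_open(const char *ifname, struct bpf_zbuf *zb)
{
	u_int bufmode = BPF_BUFMODE_ZBUF;
	struct ifreq ifr;
	size_t zmax;
	int fd;

	if ((fd = open("/dev/bpf0", O_RDWR)) < 0)
		return (-1);
	if (ioctl(fd, BIOCSETBUFMODE, &bufmode) < 0)
		return (-1);
	if (ioctl(fd, BIOCGETZMAX, &zmax) < 0)
		return (-1);
	/*
	 * mmap(2) returns page-aligned, zero-filled memory, so the headers
	 * start as all 0's and the kernel initially owns both buffers.
	 */
	zb->bz_buflen = 4 * getpagesize();	/* Assumed <= zmax. */
	zb->bz_bufa = mmap(NULL, zb->bz_buflen, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	zb->bz_bufb = mmap(NULL, zb->bz_buflen, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (zb->bz_bufa == MAP_FAILED || zb->bz_bufb == MAP_FAILED)
		return (-1);
	if (ioctl(fd, BIOCSETZBUF, zb) < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	if (ioctl(fd, BIOCSETIF, &ifr) < 0)
		return (-1);
	return (fd);
}

A consumer would then select(2) on the descriptor, test the shared header with buffer_check() from the man page example, process records laid out as in buffered read mode, release the buffer with buffer_acknowledge(), alternate between bz_bufa and bz_bufb, and use BIOCROTZBUF to reclaim a partially filled buffer on timeout.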
diff --git a/sys/conf/files b/sys/conf/files
index 2bd8a2368b2c..eac57fa58ee3 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1633,8 +1633,10 @@ libkern/strtoul.c	standard
 libkern/strtouq.c	standard
 libkern/strvalid.c	standard
 net/bpf.c	standard
+net/bpf_buffer.c	optional bpf
 net/bpf_jitter.c	optional bpf_jitter
 net/bpf_filter.c	optional bpf | netgraph_bpf
+net/bpf_zerocopy.c	optional bpf
 net/bridgestp.c	optional bridge | if_bridge
 net/bsd_comp.c	optional ppp_bsdcomp
 net/ieee8023ad_lacp.c	optional lagg
diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index 49961754d690..433cc7aa4649 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -66,9 +66,11 @@ __FBSDID("$FreeBSD$");
 #include <net/if.h>
 #include <net/bpf.h>
+#include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
+#include <net/bpf_zerocopy.h>
 #include <net/bpfdesc.h>
 
 #include <netinet/in.h>
@@ -80,7 +82,7 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
-static MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
+MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
@@ -98,19 +100,17 @@ static LIST_HEAD(, bpf_if)	bpf_iflist;
 static struct mtx	bpf_mtx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
-static void	bpf_allocbufs(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_freed(struct bpf_d *);
-static void	bpf_mcopy(const void *, void *, size_t);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_insn *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
-static void	catchpacket(struct bpf_d *, u_char *, u_int,
-		    u_int, void (*)(const void *, void *, size_t),
+static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
+		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
 		    struct timeval *);
 static void	reset_d(struct bpf_d *);
 static int	 bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
@@ -123,15 +123,12 @@ static void	bpf_clone(void *, struct ucred *, char *, int,
 		    struct cdev **);
 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
-static int bpf_bufsize = 4096;
-SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
-    &bpf_bufsize, 0, "Default bpf buffer size");
-static int bpf_maxbufsize = BPF_MAXBUFSIZE;
-SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
-    &bpf_maxbufsize, 0, "Maximum bpf buffer size");
 static int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
+static int bpf_zerocopy_enable = 0;
+SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
    &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
 SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW,
     bpf_stats_sysctl, "bpf statistics portal");
 
@@ -158,6 +155,146 @@ static struct cdevsw bpf_cdevsw = {
 static struct filterops bpfread_filtops =
 	{ 1, NULL, filt_bpfdetach, filt_bpfread };
 
+/*
+ * Wrapper functions for various buffering methods.  If the set of buffer
+ * modes expands, we will probably want to introduce a switch data structure
+ * similar to protosw, etc.
+ */ +static void +bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_bytes(d, buf, offset, src, len)); + + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); + + default: + panic("bpf_buf_append_bytes"); + } +} + +static void +bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); + + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); + + default: + panic("bpf_buf_append_mbuf"); + } +} + +/* + * If the buffer mechanism has a way to decide that a held buffer can be made + * free, then it is exposed via the bpf_canfreebuf() interface. (1) is + * returned if the buffer can be discarded, (0) is returned if it cannot. + */ +static int +bpf_canfreebuf(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_canfreebuf(d)); + } + return (0); +} + +void +bpf_bufheld(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + bpf_zerocopy_bufheld(d); + break; + } +} + +static void +bpf_free(struct bpf_d *d) +{ + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_free(d)); + + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_free(d)); + + default: + panic("bpf_buf_free"); + } +} + +static int +bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_uiomove(d, buf, len, uio)); +} + +static int +bpf_ioctl_sblen(struct bpf_d *d, u_int *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_ioctl_sblen(d, i)); +} + +static int +bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_getzmax(td, d, i)); +} + +static int +bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); +} + +static int +bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); +} + +/* + * General BPF functions. + */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) @@ -412,7 +549,14 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) "bpf%d", dev2unit(dev)); MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO); dev->si_drv1 = d; - d->bd_bufsize = bpf_bufsize; + + /* + * For historical reasons, perform a one-time initialization call to + * the buffer routines, even though we're not yet committed to a + * particular buffer method. + */ + bpf_buffer_init(d); + d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_pid = td->td_proc->p_pid; @@ -459,18 +603,6 @@ bpfclose(struct cdev *dev, int flags, int fmt, struct thread *td) return (0); } - -/* - * Rotate the packet buffers in descriptor d. 
Move the store buffer - * into the hold slot, and the free buffer into the store slot. - * Zero the length of the new store buffer. - */ -#define ROTATE_BUFFERS(d) \ - (d)->bd_hbuf = (d)->bd_sbuf; \ - (d)->bd_hlen = (d)->bd_slen; \ - (d)->bd_sbuf = (d)->bd_fbuf; \ - (d)->bd_slen = 0; \ - (d)->bd_fbuf = NULL; /* * bpfread - read next chunk of packets from buffers */ @@ -490,6 +622,10 @@ bpfread(struct cdev *dev, struct uio *uio, int ioflag) BPFD_LOCK(d); d->bd_pid = curthread->td_proc->p_pid; + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { + BPFD_UNLOCK(d); + return (EOPNOTSUPP); + } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); @@ -567,7 +703,7 @@ bpfread(struct cdev *dev, struct uio *uio, int ioflag) * issues a read on the same fd at the same time? Don't want this * getting invalidated. */ - error = uiomove(d->bd_hbuf, d->bd_hlen, uio); + error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); d->bd_fbuf = d->bd_hbuf; @@ -613,6 +749,20 @@ bpf_timed_out(void *arg) } static int +bpf_ready(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + if (!bpf_canfreebuf(d) && d->bd_hlen != 0) + return (1); + if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && + d->bd_slen != 0) + return (1); + return (0); +} + +static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d = dev->si_drv1; @@ -622,25 +772,34 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) int error, hlen; d->bd_pid = curthread->td_proc->p_pid; - if (d->bd_bif == NULL) + d->bd_wcount++; + if (d->bd_bif == NULL) { + d->bd_wdcount++; return (ENXIO); + } ifp = d->bd_bif->bif_ifp; - if ((ifp->if_flags & IFF_UP) == 0) + if ((ifp->if_flags & IFF_UP) == 0) { + d->bd_wdcount++; return (ENETDOWN); + } - if (uio->uio_resid == 0) + if (uio->uio_resid == 0) { + d->bd_wdcount++; return (0); + } bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, &m, &dst, &hlen, d->bd_wfilter); - if (error) + if (error) { + d->bd_wdcount++; return (error); - + } + d->bd_wfcount++; if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; @@ -667,6 +826,8 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) #endif error = (*ifp->if_output)(ifp, m, &dst, NULL); + if (error) + d->bd_wdcount++; if (mc != NULL) { if (error == 0) @@ -697,6 +858,10 @@ reset_d(struct bpf_d *d) d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; + d->bd_wcount = 0; + d->bd_wfcount = 0; + d->bd_wdcount = 0; + d->bd_zcopy = 0; } /* @@ -721,6 +886,11 @@ reset_d(struct bpf_d *d) * BIOCSDIRECTION Set packet direction flag * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. + * BIOCSETZBUF Set current zero-copy buffer locations. + * BIOCGETZMAX Get maximum zero-copy buffer size. + * BIOCROTZBUF Force rotation of zero-copy buffer + * BIOCSETBUFMODE Set buffer mode. + * BIOCGETBUFMODE Get current buffer mode. */ /* ARGSUSED */ static int @@ -758,6 +928,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, case BIOCSRTIMEOUT: case BIOCIMMEDIATE: case TIOCGPGRP: + case BIOCROTZBUF: break; default: return (EPERM); @@ -810,17 +981,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, * Set buffer length. 
 	 */
 	case BIOCSBLEN:
-		if (d->bd_bif != NULL)
-			error = EINVAL;
-		else {
-			u_int size = *(u_int *)addr;
-
-			if (size > bpf_maxbufsize)
-				*(u_int *)addr = size = bpf_maxbufsize;
-			else if (size < BPF_MINBUFSIZE)
-				*(u_int *)addr = size = BPF_MINBUFSIZE;
-			d->bd_bufsize = size;
-		}
+		error = bpf_ioctl_sblen(d, (u_int *)addr);
 		break;
 
 	/*
@@ -945,6 +1106,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
 		{
 			struct bpf_stat *bs = (struct bpf_stat *)addr;
 
+			/* XXXCSJP overflow */
 			bs->bs_recv = d->bd_rcount;
 			bs->bs_drop = d->bd_dcount;
 			break;
@@ -1055,6 +1217,50 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
 	case BIOCGRSIG:
 		*(u_int *)addr = d->bd_sig;
 		break;
+
+	case BIOCGETBUFMODE:
+		*(u_int *)addr = d->bd_bufmode;
+		break;
+
+	case BIOCSETBUFMODE:
+		/*
+		 * Allow the buffering mode to be changed as long as we
+		 * haven't yet committed to a particular mode.  Our
+		 * definition of commitment, for now, is whether or not a
+		 * buffer has been allocated or an interface attached, since
+		 * that's the point where things get tricky.
+		 */
+		switch (*(u_int *)addr) {
+		case BPF_BUFMODE_BUFFER:
+			break;
+
+		case BPF_BUFMODE_ZBUF:
+			if (bpf_zerocopy_enable)
+				break;
+			/* FALLTHROUGH */
+
+		default:
+			return (EINVAL);
+		}
+
+		BPFD_LOCK(d);
+		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
+		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
+			BPFD_UNLOCK(d);
+			return (EBUSY);
+		}
+		d->bd_bufmode = *(u_int *)addr;
+		BPFD_UNLOCK(d);
+		break;
+
+	case BIOCGETZMAX:
+		return (bpf_ioctl_getzmax(td, d, (size_t *)addr));
+
+	case BIOCSETZBUF:
+		return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCROTZBUF:
+		return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr));
 	}
 	return (error);
 }
@@ -1155,13 +1361,31 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 		return (ENXIO);
 
 	bp = theywant->if_bpf;
+
 	/*
-	 * Allocate the packet buffers if we need to.
-	 * If we're already attached to requested interface,
-	 * just flush the buffer.
+	 * Behavior here depends on the buffering model.  If we're using
+	 * kernel memory buffers, then we can allocate them here.  If we're
+	 * using zero-copy, then the user process must have registered
+	 * buffers by the time we get here.  If not, return an error.
+	 *
+	 * XXXRW: There are locking issues here with multi-threaded use: what
+	 * if two threads try to set the interface at once?
 	 */
-	if (d->bd_sbuf == NULL)
-		bpf_allocbufs(d);
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		if (d->bd_sbuf == NULL)
+			bpf_buffer_alloc(d);
+		KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
+		break;
+
+	case BPF_BUFMODE_ZBUF:
+		if (d->bd_sbuf == NULL)
+			return (EINVAL);
+		break;
+
+	default:
+		panic("bpf_setif: bufmode %d", d->bd_bufmode);
+	}
 	if (bp != d->bd_bif) {
 		if (d->bd_bif)
 			/*
@@ -1305,37 +1529,14 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 #ifdef MAC
 		if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
-			catchpacket(d, pkt, pktlen, slen, bcopy, &tv);
+			catchpacket(d, pkt, pktlen, slen,
+			    bpf_append_bytes, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
 	BPFIF_UNLOCK(bp);
 }
 
-/*
- * Copy data from an mbuf chain into a buffer.  This code is derived
- * from m_copydata in sys/uipc_mbuf.c.
- */
-static void
-bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
-{
-	const struct mbuf *m;
-	u_int count;
-	u_char *dst;
-
-	m = src_arg;
-	dst = dst_arg;
-	while (len > 0) {
-		if (m == NULL)
-			panic("bpf_mcopy");
-		count = min(m->m_len, len);
-		bcopy(mtod(m, void *), dst, count);
-		m = m->m_next;
-		dst += count;
-		len -= count;
-	}
-}
-
 #define	BPF_CHECK_DIRECTION(d, m) \
 	if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \
 	    ((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL))
@@ -1385,7 +1586,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1440,7 +1641,7 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)&mb, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1453,19 +1654,34 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 /*
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
- * transfer.  bcopy is passed in to copy contiguous chunks, while
- * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
+ * transfer.  bpf_append_bytes is passed in to copy contiguous chunks, while
+ * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
  * pkt is really an mbuf.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
-    void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
+    void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
+    struct timeval *tv)
 {
-	struct bpf_hdr *hp;
+	struct bpf_hdr hdr;
 	int totlen, curlen;
 	int hdrlen = d->bd_bif->bif_hdrlen;
 	int do_wakeup = 0;
 
 	BPFD_LOCK_ASSERT(d);
+
+	/*
+	 * Detect whether user space has released a buffer back to us, and if
+	 * so, move it from being a hold buffer to a free buffer.  This may
+	 * not be the best place to do it (for example, we might only want to
+	 * run this check if we need the space), but for now it's a reliable
+	 * spot to do it.
+	 */
+	if (bpf_canfreebuf(d)) {
+		d->bd_fbuf = d->bd_hbuf;
+		d->bd_hbuf = NULL;
+		d->bd_hlen = 0;
+	}
+
 	/*
 	 * Figure out how many bytes to move.  If the packet is
 	 * greater or equal to the snapshot length, transfer that
@@ -1500,23 +1716,27 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		/*
-		 * Immediate mode is set, or the read timeout has
-		 * already expired during a select call.  A packet
-		 * arrived, so the reader should be woken up.
+		 * Immediate mode is set, or the read timeout has already
+		 * expired during a select call.  A packet arrived, so the
+		 * reader should be woken up.
 		 */
 		do_wakeup = 1;
 
 	/*
-	 * Append the bpf header.
+	 * Append the bpf header.  Note we append the actual header size, but
+	 * move forward the length of the header plus padding.
 	 */
-	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
-	hp->bh_tstamp = *tv;
-	hp->bh_datalen = pktlen;
-	hp->bh_hdrlen = hdrlen;
+	bzero(&hdr, sizeof(hdr));
+	hdr.bh_tstamp = *tv;
+	hdr.bh_datalen = pktlen;
+	hdr.bh_hdrlen = hdrlen;
+	hdr.bh_caplen = totlen - hdrlen;
+	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
+
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
-	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
+	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
 	d->bd_slen = curlen + totlen;
 
 	if (do_wakeup)
@@ -1524,41 +1744,19 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
 }
 
 /*
- * Initialize all nonzero fields of a descriptor.
- */
-static void
-bpf_allocbufs(struct bpf_d *d)
-{
-
-	KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL"));
-	KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL"));
-	KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL"));
-
-	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_slen = 0;
-	d->bd_hlen = 0;
-}
-
-/*
  * Free buffers currently in use by a descriptor.
  * Called on close.
  */
 static void
 bpf_freed(struct bpf_d *d)
 {
+
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it yet hasn't been marked
 	 * free.
 	 */
-	if (d->bd_sbuf != NULL) {
-		free(d->bd_sbuf, M_BPF);
-		if (d->bd_hbuf != NULL)
-			free(d->bd_hbuf, M_BPF);
-		if (d->bd_fbuf != NULL)
-			free(d->bd_fbuf, M_BPF);
-	}
+	bpf_free(d);
 	if (d->bd_rfilter) {
 		free((caddr_t)d->bd_rfilter, M_BPF);
#ifdef BPF_JITTER
@@ -1762,6 +1960,7 @@ bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
 
 	bzero(d, sizeof(*d));
 	BPFD_LOCK_ASSERT(bd);
+	d->bd_structsize = sizeof(*d);
 	d->bd_immediate = bd->bd_immediate;
 	d->bd_promisc = bd->bd_promisc;
 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
@@ -1779,6 +1978,11 @@ bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
 	strlcpy(d->bd_ifname,
 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
 	d->bd_locked = bd->bd_locked;
+	d->bd_wcount = bd->bd_wcount;
+	d->bd_wdcount = bd->bd_wdcount;
+	d->bd_wfcount = bd->bd_wfcount;
+	d->bd_zcopy = bd->bd_zcopy;
+	d->bd_bufmode = bd->bd_bufmode;
 }
 
 static int
diff --git a/sys/net/bpf.h b/sys/net/bpf.h
index 91ea0f6827d2..1d6f9db7415d 100644
--- a/sys/net/bpf.h
+++ b/sys/net/bpf.h
@@ -92,6 +92,27 @@ struct bpf_version {
 #define BPF_MAJOR_VERSION 1
 #define BPF_MINOR_VERSION 1
 
+/*
+ * Historically, BPF has supported a single buffering model, first using mbuf
+ * clusters in kernel, and later using malloc(9) buffers in kernel.  We now
+ * support multiple buffering modes, which may be queried and set using
+ * BIOCGETBUFMODE and BIOCSETBUFMODE.  So as to avoid handling the complexity
+ * of changing modes while sniffing packets, the mode becomes fixed once an
+ * interface has been attached to the BPF descriptor.
+ */
+#define	BPF_BUFMODE_BUFFER	1	/* Kernel buffers with read(). */
+#define	BPF_BUFMODE_ZBUF	2	/* Zero-copy buffers. */
+
+/*-
+ * Struct used by BIOCSETZBUF, BIOCROTZBUF: describes up to two zero-copy
+ * buffers as used by BPF.
+ */
+struct bpf_zbuf {
+	void *bz_bufa;		/* Location of 'a' zero-copy buffer. */
+	void *bz_bufb;		/* Location of 'b' zero-copy buffer. */
+	size_t bz_buflen;	/* Size of zero-copy buffers. */
+};
+
 #define	BIOCGBLEN	_IOR('B',102, u_int)
 #define	BIOCSBLEN	_IOWR('B',102, u_int)
 #define	BIOCSETF	_IOW('B',103, struct bpf_program)
@@ -116,6 +137,11 @@ struct bpf_version {
 #define	BIOCLOCK	_IO('B', 122)
 #define	BIOCSETWF	_IOW('B',123, struct bpf_program)
 #define	BIOCFEEDBACK	_IOW('B',124, u_int)
+#define	BIOCGETBUFMODE	_IOR('B',125, u_int)
+#define	BIOCSETBUFMODE	_IOW('B',126, u_int)
+#define	BIOCGETZMAX	_IOR('B',127, size_t)
+#define	BIOCROTZBUF	_IOR('B',128, struct bpf_zbuf)
+#define	BIOCSETZBUF	_IOW('B',129, struct bpf_zbuf)
 
 /* Obsolete */
 #define	BIOCGSEESENT	BIOCGDIRECTION
@@ -149,6 +175,24 @@ struct bpf_hdr {
 #endif
 
 /*
+ * When using zero-copy BPF buffers, a shared memory header is present
+ * allowing the kernel BPF implementation and user process to synchronize
+ * without using system calls.  This structure defines that header.  When
+ * accessing these fields, appropriate atomic operation and memory barriers
+ * are required in order not to see stale or out-of-order data; see bpf(4)
+ * for reference code to access these fields from userspace.
+ *
+ * The layout of this structure is critical, and must not be changed; it must
+ * fit in a single page on all architectures.
+ */
+struct bpf_zbuf_header {
+	volatile u_int	bzh_kernel_gen;	/* Kernel generation number. */
+	volatile u_int	bzh_kernel_len;	/* Length of data in the buffer. */
+	volatile u_int	bzh_user_gen;	/* User generation number. */
+	u_int _bzh_pad[5];
+};
+
+/*
  * Data-link level type codes.
  */
 #define	DLT_NULL	0	/* BSD loopback encapsulation */
@@ -761,6 +805,27 @@ struct bpf_dltlist {
 };
 
 #ifdef _KERNEL
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_BPF);
+#endif
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_bpf);
+#endif
+
+/*
+ * Rotate the packet buffers in descriptor d.  Move the store buffer into the
+ * hold slot, and the free buffer into the store slot.  Zero the length of the
+ * new store buffer.  Descriptor lock should be held.
+ */
+#define	ROTATE_BUFFERS(d)	do {					\
+	(d)->bd_hbuf = (d)->bd_sbuf;					\
+	(d)->bd_hlen = (d)->bd_slen;					\
+	(d)->bd_sbuf = (d)->bd_fbuf;					\
+	(d)->bd_slen = 0;						\
+	(d)->bd_fbuf = NULL;						\
+	bpf_bufheld(d);							\
+} while (0)
+
 /*
  * Descriptor associated with each attached hardware interface.
  */
@@ -773,6 +838,7 @@ struct bpf_if {
 	struct mtx	bif_mtx;	/* mutex for interface */
 };
 
+void	 bpf_bufheld(struct bpf_d *d);
 int	 bpf_validate(const struct bpf_insn *, int);
 void	 bpf_tap(struct bpf_if *, u_char *, u_int);
 void	 bpf_mtap(struct bpf_if *, struct mbuf *);
diff --git a/sys/net/bpf_buffer.c b/sys/net/bpf_buffer.c
new file mode 100644
index 000000000000..f07e9486cbd6
--- /dev/null
+++ b/sys/net/bpf_buffer.c
@@ -0,0 +1,210 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (c) 1990, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpf_buffer.h>
+#include <net/bpfdesc.h>
+
+/*
+ * Implement historical kernel memory buffering model for BPF: two malloc(9)
+ * kernel buffers are hung off of the descriptor.  The size is fixed prior to
+ * attaching to an ifnet, and cannot be changed after that.  read(2) simply
+ * copies the data to user space using uiomove(9).
+ */
+
+static int bpf_bufsize = 4096;
+SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
+    &bpf_bufsize, 0, "Default bpf buffer size");
+static int bpf_maxbufsize = BPF_MAXBUFSIZE;
+SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
+    &bpf_maxbufsize, 0, "Maximum bpf buffer size");
+
+void
+bpf_buffer_alloc(struct bpf_d *d)
+{
+
+	KASSERT(d->bd_fbuf == NULL, ("bpf_buffer_alloc: bd_fbuf != NULL"));
+	KASSERT(d->bd_sbuf == NULL, ("bpf_buffer_alloc: bd_sbuf != NULL"));
+	KASSERT(d->bd_hbuf == NULL, ("bpf_buffer_alloc: bd_hbuf != NULL"));
+
+	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+	d->bd_hbuf = NULL;
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+}
+
+/*
+ * Simple data copy to the current kernel buffer.
+ */
+void
+bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_char *src_bytes;
+
+	src_bytes = (u_char *)src;
+	bcopy(src_bytes, buf + offset, len);
+}
+
+/*
+ * Scatter-gather data copy from an mbuf chain to the current kernel buffer.
+ */
+void
+bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
+{
+	const struct mbuf *m;
+	u_char *dst;
+	u_int count;
+
+	m = (struct mbuf *)src;
+	dst = (u_char *)buf + offset;
+	while (len > 0) {
+		if (m == NULL)
+			panic("bpf_buffer_append_mbuf");
+		count = min(m->m_len, len);
+		bcopy(mtod(m, void *), dst, count);
+		m = m->m_next;
+		dst += count;
+		len -= count;
+	}
+}
+
+/*
+ * Free BPF kernel buffers on device close.
+ */
+void
+bpf_buffer_free(struct bpf_d *d)
+{
+
+	if (d->bd_sbuf != NULL)
+		free(d->bd_sbuf, M_BPF);
+	if (d->bd_hbuf != NULL)
+		free(d->bd_hbuf, M_BPF);
+	if (d->bd_fbuf != NULL)
+		free(d->bd_fbuf, M_BPF);
+
+#ifdef INVARIANTS
+	d->bd_sbuf = d->bd_hbuf = d->bd_fbuf = (caddr_t)~0;
+#endif
+}
+
+/*
+ * This is a historical initialization that occurs when the BPF descriptor is
+ * first opened.  It does not imply selection of a buffer mode, so we don't
+ * allocate buffers here.
+ */
+void
+bpf_buffer_init(struct bpf_d *d)
+{
+
+	d->bd_bufsize = bpf_bufsize;
+}
+
+/*
+ * Allocate or resize buffers.
+ */
+int
+bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i)
+{
+	u_int size;
+
+	BPFD_LOCK(d);
+	if (d->bd_bif != NULL) {
+		BPFD_UNLOCK(d);
+		return (EINVAL);
+	}
+	size = *i;
+	if (size > bpf_maxbufsize)
+		*i = size = bpf_maxbufsize;
+	else if (size < BPF_MINBUFSIZE)
+		*i = size = BPF_MINBUFSIZE;
+	d->bd_bufsize = size;
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Copy buffer storage to user space in read().
+ */
+int
+bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+	return (uiomove(buf, len, uio));
+}
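For contrast with the zero-copy path, the userspace side of the buffered read mode implemented above can be sketched as follows; the handler callback is hypothetical and error handling is abbreviated. BIOCSBLEN must be issued before the descriptor is attached to an interface (bpf_buffer_ioctl_sblen() above returns EINVAL otherwise), and the possibly clamped size must be reused for read(2):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <stdlib.h>
#include <unistd.h>

static void
buffered_read_loop(int fd, void (*handler)(const struct bpf_hdr *))
{
	u_int blen = 32768;	/* May be clamped by net.bpf.maxbufsize. */
	ssize_t n;
	char *buf, *p;

	if (ioctl(fd, BIOCSBLEN, &blen) < 0)
		return;
	if ((buf = malloc(blen)) == NULL)
		return;
	/*
	 * Each read(2) returns a group of packets, each preceded by a
	 * bpf_hdr and padded so that records are word-aligned.
	 */
	while ((n = read(fd, buf, blen)) > 0) {
		for (p = buf; p < buf + n;) {
			const struct bpf_hdr *bh = (const void *)p;

			handler(bh);
			p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
		}
	}
	free(buf);
}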
diff --git a/sys/net/bpf_buffer.h b/sys/net/bpf_buffer.h
new file mode 100644
index 000000000000..82d0310b4d44
--- /dev/null
+++ b/sys/net/bpf_buffer.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_BUFFER_H_
+#define	_NET_BPF_BUFFER_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void	bpf_buffer_alloc(struct bpf_d *d);
+void	bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_buffer_free(struct bpf_d *d);
+void	bpf_buffer_init(struct bpf_d *d);
+int	bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i);
+int	bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+	    struct uio *uio);
+
+#endif /* !_NET_BPF_BUFFER_H_ */
diff --git a/sys/net/bpf_zerocopy.c b/sys/net/bpf_zerocopy.c
new file mode 100644
index 000000000000..896ad1da29f4
--- /dev/null
+++ b/sys/net/bpf_zerocopy.c
@@ -0,0 +1,510 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sf_buf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#include <machine/atomic.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpfdesc.h>
+#include <net/bpf_zerocopy.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
+/*
+ * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
+ * are mapped into the kernel address space using sf_bufs and used directly
+ * by BPF.  Memory is wired since page faults cannot be tolerated in the
+ * contexts where the buffers are copied to (locks held, interrupt context,
+ * etc).  Access to shared memory buffers is synchronized using a header on
+ * each buffer, allowing the number of system calls to go to zero as BPF
+ * reaches saturation (buffers filled as fast as they can be drained by the
+ * user process).  Full details of the protocol for communicating between the
+ * user process and BPF may be found in bpf(4).
+ */
+
+/*
+ * Maximum number of pages per buffer.  Since all BPF devices use two, the
+ * maximum per device is 2*BPF_MAX_PAGES.  Resource limits on the number of
+ * sf_bufs may be an issue, so do not set this too high.  On older systems,
+ * kernel address space limits may also be an issue.
+ */
+#define	BPF_MAX_PAGES	512
+
+/*
+ * struct zbuf describes a memory buffer loaned by a user process to the
+ * kernel.  We represent this as a series of pages managed using an array of
+ * sf_bufs.  Even though the memory is contiguous in user space, it may not
+ * be mapped contiguously in the kernel (i.e., a set of physically
+ * non-contiguous pages in the direct map region) so we must implement
+ * scatter-gather copying.  One significant mitigating factor is that on
+ * systems with a direct memory map, we can avoid TLB misses.
+ *
+ * At the front of the shared memory region is a bpf_zbuf_header, which
+ * contains shared control data to allow user space and the kernel to
+ * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF
+ * knows that the space is not available.
+ */
+struct zbuf {
+	vm_offset_t	 zb_uaddr;	/* User address, may be stale. */
+	size_t		 zb_size;	/* Size of buffer, incl. header. */
+	u_int		 zb_numpages;	/* Number of pages. */
+	struct sf_buf	**zb_pages;	/* Pages themselves. */
+	struct bpf_zbuf_header	*zb_header;	/* Shared header. */
+};
+
+/*
+ * Release a page we've previously wired.
+ */
+static void
+zbuf_page_free(vm_page_t pp)
+{
+
+	vm_page_lock_queues();
+	vm_page_unwire(pp, 0);
+	if (pp->wire_count == 0 && pp->object == NULL)
+		vm_page_free(pp);
+	vm_page_unlock_queues();
+}
+
+/*
+ * Free an sf_buf with attached page.
+ */
+static void
+zbuf_sfbuf_free(struct sf_buf *sf)
+{
+	vm_page_t pp;
+
+	pp = sf_buf_page(sf);
+	sf_buf_free(sf);
+	zbuf_page_free(pp);
+}
+
+/*
+ * Free a zbuf, including its page array, sbufs, and pages.  Allow partially
+ * allocated zbufs to be freed so that it may be used even during a zbuf
+ * setup.
+ */
+static void
+zbuf_free(struct zbuf *zb)
+{
+	int i;
+
+	for (i = 0; i < zb->zb_numpages; i++) {
+		if (zb->zb_pages[i] != NULL)
+			zbuf_sfbuf_free(zb->zb_pages[i]);
+	}
+	free(zb->zb_pages, M_BPF);
+	free(zb, M_BPF);
+}
+
+/*
+ * Given a user pointer to a page of user memory, return an sf_buf for the
+ * page.  Because we may be requesting quite a few sf_bufs, prefer failure to
+ * deadlock and use SFB_NOWAIT.
+ */
+static struct sf_buf *
+zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
+{
+	struct sf_buf *sf;
+	vm_page_t pp;
+
+	if (vm_fault_quick((caddr_t) uaddr, VM_PROT_READ | VM_PROT_WRITE) <
+	    0)
+		return (NULL);
+	pp = pmap_extract_and_hold(map->pmap, uaddr, VM_PROT_READ |
+	    VM_PROT_WRITE);
+	if (pp == NULL)
+		return (NULL);
+	vm_page_lock_queues();
+	vm_page_wire(pp);
+	vm_page_unhold(pp);
+	vm_page_unlock_queues();
+	sf = sf_buf_alloc(pp, SFB_NOWAIT);
+	if (sf == NULL) {
+		zbuf_page_free(pp);
+		return (NULL);
+	}
+	return (sf);
+}
+
+/*
+ * Create a zbuf describing a range of user address space memory.  Validate
+ * page alignment, size requirements, etc.
+ */
+static int
+zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
+    struct zbuf **zbp)
+{
+	struct zbuf *zb;
+	struct vm_map *map;
+	int error, i;
+
+	*zbp = NULL;
+
+	/*
+	 * User address must be page-aligned.
+	 */
+	if (uaddr & PAGE_MASK)
+		return (EINVAL);
+
+	/*
+	 * Length must be an integer number of full pages.
+	 */
+	if (len & PAGE_MASK)
+		return (EINVAL);
+
+	/*
+	 * Length must not exceed per-buffer resource limit.
+	 */
+	if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
+		return (EINVAL);
+
+	/*
+	 * Allocate the buffer and set up each page with its own sf_buf.
+	 */
+	error = 0;
+	zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
+	zb->zb_uaddr = uaddr;
+	zb->zb_size = len;
+	zb->zb_numpages = len / PAGE_SIZE;
+	zb->zb_pages = malloc(sizeof(struct sf_buf *) *
+	    zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
+	map = &td->td_proc->p_vmspace->vm_map;
+	for (i = 0; i < zb->zb_numpages; i++) {
+		zb->zb_pages[i] = zbuf_sfbuf_get(map,
+		    uaddr + (i * PAGE_SIZE));
+		if (zb->zb_pages[i] == NULL) {
+			error = EFAULT;
+			goto error;
+		}
+	}
+	zb->zb_header =
+	    (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
+	bzero(zb->zb_header, sizeof(*zb->zb_header));
+	*zbp = zb;
+	return (0);
+
+error:
+	zbuf_free(zb);
+	return (error);
+}
+
+/*
+ * Copy bytes from a source into the specified zbuf.  The caller is
+ * responsible for performing bounds checking, etc.
+ */
+void
+bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_int count, page, poffset;
+	u_char *src_bytes;
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_append_bytes: not in zbuf mode"));
+	KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));
+
+	src_bytes = (u_char *)src;
+	zb = (struct zbuf *)buf;
+
+	/*
+	 * Scatter-gather copy to user pages mapped into kernel address space
+	 * using sf_bufs: copy up to a page at a time.
+	 */
+	offset += sizeof(struct bpf_zbuf_header);
+	page = offset / PAGE_SIZE;
+	poffset = offset % PAGE_SIZE;
+	while (len > 0) {
+		KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
+		    " page overflow (%d p %d np)\n", page, zb->zb_numpages));
+
+		count = min(len, PAGE_SIZE - poffset);
+		bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
+		    poffset, count);
+		poffset += count;
+		if (poffset == PAGE_SIZE) {
+			poffset = 0;
+			page++;
+		}
+		KASSERT(poffset < PAGE_SIZE,
+		    ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
+		    poffset));
+		len -= count;
+		src_bytes += count;
+	}
+}
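A worked example of the page-chunking arithmetic above, as a standalone sketch (assumes 4 KB pages and the 32-byte bpf_zbuf_header defined in bpf.h): an append at buffer offset 4000 becomes post-header offset 4032, so 64 bytes land at the end of page 0 and the remaining 36 bytes spill into page 1.

#include <stdio.h>

#define	PAGE_SIZE	4096
#define	ZBUF_HDRLEN	32	/* sizeof(struct bpf_zbuf_header) */

int
main(void)
{
	unsigned int offset = 4000 + ZBUF_HDRLEN, len = 100;
	unsigned int count, page, poffset;

	page = offset / PAGE_SIZE;
	poffset = offset % PAGE_SIZE;
	while (len > 0) {
		/* Copy at most to the end of the current page. */
		count = len < PAGE_SIZE - poffset ? len : PAGE_SIZE - poffset;
		printf("copy %u bytes into page %u at offset %u\n",
		    count, page, poffset);
		poffset += count;
		if (poffset == PAGE_SIZE) {
			poffset = 0;
			page++;
		}
		len -= count;
	}
	return (0);
}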
+/*
+ * Copy bytes from an mbuf chain to the specified zbuf: copying will be
+ * scatter-gather both from mbufs, which may be fragmented over memory, and
+ * to pages, which may not be contiguously mapped in kernel address space.
+ * As with bpf_zerocopy_append_bytes(), the caller is responsible for
+ * checking that this will not exceed the buffer limit.
+ */
+void
+bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_int count, moffset, page, poffset;
+	const struct mbuf *m;
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_append_mbuf not in zbuf mode"));
+	KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));
+
+	m = (struct mbuf *)src;
+	zb = (struct zbuf *)buf;
+
+	/*
+	 * Scatter-gather both from an mbuf chain and to a user page set
+	 * mapped into kernel address space using sf_bufs.  If we're lucky,
+	 * each mbuf requires one copy operation, but if page alignment and
+	 * mbuf alignment work out less well, we'll be doing two copies per
+	 * mbuf.
+	 */
+	offset += sizeof(struct bpf_zbuf_header);
+	page = offset / PAGE_SIZE;
+	poffset = offset % PAGE_SIZE;
+	moffset = 0;
+	while (len > 0) {
+		KASSERT(page < zb->zb_numpages,
+		    ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
+		    "np)\n", page, zb->zb_numpages));
+		KASSERT(m != NULL,
+		    ("bpf_zerocopy_append_mbuf: end of mbuf chain"));
+
+		count = min(m->m_len - moffset, len);
+		count = min(count, PAGE_SIZE - poffset);
+		bcopy(mtod(m, u_char *) + moffset,
+		    ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
+		    count);
+		poffset += count;
+		if (poffset == PAGE_SIZE) {
+			poffset = 0;
+			page++;
+		}
+		KASSERT(poffset < PAGE_SIZE,
+		    ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
+		    poffset));
+		moffset += count;
+		if (moffset == m->m_len) {
+			m = m->m_next;
+			moffset = 0;
+		}
+		len -= count;
+	}
+}
+
+/*
+ * Notification from the BPF framework that a buffer has moved into the held
+ * slot on a descriptor.  Zero-copy BPF will update the shared page to let
+ * the user process know.
+ */
+void
+bpf_zerocopy_bufheld(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_bufheld: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_hbuf;
+	KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
+	zb->zb_header->bzh_kernel_len = d->bd_hlen;
+	atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
+}
+
+/*
+ * Query from the BPF framework regarding whether the buffer currently in the
+ * held position can be moved to the free position, which can be indicated by
+ * the user process making their generation number equal to the kernel
+ * generation number.
+ */
+int
+bpf_zerocopy_canfreebuf(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_canfreebuf: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (zb == NULL)
+		return (0);
+	if (zb->zb_header->bzh_kernel_gen ==
+	    atomic_load_acq_int(&zb->zb_header->bzh_user_gen))
+		return (1);
+	return (0);
+}
+
+/*
+ * Free zero-copy buffers at request of descriptor.
+ */
+void
+bpf_zerocopy_free(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_free: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_sbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+	zb = (struct zbuf *)d->bd_fbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+}
+
+/*
+ * Ioctl to return the maximum buffer size.
+ */
+int
+bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));
+
+	*i = BPF_MAX_PAGES * PAGE_SIZE;
+	return (0);
+}
+
+/*
+ * Ioctl to force rotation of the two buffers, if there's any data available.
+ * This can be used by user space to implement timeouts when waiting for a
+ * buffer to fill.
+ */
+int
+bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *bzh;
+
+	bzero(bz, sizeof(*bz));
+	BPFD_LOCK(d);
+	if (d->bd_hbuf == NULL && d->bd_slen != 0) {
+		ROTATE_BUFFERS(d);
+		bzh = (struct zbuf *)d->bd_hbuf;
+		bz->bz_bufa = (void *)bzh->zb_uaddr;
+		bz->bz_buflen = d->bd_hlen;
+	}
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Ioctl to configure zero-copy buffers -- may be done only once.
+ */
+int
+bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zba, *zbb;
+	int error;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));
+
+	/*
+	 * Must set both buffers.  Cannot clear them.
+	 */
+	if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
+		return (EINVAL);
+
+	/*
+	 * Buffers must have a size greater than 0.  Alignment and other size
+	 * validity checking is done in zbuf_setup().
+	 */
+	if (bz->bz_buflen == 0)
+		return (EINVAL);
+
+	/*
+	 * Allocate new buffers.
+	 */
+	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
+	    &zba);
+	if (error)
+		return (error);
+	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
+	    &zbb);
+	if (error) {
+		zbuf_free(zba);
+		return (error);
+	}
+
+	/*
+	 * We only allow buffers to be installed once, so atomically check
+	 * that no buffers are currently installed and install new buffers.
+	 */
+	BPFD_LOCK(d);
+	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
+	    d->bd_bif != NULL) {
+		BPFD_UNLOCK(d);
+		zbuf_free(zba);
+		zbuf_free(zbb);
+		return (EINVAL);
+	}
+	d->bd_fbuf = (caddr_t)zbb;
+	d->bd_sbuf = (caddr_t)zba;
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+
+	/*
+	 * We expose only the space left in the buffer after the size of the
+	 * shared management region.
+	 */
+	d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
+	BPFD_UNLOCK(d);
+	return (0);
+}
diff --git a/sys/net/bpf_zerocopy.h b/sys/net/bpf_zerocopy.h
new file mode 100644
index 000000000000..33d1f25041d8
--- /dev/null
+++ b/sys/net/bpf_zerocopy.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_BPF_ZEROCOPY_H_ +#define _NET_BPF_ZEROCOPY_H_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +void bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len); +void bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len); +void bpf_zerocopy_bufheld(struct bpf_d *); +int bpf_zerocopy_canfreebuf(struct bpf_d *); +void bpf_zerocopy_free(struct bpf_d *d); +int bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, + size_t *i); +int bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, + struct bpf_zbuf *bz); +int bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d, + struct bpf_zbuf *bz); + +#endif /* !_NET_BPF_ZEROCOPY_H_ */ diff --git a/sys/net/bpfdesc.h b/sys/net/bpfdesc.h index a46013edca43..ad9ab207dbf3 100644 --- a/sys/net/bpfdesc.h +++ b/sys/net/bpfdesc.h @@ -48,10 +48,11 @@ /* * Descriptor associated with each open bpf file. */ +struct zbuf; struct bpf_d { LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ /* - * Buffer slots: two malloc buffers store the incoming packets. + * Buffer slots: two memory buffers store the incoming packets. * The model has three slots. Sbuf is always occupied. * sbuf (store) - Receive interrupt puts packets here. * hbuf (hold) - When sbuf is full, put buffer here and @@ -74,8 +75,8 @@ struct bpf_d { #ifdef BPF_JITTER bpf_jit_filter *bd_bfilter; /* binary filter code */ #endif - u_long bd_rcount; /* number of packets received */ - u_long bd_dcount; /* number of packets dropped */ + u_int64_t bd_rcount; /* number of packets received */ + u_int64_t bd_dcount; /* number of packets dropped */ u_char bd_promisc; /* true if listening promiscuously */ u_char bd_state; /* idle, waiting, or timed out */ @@ -90,9 +91,14 @@ struct bpf_d { struct mtx bd_mtx; /* mutex for this descriptor */ struct callout bd_callout; /* for BPF timeouts with select */ struct label *bd_label; /* MAC label for descriptor */ - u_long bd_fcount; /* number of packets which matched filter */ + u_int64_t bd_fcount; /* number of packets which matched filter */ pid_t bd_pid; /* PID which created descriptor */ int bd_locked; /* true if descriptor is locked */ + u_int bd_bufmode; /* Current buffer mode. */ + u_int64_t bd_wcount; /* number of packets written */ + u_int64_t bd_wfcount; /* number of packets that matched write filter */ + u_int64_t bd_wdcount; /* number of packets dropped during a write */ + u_int64_t bd_zcopy; /* number of zero copy operations */ }; /* Values for bd_state */ @@ -104,25 +110,21 @@ struct bpf_d { #define BPFD_UNLOCK(bd) mtx_unlock(&(bd)->bd_mtx) #define BPFD_LOCK_ASSERT(bd) mtx_assert(&(bd)->bd_mtx, MA_OWNED); -/* Test whether a BPF is ready for read(). 
 */
-#define	bpf_ready(bd)						\
-	((bd)->bd_hlen != 0 ||					\
-	 (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \
-	  (bd)->bd_slen != 0))
-
 /*
  * External representation of the bpf descriptor
  */
 struct xbpf_d {
+	u_int		bd_structsize;	/* Size of this structure. */
 	u_char		bd_promisc;
 	u_char		bd_immediate;
+	u_char		__bd_pad[6];
 	int		bd_hdrcmplt;
 	int		bd_direction;
 	int		bd_feedback;
 	int		bd_async;
-	u_long		bd_rcount;
-	u_long		bd_dcount;
-	u_long		bd_fcount;
+	u_int64_t	bd_rcount;
+	u_int64_t	bd_dcount;
+	u_int64_t	bd_fcount;
 	int		bd_sig;
 	int		bd_slen;
 	int		bd_hlen;
@@ -130,6 +132,16 @@ struct xbpf_d {
 	pid_t		bd_pid;
 	char		bd_ifname[IFNAMSIZ];
 	int		bd_locked;
+	u_int64_t	bd_wcount;
+	u_int64_t	bd_wfcount;
+	u_int64_t	bd_wdcount;
+	u_int64_t	bd_zcopy;
+	int		bd_bufmode;
+	/*
+	 * Allocate four 64-bit unsigned integers for future expansion so we
+	 * do not have to worry about breaking the ABI.
+	 */
+	u_int64_t	bd_spare[4];
 };
 
 #define	BPFIF_LOCK(bif)		mtx_lock(&(bif)->bif_mtx)
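The bd_structsize field added above lets userspace walk the records returned by the net.bpf.stats sysctl without depending on the kernel's exact struct xbpf_d size. A hedged sketch of such a consumer follows (error handling abbreviated, and assuming net.bpf.stats returns a packed array of xbpf_d records, as the bpfstats_fill_xbpf() routine above suggests):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/bpfdesc.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct xbpf_d *xbd;
	size_t len, off;
	char *buf;

	if (sysctlbyname("net.bpf.stats", NULL, &len, NULL, 0) < 0)
		return (1);
	if ((buf = malloc(len)) == NULL)
		return (1);
	if (sysctlbyname("net.bpf.stats", buf, &len, NULL, 0) < 0)
		return (1);
	/* Step by bd_structsize so future growth of xbpf_d is tolerated. */
	for (off = 0; off + sizeof(*xbd) <= len; off += xbd->bd_structsize) {
		xbd = (struct xbpf_d *)(buf + off);
		printf("pid %d if %s recv %ju drop %ju zcopy %ju\n",
		    (int)xbd->bd_pid, xbd->bd_ifname,
		    (uintmax_t)xbd->bd_rcount, (uintmax_t)xbd->bd_dcount,
		    (uintmax_t)xbd->bd_zcopy);
	}
	free(buf);
	return (0);
}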