-rw-r--r--  share/man/man4/bpf.4    | 255
-rw-r--r--  sys/conf/files          |   2
-rw-r--r--  sys/net/bpf.c           | 422
-rw-r--r--  sys/net/bpf.h           |  66
-rw-r--r--  sys/net/bpf_buffer.c    | 210
-rw-r--r--  sys/net/bpf_buffer.h    |  50
-rw-r--r--  sys/net/bpf_zerocopy.c  | 510
-rw-r--r--  sys/net/bpf_zerocopy.h  |  53
-rw-r--r--  sys/net/bpfdesc.h       |  38
9 files changed, 1469 insertions, 137 deletions
diff --git a/share/man/man4/bpf.4 b/share/man/man4/bpf.4
index bb278586fbb1..9116b2dfa7b8 100644
--- a/share/man/man4/bpf.4
+++ b/share/man/man4/bpf.4
@@ -1,3 +1,30 @@
+.\" Copyright (c) 2007 Seccuris Inc.
+.\" All rights reserved.
+.\"
+.\" This sofware was developed by Robert N. M. Watson under contract to
+.\" Seccuris Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
.\" Copyright (c) 1990 The Regents of the University of California.
.\" All rights reserved.
.\"
@@ -61,19 +88,6 @@ Whenever a packet is received by an interface,
all file descriptors listening on that interface apply their filter.
Each descriptor that accepts the packet receives its own copy.
.Pp
-Reads from these files return the next group of packets
-that have matched the filter.
-To improve performance, the buffer passed to read must be
-the same size as the buffers used internally by
-.Nm .
-This size is returned by the
-.Dv BIOCGBLEN
-ioctl (see below), and
-can be set with
-.Dv BIOCSBLEN .
-Note that an individual packet larger than this size is necessarily
-truncated.
-.Pp
The packet filter will support any link level protocol that has fixed length
headers.
Currently, only Ethernet,
@@ -94,6 +108,165 @@ The writes are unbuffered, meaning only one packet can be processed per write.
Currently, only writes to Ethernets and
.Tn SLIP
links are supported.
+.Sh BUFFER MODES
+.Nm
+devices deliver packet data to the application via memory buffers provided by
+the application.
+The buffer mode is set using the
+.Dv BIOCSETBUFMODE
+ioctl, and read using the
+.Dv BIOCGETBUFMODE
+ioctl.
+.Ss Buffered read mode
+By default,
+.Nm
+devices operate in the
+.Dv BPF_BUFMODE_BUFFER
+mode, in which packet data is copied explicitly from kernel to user memory
+using the
+.Xr read 2
+system call.
+The user process will declare a fixed buffer size that will be used both for
+sizing internal buffers and for all
+.Xr read 2
+operations on the file.
+This size is queried using the
+.Dv BIOCGBLEN
+ioctl, and is set using the
+.Dv BIOCSBLEN
+ioctl.
+Note that an individual packet larger than the buffer size is necessarily
+truncated.
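+.Pp
+The following sketch (illustrative only) sizes a buffer using
+.Dv BIOCGBLEN
+and walks the packets returned by a single
+.Xr read 2
+using
+.Dv BPF_WORDALIGN ;
+.Va fd
+and
+.Fn process_packet
+are placeholders:
+.Bd -literal
+u_int blen;
+char *buf, *p;
+ssize_t n;
+struct bpf_hdr *bh;
+
+if (ioctl(fd, BIOCGBLEN, &blen) < 0)
+	err(1, "BIOCGBLEN");
+if ((buf = malloc(blen)) == NULL)
+	err(1, "malloc");
+if ((n = read(fd, buf, blen)) < 0)
+	err(1, "read");
+for (p = buf; p < buf + n; ) {
+	bh = (struct bpf_hdr *)p;
+	/* Packet data begins bh_hdrlen bytes into the record. */
+	process_packet((u_char *)p + bh->bh_hdrlen, bh->bh_caplen);
+	p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
+}
+.Ed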
+.Ss Zero-copy buffer mode
+.Nm
+devices may also operate in the
+.Dv BPF_BUFMODE_ZBUF
+mode, in which packet data is written directly into two user memory buffers
+by the kernel, avoiding both system call and copying overhead.
+Buffers are of fixed (and equal) size, page-aligned, and an integer
+multiple of the page size.
+The maximum zero-copy buffer size is returned by the
+.Dv BIOCGETZMAX
+ioctl.
+Note that an individual packet larger than the buffer size is necessarily
+truncated.
+.Pp
+The user process registers two memory buffers using the
+.Dv BIOCSETZBUF
+ioctl, which accepts a
+.Vt struct bpf_zbuf
+pointer as an argument:
+.Bd -literal
+struct bpf_zbuf {
+ void *bz_bufa;
+ void *bz_bufb;
+ size_t bz_buflen;
+};
+.Ed
+.Pp
+.Vt bz_bufa
+is a pointer to the userspace address of the first buffer that will be
+filled, and
+.Vt bz_bufb
+is a pointer to the second buffer.
+.Nm
+will then cycle between the two buffers as they fill and are acknowledged.
+.Pp
+Each buffer begins with a fixed-length header to hold synchronization and
+data length information for the buffer:
+.Bd -literal
+struct bpf_zbuf_header {
+ volatile u_int bzh_kernel_gen; /* Kernel generation number. */
+ volatile u_int bzh_kernel_len; /* Length of data in the buffer. */
+ volatile u_int bzh_user_gen; /* User generation number. */
+ /* ...padding for future use... */
+};
+.Ed
+.Pp
+The header structure of each buffer, including all padding, should be zeroed
+before it is configured using
+.Dv BIOCSETZBUF .
+Remaining space in the buffer will be used by the kernel to store packet
+data, laid out in the same format as with buffered read mode.
+.Pp
+The kernel and the user process follow a simple acknowledgement protocol via
+the buffer header to synchronize access to the buffer: when the header
+generation numbers,
+.Vt bzh_kernel_gen
+and
+.Vt bzh_user_gen ,
+hold the same value, the kernel owns the buffer, and when they differ,
+userspace owns the buffer.
+.Pp
+While the kernel owns the buffer, the contents are unstable and may change
+asynchronously; while the user process owns the buffer, its contents are
+stable and will not be changed until the buffer has been acknowledged.
+.Pp
+Initializing the buffer headers to all 0's before registering the buffer has
+the effect of assigning initial ownership of both buffers to the kernel.
+The kernel signals that a buffer has been assigned to userspace by modifying
+.Vt bzh_kernel_gen ,
+and userspace acknowledges the buffer and returns it to the kernel by setting
+the value of
+.Vt bzh_user_gen
+to the value of
+.Vt bzh_kernel_gen .
+.Pp
+In order to avoid caching and memory re-ordering effects, the user process
+must use atomic operations and memory barriers when checking for and
+acknowledging buffers:
+.Bd -literal
+#include <machine/atomic.h>
+
+/*
+ * Return ownership of a buffer to the kernel for reuse.
+ */
+static void
+buffer_acknowledge(struct bpf_zbuf_header *bzh)
+{
+
+ atomic_store_rel_int(&bzh->bzh_user_gen, bzh->bzh_kernel_gen);
+}
+
+/*
+ * Check whether a buffer has been assigned to userspace by the kernel.
+ * Return true if userspace owns the buffer, and false otherwise.
+ */
+static int
+buffer_check(struct bpf_zbuf_header *bzh)
+{
+
+ return (bzh->bzh_user_gen !=
+ atomic_load_acq_int(&bzh->bzh_kernel_gen));
+}
+.Ed
+.Pp
+If any data is pending, the user process may force assignment of the next
+buffer to userspace using the
+.Dv BIOCROTZBUF
+ioctl.
+This allows the user process to retrieve data in a partially filled buffer
+before the buffer is full, such as following a timeout; the process must
+recheck for buffer ownership using the header generation numbers, as the
+buffer will not be assigned to userspace if no data was present.
+.Pp
+As in the buffered read mode,
+.Xr kqueue 2 ,
+.Xr poll 2 ,
+and
+.Xr select 2
+may be used to sleep awaiting the availability of a completed buffer.
+They will return a readable file descriptor when ownership of the next buffer
+is assigned to user space.
+.Pp
+In the current implementation, the kernel will assign ownership of at most
+one buffer at a time to the user process.
+The user process must acknowledge the current buffer in order to be
+notified that the next buffer is ready for processing.
+Programs should not rely on this as an invariant, as it may change in future
+versions; in particular, a program must maintain its own notion of which
+buffer is "next" so that, if both buffers are owned by userspace, it can
+process them in the correct order.
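+.Pp
+The following sketch ties these pieces together into a minimal consumer
+loop;
+.Va fd ,
+.Va bufa_hdr ,
+.Va bufb_hdr ,
+and
+.Fn process_packets
+are placeholders, and the timeout-driven
+.Dv BIOCROTZBUF
+call is illustrative only:
+.Bd -literal
+/* Assume the first registered buffer fills first. */
+struct bpf_zbuf_header *next = bufa_hdr;
+struct bpf_zbuf bz;
+struct timeval tv;
+fd_set rset;
+
+for (;;) {
+	FD_ZERO(&rset);
+	FD_SET(fd, &rset);
+	tv.tv_sec = 1;
+	tv.tv_usec = 0;
+	if (select(fd + 1, &rset, NULL, NULL, &tv) < 0)
+		err(1, "select");
+	if (!buffer_check(next))
+		/* Timed out: force out a partially filled buffer. */
+		(void)ioctl(fd, BIOCROTZBUF, &bz);
+	while (buffer_check(next)) {
+		/* Packet records follow the fixed-length header. */
+		process_packets((u_char *)(next + 1),
+		    next->bzh_kernel_len);
+		buffer_acknowledge(next);
+		next = (next == bufa_hdr) ? bufb_hdr : bufa_hdr;
+	}
+}
+.Ed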
.Sh IOCTLS
The
.Xr ioctl 2
@@ -127,7 +300,7 @@ file.
The (third) argument to
.Xr ioctl 2
should be a pointer to the type indicated.
-.Bl -tag -width BIOCGRTIMEOUT
+.Bl -tag -width BIOCGETBUFMODE
.It Dv BIOCGBLEN
.Pq Li u_int
Returns the required buffer length for reads on
@@ -349,10 +522,55 @@ descriptor.
This prevents the execution of
ioctl commands which could change the underlying operating parameters of
the device.
+.It Dv BIOCGETBUFMODE
+.It Dv BIOCSETBUFMODE
+.Pq Li u_int
+Get or set the current
+.Nm
+buffering mode; possible values are
+.Dv BPF_BUFMODE_BUFFER ,
+buffered read mode, and
+.Dv BPF_BUFMODE_ZBUF ,
+zero-copy buffer mode.
+.It Dv BIOCSETZBUF
+.Pq Li struct bpf_zbuf
+Set the current zero-copy buffer locations; buffer locations may be
+set only once zero-copy buffer mode has been selected, and prior to attaching
+to an interface.
+Buffers must be of identical size, page-aligned, and an integer multiple of
+pages in size.
+The three fields
+.Vt bz_bufa ,
+.Vt bz_bufb ,
+and
+.Vt bz_buflen
+must be filled out.
+If buffers have already been set for this device, the ioctl will fail.
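+A configuration example appears below this list.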
+.It Dv BIOCGETZMAX
+.Pq Li size_t
+Get the largest individual zero-copy buffer size allowed.
+As two buffers are used in zero-copy buffer mode, the limit (in practice) is
+twice the returned size.
+As zero-copy buffers consume kernel address space, conservative selection of
+buffer size is suggested, especially when there are multiple
+.Nm
+descriptors in use on 32-bit systems.
+.It Dv BIOCROTZBUF
+Force ownership of the next buffer to be assigned to userspace, if any
+data is present in the buffer.
+If no data is present, the buffer will remain owned by the kernel.
+This allows consumers of zero-copy buffering to implement timeouts and
+retrieve partially filled buffers.
+In order to handle the case where no data is present in the buffer and
+therefore ownership is not assigned, the user process must check
+.Vt bzh_kernel_gen
+against
+.Vt bzh_user_gen .
.El
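+.Pp
+The following sketch (illustrative only; the chosen buffer size is
+arbitrary) shows one plausible zero-copy configuration sequence on an
+open descriptor
+.Va fd :
+.Bd -literal
+struct bpf_zbuf bz;
+u_int mode;
+size_t zmax;
+
+mode = BPF_BUFMODE_ZBUF;
+if (ioctl(fd, BIOCSETBUFMODE, &mode) < 0)
+	err(1, "BIOCSETBUFMODE");
+if (ioctl(fd, BIOCGETZMAX, &zmax) < 0)
+	err(1, "BIOCGETZMAX");
+bz.bz_buflen = MIN(zmax, 4 * (size_t)getpagesize());
+/* mmap(2) memory is page-aligned and zero-filled. */
+bz.bz_bufa = mmap(NULL, bz.bz_buflen, PROT_READ | PROT_WRITE,
+    MAP_ANON | MAP_PRIVATE, -1, 0);
+bz.bz_bufb = mmap(NULL, bz.bz_buflen, PROT_READ | PROT_WRITE,
+    MAP_ANON | MAP_PRIVATE, -1, 0);
+if (bz.bz_bufa == MAP_FAILED || bz.bz_bufb == MAP_FAILED)
+	err(1, "mmap");
+if (ioctl(fd, BIOCSETZBUF, &bz) < 0)
+	err(1, "BIOCSETZBUF");
+.Ed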
.Sh BPF HEADER
The following structure is prepended to each packet returned by
-.Xr read 2 :
+.Xr read 2
+or via a zero-copy buffer:
.Bd -literal
struct bpf_hdr {
struct timeval bh_tstamp; /* time stamp */
@@ -718,6 +936,9 @@ struct bpf_insn insns[] = {
.Sh SEE ALSO
.Xr tcpdump 1 ,
.Xr ioctl 2 ,
+.Xr kqueue 2 ,
+.Xr poll 2 ,
+.Xr select 2 ,
.Xr byteorder 3 ,
.Xr ng_bpf 4 ,
.Xr bpf 9
@@ -750,6 +971,10 @@ of Lawrence Berkeley Laboratory, implemented BPF in
Summer 1990.
Much of the design is due to
.An Van Jacobson .
+.Pp
+Support for zero-copy buffers was added by
+.An Robert N. M. Watson
+under contract to Seccuris Inc.
.Sh BUGS
The read buffer must be of a fixed size (returned by the
.Dv BIOCGBLEN
diff --git a/sys/conf/files b/sys/conf/files
index 2bd8a2368b2c..eac57fa58ee3 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1633,8 +1633,10 @@ libkern/strtoul.c standard
libkern/strtouq.c standard
libkern/strvalid.c standard
net/bpf.c standard
+net/bpf_buffer.c optional bpf
net/bpf_jitter.c optional bpf_jitter
net/bpf_filter.c optional bpf | netgraph_bpf
+net/bpf_zerocopy.c optional bpf
net/bridgestp.c optional bridge | if_bridge
net/bsd_comp.c optional ppp_bsdcomp
net/ieee8023ad_lacp.c optional lagg
diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index 49961754d690..433cc7aa4649 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -66,9 +66,11 @@ __FBSDID("$FreeBSD$");
#include <net/if.h>
#include <net/bpf.h>
+#include <net/bpf_buffer.h>
#ifdef BPF_JITTER
#include <net/bpf_jitter.h>
#endif
+#include <net/bpf_zerocopy.h>
#include <net/bpfdesc.h>
#include <netinet/in.h>
@@ -80,7 +82,7 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-static MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
+MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
#if defined(DEV_BPF) || defined(NETGRAPH_BPF)
@@ -98,19 +100,17 @@ static LIST_HEAD(, bpf_if) bpf_iflist;
static struct mtx bpf_mtx; /* bpf global lock */
static int bpf_bpfd_cnt;
-static void bpf_allocbufs(struct bpf_d *);
static void bpf_attachd(struct bpf_d *, struct bpf_if *);
static void bpf_detachd(struct bpf_d *);
static void bpf_freed(struct bpf_d *);
-static void bpf_mcopy(const void *, void *, size_t);
static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
struct sockaddr *, int *, struct bpf_insn *);
static int bpf_setif(struct bpf_d *, struct ifreq *);
static void bpf_timed_out(void *);
static __inline void
bpf_wakeup(struct bpf_d *);
-static void catchpacket(struct bpf_d *, u_char *, u_int,
- u_int, void (*)(const void *, void *, size_t),
+static void catchpacket(struct bpf_d *, u_char *, u_int, u_int,
+ void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
struct timeval *);
static void reset_d(struct bpf_d *);
static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
@@ -123,15 +123,12 @@ static void bpf_clone(void *, struct ucred *, char *, int, struct cdev **);
static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
-static int bpf_bufsize = 4096;
-SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
- &bpf_bufsize, 0, "Default bpf buffer size");
-static int bpf_maxbufsize = BPF_MAXBUFSIZE;
-SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
- &bpf_maxbufsize, 0, "Maximum bpf buffer size");
static int bpf_maxinsns = BPF_MAXINSNS;
SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
&bpf_maxinsns, 0, "Maximum bpf program instructions");
+static int bpf_zerocopy_enable = 0;
+SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
+ &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW,
bpf_stats_sysctl, "bpf statistics portal");
@@ -158,6 +155,146 @@ static struct cdevsw bpf_cdevsw = {
static struct filterops bpfread_filtops =
{ 1, NULL, filt_bpfdetach, filt_bpfread };
+/*
+ * Wrapper functions for various buffering methods. If the set of buffer
+ * modes expands, we will probably want to introduce a switch data structure
+ * similar to protosw, etc.
+ */
+static void
+bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+ u_int len)
+{
+
+ BPFD_LOCK_ASSERT(d);
+
+ switch (d->bd_bufmode) {
+ case BPF_BUFMODE_BUFFER:
+ return (bpf_buffer_append_bytes(d, buf, offset, src, len));
+
+ case BPF_BUFMODE_ZBUF:
+ d->bd_zcopy++;
+ return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
+
+ default:
+ panic("bpf_buf_append_bytes");
+ }
+}
+
+static void
+bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+ u_int len)
+{
+
+ BPFD_LOCK_ASSERT(d);
+
+ switch (d->bd_bufmode) {
+ case BPF_BUFMODE_BUFFER:
+ return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
+
+ case BPF_BUFMODE_ZBUF:
+ d->bd_zcopy++;
+ return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
+
+ default:
+ panic("bpf_buf_append_mbuf");
+ }
+}
+
+/*
+ * If the buffer mechanism has a way to decide that a held buffer can be made
+ * free, then it is exposed via the bpf_canfreebuf() interface. (1) is
+ * returned if the buffer can be discarded, (0) is returned if it cannot.
+ */
+static int
+bpf_canfreebuf(struct bpf_d *d)
+{
+
+ BPFD_LOCK_ASSERT(d);
+
+ switch (d->bd_bufmode) {
+ case BPF_BUFMODE_ZBUF:
+ return (bpf_zerocopy_canfreebuf(d));
+ }
+ return (0);
+}
+
+void
+bpf_bufheld(struct bpf_d *d)
+{
+
+ BPFD_LOCK_ASSERT(d);
+
+ switch (d->bd_bufmode) {
+ case BPF_BUFMODE_ZBUF:
+ bpf_zerocopy_bufheld(d);
+ break;
+ }
+}
+
+static void
+bpf_free(struct bpf_d *d)
+{
+
+ switch (d->bd_bufmode) {
+ case BPF_BUFMODE_BUFFER:
+ return (bpf_buffer_free(d));
+
+ case BPF_BUFMODE_ZBUF:
+ return (bpf_zerocopy_free(d));
+
+ default:
+ panic("bpf_buf_free");
+ }
+}
+
+static int
+bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+ if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
+ return (EOPNOTSUPP);
+ return (bpf_buffer_uiomove(d, buf, len, uio));
+}
+
+static int
+bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
+{
+
+ if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
+ return (EOPNOTSUPP);
+ return (bpf_buffer_ioctl_sblen(d, i));
+}
+
+static int
+bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+ if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+ return (EOPNOTSUPP);
+ return (bpf_zerocopy_ioctl_getzmax(td, d, i));
+}
+
+static int
+bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+ if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+ return (EOPNOTSUPP);
+ return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
+}
+
+static int
+bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+ if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+ return (EOPNOTSUPP);
+ return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
+}
+
+/*
+ * General BPF functions.
+ */
static int
bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
@@ -412,7 +549,14 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
"bpf%d", dev2unit(dev));
MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
dev->si_drv1 = d;
- d->bd_bufsize = bpf_bufsize;
+
+ /*
+ * For historical reasons, perform a one-time initialization call to
+ * the buffer routines, even though we're not yet committed to a
+ * particular buffer method.
+ */
+ bpf_buffer_init(d);
+ d->bd_bufmode = BPF_BUFMODE_BUFFER;
d->bd_sig = SIGIO;
d->bd_direction = BPF_D_INOUT;
d->bd_pid = td->td_proc->p_pid;
@@ -459,18 +603,6 @@ bpfclose(struct cdev *dev, int flags, int fmt, struct thread *td)
return (0);
}
-
-/*
- * Rotate the packet buffers in descriptor d. Move the store buffer
- * into the hold slot, and the free buffer into the store slot.
- * Zero the length of the new store buffer.
- */
-#define ROTATE_BUFFERS(d) \
- (d)->bd_hbuf = (d)->bd_sbuf; \
- (d)->bd_hlen = (d)->bd_slen; \
- (d)->bd_sbuf = (d)->bd_fbuf; \
- (d)->bd_slen = 0; \
- (d)->bd_fbuf = NULL;
/*
* bpfread - read next chunk of packets from buffers
*/
@@ -490,6 +622,10 @@ bpfread(struct cdev *dev, struct uio *uio, int ioflag)
BPFD_LOCK(d);
d->bd_pid = curthread->td_proc->p_pid;
+ if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
+ BPFD_UNLOCK(d);
+ return (EOPNOTSUPP);
+ }
if (d->bd_state == BPF_WAITING)
callout_stop(&d->bd_callout);
timed_out = (d->bd_state == BPF_TIMED_OUT);
@@ -567,7 +703,7 @@ bpfread(struct cdev *dev, struct uio *uio, int ioflag)
* issues a read on the same fd at the same time? Don't want this
* getting invalidated.
*/
- error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
+ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
BPFD_LOCK(d);
d->bd_fbuf = d->bd_hbuf;
@@ -613,6 +749,20 @@ bpf_timed_out(void *arg)
}
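+/*
+ * Test whether a BPF descriptor is ready for read(2): either a completed
+ * hold buffer is available, or the descriptor is in immediate mode or has
+ * timed out with data pending in the store buffer.
+ */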
static int
+bpf_ready(struct bpf_d *d)
+{
+
+ BPFD_LOCK_ASSERT(d);
+
+ if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
+ return (1);
+ if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
+ d->bd_slen != 0)
+ return (1);
+ return (0);
+}
+
+static int
bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
{
struct bpf_d *d = dev->si_drv1;
@@ -622,25 +772,34 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
int error, hlen;
d->bd_pid = curthread->td_proc->p_pid;
- if (d->bd_bif == NULL)
+ d->bd_wcount++;
+ if (d->bd_bif == NULL) {
+ d->bd_wdcount++;
return (ENXIO);
+ }
ifp = d->bd_bif->bif_ifp;
- if ((ifp->if_flags & IFF_UP) == 0)
+ if ((ifp->if_flags & IFF_UP) == 0) {
+ d->bd_wdcount++;
return (ENETDOWN);
+ }
- if (uio->uio_resid == 0)
+ if (uio->uio_resid == 0) {
+ d->bd_wdcount++;
return (0);
+ }
bzero(&dst, sizeof(dst));
m = NULL;
hlen = 0;
error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
&m, &dst, &hlen, d->bd_wfilter);
- if (error)
+ if (error) {
+ d->bd_wdcount++;
return (error);
-
+ }
+ d->bd_wfcount++;
if (d->bd_hdrcmplt)
dst.sa_family = pseudo_AF_HDRCMPLT;
@@ -667,6 +826,8 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
#endif
error = (*ifp->if_output)(ifp, m, &dst, NULL);
+ if (error)
+ d->bd_wdcount++;
if (mc != NULL) {
if (error == 0)
@@ -697,6 +858,10 @@ reset_d(struct bpf_d *d)
d->bd_rcount = 0;
d->bd_dcount = 0;
d->bd_fcount = 0;
+ d->bd_wcount = 0;
+ d->bd_wfcount = 0;
+ d->bd_wdcount = 0;
+ d->bd_zcopy = 0;
}
/*
@@ -721,6 +886,11 @@ reset_d(struct bpf_d *d)
* BIOCSDIRECTION Set packet direction flag
* BIOCLOCK Set "locked" flag
* BIOCFEEDBACK Set packet feedback mode.
+ * BIOCSETZBUF Set current zero-copy buffer locations.
+ * BIOCGETZMAX Get maximum zero-copy buffer size.
+ * BIOCROTZBUF Force rotation of zero-copy buffer
+ * BIOCSETBUFMODE Set buffer mode.
+ * BIOCGETBUFMODE Get current buffer mode.
*/
/* ARGSUSED */
static int
@@ -758,6 +928,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
case BIOCSRTIMEOUT:
case BIOCIMMEDIATE:
case TIOCGPGRP:
+ case BIOCROTZBUF:
break;
default:
return (EPERM);
@@ -810,17 +981,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
* Set buffer length.
*/
case BIOCSBLEN:
- if (d->bd_bif != NULL)
- error = EINVAL;
- else {
- u_int size = *(u_int *)addr;
-
- if (size > bpf_maxbufsize)
- *(u_int *)addr = size = bpf_maxbufsize;
- else if (size < BPF_MINBUFSIZE)
- *(u_int *)addr = size = BPF_MINBUFSIZE;
- d->bd_bufsize = size;
- }
+ error = bpf_ioctl_sblen(d, (u_int *)addr);
break;
/*
@@ -945,6 +1106,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
{
struct bpf_stat *bs = (struct bpf_stat *)addr;
+ /* XXXCSJP overflow */
bs->bs_recv = d->bd_rcount;
bs->bs_drop = d->bd_dcount;
break;
@@ -1055,6 +1217,50 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
case BIOCGRSIG:
*(u_int *)addr = d->bd_sig;
break;
+
+ case BIOCGETBUFMODE:
+ *(u_int *)addr = d->bd_bufmode;
+ break;
+
+ case BIOCSETBUFMODE:
+ /*
+ * Allow the buffering mode to be changed as long as we
+ * haven't yet committed to a particular mode. Our
+ * definition of commitment, for now, is whether or not a
+ * buffer has been allocated or an interface attached, since
+ * that's the point where things get tricky.
+ */
+ switch (*(u_int *)addr) {
+ case BPF_BUFMODE_BUFFER:
+ break;
+
+ case BPF_BUFMODE_ZBUF:
+ if (bpf_zerocopy_enable)
+ break;
+ /* FALLTHROUGH */
+
+ default:
+ return (EINVAL);
+ }
+
+ BPFD_LOCK(d);
+ if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
+ d->bd_fbuf != NULL || d->bd_bif != NULL) {
+ BPFD_UNLOCK(d);
+ return (EBUSY);
+ }
+ d->bd_bufmode = *(u_int *)addr;
+ BPFD_UNLOCK(d);
+ break;
+
+ case BIOCGETZMAX:
+ return (bpf_ioctl_getzmax(td, d, (size_t *)addr));
+
+ case BIOCSETZBUF:
+ return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));
+
+ case BIOCROTZBUF:
+ return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr));
}
return (error);
}
@@ -1155,13 +1361,31 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
return (ENXIO);
bp = theywant->if_bpf;
+
/*
- * Allocate the packet buffers if we need to.
- * If we're already attached to requested interface,
- * just flush the buffer.
+ * Behavior here depends on the buffering model. If we're using
+ * kernel memory buffers, then we can allocate them here. If we're
+ * using zero-copy, then the user process must have registered
+ * buffers by the time we get here. If not, return an error.
+ *
+ * XXXRW: There are locking issues here with multi-threaded use: what
+ * if two threads try to set the interface at once?
*/
- if (d->bd_sbuf == NULL)
- bpf_allocbufs(d);
+ switch (d->bd_bufmode) {
+ case BPF_BUFMODE_BUFFER:
+ if (d->bd_sbuf == NULL)
+ bpf_buffer_alloc(d);
+ KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
+ break;
+
+ case BPF_BUFMODE_ZBUF:
+ if (d->bd_sbuf == NULL)
+ return (EINVAL);
+ break;
+
+ default:
+ panic("bpf_setif: bufmode %d", d->bd_bufmode);
+ }
if (bp != d->bd_bif) {
if (d->bd_bif)
/*
@@ -1305,37 +1529,14 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
#ifdef MAC
if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
- catchpacket(d, pkt, pktlen, slen, bcopy, &tv);
+ catchpacket(d, pkt, pktlen, slen,
+ bpf_append_bytes, &tv);
}
BPFD_UNLOCK(d);
}
BPFIF_UNLOCK(bp);
}
-/*
- * Copy data from an mbuf chain into a buffer. This code is derived
- * from m_copydata in sys/uipc_mbuf.c.
- */
-static void
-bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
-{
- const struct mbuf *m;
- u_int count;
- u_char *dst;
-
- m = src_arg;
- dst = dst_arg;
- while (len > 0) {
- if (m == NULL)
- panic("bpf_mcopy");
- count = min(m->m_len, len);
- bcopy(mtod(m, void *), dst, count);
- m = m->m_next;
- dst += count;
- len -= count;
- }
-}
-
#define BPF_CHECK_DIRECTION(d, m) \
if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \
((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL))
@@ -1385,7 +1586,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
catchpacket(d, (u_char *)m, pktlen, slen,
- bpf_mcopy, &tv);
+ bpf_append_mbuf, &tv);
}
BPFD_UNLOCK(d);
}
@@ -1440,7 +1641,7 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
catchpacket(d, (u_char *)&mb, pktlen, slen,
- bpf_mcopy, &tv);
+ bpf_append_mbuf, &tv);
}
BPFD_UNLOCK(d);
}
@@ -1453,19 +1654,34 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
* Move the packet data from interface memory (pkt) into the
* store buffer. "cpfn" is the routine called to do the actual data
* transfer. bcopy is passed in to copy contiguous chunks, while
- * bpf_mcopy is passed in to copy mbuf chains. In the latter case,
+ * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case,
* pkt is really an mbuf.
*/
static void
catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
- void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
+ void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
+ struct timeval *tv)
{
- struct bpf_hdr *hp;
+ struct bpf_hdr hdr;
int totlen, curlen;
int hdrlen = d->bd_bif->bif_hdrlen;
int do_wakeup = 0;
BPFD_LOCK_ASSERT(d);
+
+ /*
+ * Detect whether user space has released a buffer back to us, and if
+ * so, move it from being a hold buffer to a free buffer. This may
+ * not be the best place to do it (for example, we might only want to
+ * run this check if we need the space), but for now it's a reliable
+ * spot to do it.
+ */
+ if (bpf_canfreebuf(d)) {
+ d->bd_fbuf = d->bd_hbuf;
+ d->bd_hbuf = NULL;
+ d->bd_hlen = 0;
+ }
+
/*
* Figure out how many bytes to move. If the packet is
* greater or equal to the snapshot length, transfer that
@@ -1500,23 +1716,27 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
}
else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
/*
- * Immediate mode is set, or the read timeout has
- * already expired during a select call. A packet
- * arrived, so the reader should be woken up.
+ * Immediate mode is set, or the read timeout has already
+ * expired during a select call. A packet arrived, so the
+ * reader should be woken up.
*/
do_wakeup = 1;
/*
- * Append the bpf header.
+ * Append the bpf header. Note we append the actual header size, but
+ * move forward the length of the header plus padding.
*/
- hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
- hp->bh_tstamp = *tv;
- hp->bh_datalen = pktlen;
- hp->bh_hdrlen = hdrlen;
+ bzero(&hdr, sizeof(hdr));
+ hdr.bh_tstamp = *tv;
+ hdr.bh_datalen = pktlen;
+ hdr.bh_hdrlen = hdrlen;
+ hdr.bh_caplen = totlen - hdrlen;
+ bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
+
/*
* Copy the packet data into the store buffer and update its length.
*/
- (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
+ (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
d->bd_slen = curlen + totlen;
if (do_wakeup)
@@ -1524,41 +1744,19 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
}
/*
- * Initialize all nonzero fields of a descriptor.
- */
-static void
-bpf_allocbufs(struct bpf_d *d)
-{
-
- KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL"));
- KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL"));
- KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL"));
-
- d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
- d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
- d->bd_slen = 0;
- d->bd_hlen = 0;
-}
-
-/*
* Free buffers currently in use by a descriptor.
* Called on close.
*/
static void
bpf_freed(struct bpf_d *d)
{
+
/*
* We don't need to lock out interrupts since this descriptor has
* been detached from its interface and it hasn't yet been marked
* free.
*/
- if (d->bd_sbuf != NULL) {
- free(d->bd_sbuf, M_BPF);
- if (d->bd_hbuf != NULL)
- free(d->bd_hbuf, M_BPF);
- if (d->bd_fbuf != NULL)
- free(d->bd_fbuf, M_BPF);
- }
+ bpf_free(d);
if (d->bd_rfilter) {
free((caddr_t)d->bd_rfilter, M_BPF);
#ifdef BPF_JITTER
@@ -1762,6 +1960,7 @@ bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
bzero(d, sizeof(*d));
BPFD_LOCK_ASSERT(bd);
+ d->bd_structsize = sizeof(*d);
d->bd_immediate = bd->bd_immediate;
d->bd_promisc = bd->bd_promisc;
d->bd_hdrcmplt = bd->bd_hdrcmplt;
@@ -1779,6 +1978,11 @@ bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
strlcpy(d->bd_ifname,
bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
d->bd_locked = bd->bd_locked;
+ d->bd_wcount = bd->bd_wcount;
+ d->bd_wdcount = bd->bd_wdcount;
+ d->bd_wfcount = bd->bd_wfcount;
+ d->bd_zcopy = bd->bd_zcopy;
+ d->bd_bufmode = bd->bd_bufmode;
}
static int
diff --git a/sys/net/bpf.h b/sys/net/bpf.h
index 91ea0f6827d2..1d6f9db7415d 100644
--- a/sys/net/bpf.h
+++ b/sys/net/bpf.h
@@ -92,6 +92,27 @@ struct bpf_version {
#define BPF_MAJOR_VERSION 1
#define BPF_MINOR_VERSION 1
+/*
+ * Historically, BPF has supported a single buffering model, first using mbuf
+ * clusters in kernel, and later using malloc(9) buffers in kernel. We now
+ * support multiple buffering modes, which may be queried and set using
+ * BIOCGETBUFMODE and BIOCSETBUFMODE. So as to avoid handling the complexity
+ * of changing modes while sniffing packets, the mode becomes fixed once an
+ * interface has been attached to the BPF descriptor.
+ */
+#define BPF_BUFMODE_BUFFER 1 /* Kernel buffers with read(). */
+#define BPF_BUFMODE_ZBUF 2 /* Zero-copy buffers. */
+
+/*-
+ * Struct used by BIOCSETZBUF, BIOCROTZBUF: describes up to two zero-copy
+ * buffers as used by BPF.
+ */
+struct bpf_zbuf {
+ void *bz_bufa; /* Location of 'a' zero-copy buffer. */
+ void *bz_bufb; /* Location of 'b' zero-copy buffer. */
+ size_t bz_buflen; /* Size of zero-copy buffers. */
+};
+
#define BIOCGBLEN _IOR('B',102, u_int)
#define BIOCSBLEN _IOWR('B',102, u_int)
#define BIOCSETF _IOW('B',103, struct bpf_program)
@@ -116,6 +137,11 @@ struct bpf_version {
#define BIOCLOCK _IO('B', 122)
#define BIOCSETWF _IOW('B',123, struct bpf_program)
#define BIOCFEEDBACK _IOW('B',124, u_int)
+#define BIOCGETBUFMODE _IOR('B',125, u_int)
+#define BIOCSETBUFMODE _IOW('B',126, u_int)
+#define BIOCGETZMAX _IOR('B',127, size_t)
+#define BIOCROTZBUF _IOR('B',128, struct bpf_zbuf)
+#define BIOCSETZBUF _IOW('B',129, struct bpf_zbuf)
/* Obsolete */
#define BIOCGSEESENT BIOCGDIRECTION
@@ -149,6 +175,24 @@ struct bpf_hdr {
#endif
/*
+ * When using zero-copy BPF buffers, a shared memory header is present
+ * allowing the kernel BPF implementation and user process to synchronize
+ * without using system calls. This structure defines that header. When
+ * accessing these fields, appropriate atomic operation and memory barriers
+ * are required in order not to see stale or out-of-order data; see bpf(4)
+ * for reference code to access these fields from userspace.
+ *
+ * The layout of this structure is critical, and must not be changed; it must
+ * fit in a single page on all architectures.
+ */
+struct bpf_zbuf_header {
+ volatile u_int bzh_kernel_gen; /* Kernel generation number. */
+ volatile u_int bzh_kernel_len; /* Length of data in the buffer. */
+ volatile u_int bzh_user_gen; /* User generation number. */
+ u_int _bzh_pad[5];
+};
+
+/*
* Data-link level type codes.
*/
#define DLT_NULL 0 /* BSD loopback encapsulation */
@@ -761,6 +805,27 @@ struct bpf_dltlist {
};
#ifdef _KERNEL
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_BPF);
+#endif
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_bpf);
+#endif
+
+/*
+ * Rotate the packet buffers in descriptor d. Move the store buffer into the
+ * hold slot, and the free buffer ino the store slot. Zero the length of the
+ * new store buffer. Descriptor lock should be held.
+ */
+#define ROTATE_BUFFERS(d) do { \
+ (d)->bd_hbuf = (d)->bd_sbuf; \
+ (d)->bd_hlen = (d)->bd_slen; \
+ (d)->bd_sbuf = (d)->bd_fbuf; \
+ (d)->bd_slen = 0; \
+ (d)->bd_fbuf = NULL; \
+ bpf_bufheld(d); \
+} while (0)
+
/*
* Descriptor associated with each attached hardware interface.
*/
@@ -773,6 +838,7 @@ struct bpf_if {
struct mtx bif_mtx; /* mutex for interface */
};
+void bpf_bufheld(struct bpf_d *d);
int bpf_validate(const struct bpf_insn *, int);
void bpf_tap(struct bpf_if *, u_char *, u_int);
void bpf_mtap(struct bpf_if *, struct mbuf *);
diff --git a/sys/net/bpf_buffer.c b/sys/net/bpf_buffer.c
new file mode 100644
index 000000000000..f07e9486cbd6
--- /dev/null
+++ b/sys/net/bpf_buffer.c
@@ -0,0 +1,210 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (c) 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)bpf.c 8.4 (Berkeley) 1/9/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpf_buffer.h>
+#include <net/bpfdesc.h>
+
+/*
+ * Implement historical kernel memory buffering model for BPF: two malloc(9)
+ * kernel buffers are hung off of the descriptor. The size is fixed prior to
+ * attaching to an ifnet, and cannot be changed after that. read(2) simply
+ * copies the data to user space using uiomove(9).
+ */
+
+static int bpf_bufsize = 4096;
+SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
+ &bpf_bufsize, 0, "");
+static int bpf_maxbufsize = BPF_MAXBUFSIZE;
+SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
+ &bpf_maxbufsize, 0, "");
+
+void
+bpf_buffer_alloc(struct bpf_d *d)
+{
+
+ KASSERT(d->bd_fbuf == NULL, ("bpf_buffer_alloc: bd_fbuf != NULL"));
+ KASSERT(d->bd_sbuf == NULL, ("bpf_buffer_alloc: bd_sbuf != NULL"));
+ KASSERT(d->bd_hbuf == NULL, ("bpf_buffer_alloc: bd_hbuf != NULL"));
+
+ d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+ d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+ d->bd_hbuf = NULL;
+ d->bd_slen = 0;
+ d->bd_hlen = 0;
+}
+
+/*
+ * Simple data copy to the current kernel buffer.
+ */
+void
+bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len)
+{
+ u_char *src_bytes;
+
+ src_bytes = (u_char *)src;
+ bcopy(src_bytes, buf + offset, len);
+}
+
+/*
+ * Scatter-gather data copy from an mbuf chain to the current kernel buffer.
+ */
+void
+bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+ u_int len)
+{
+ const struct mbuf *m;
+ u_char *dst;
+ u_int count;
+
+ m = (struct mbuf *)src;
+ dst = (u_char *)buf + offset;
+ while (len > 0) {
+ if (m == NULL)
+ panic("bpf_mcopy");
+ count = min(m->m_len, len);
+ bcopy(mtod(m, void *), dst, count);
+ m = m->m_next;
+ dst += count;
+ len -= count;
+ }
+}
+
+/*
+ * Free BPF kernel buffers on device close.
+ */
+void
+bpf_buffer_free(struct bpf_d *d)
+{
+
+ if (d->bd_sbuf != NULL)
+ free(d->bd_sbuf, M_BPF);
+ if (d->bd_hbuf != NULL)
+ free(d->bd_hbuf, M_BPF);
+ if (d->bd_fbuf != NULL)
+ free(d->bd_fbuf, M_BPF);
+
+#ifdef INVARIANTS
+ d->bd_sbuf = d->bd_hbuf = d->bd_fbuf = (caddr_t)~0;
+#endif
+}
+
+/*
+ * This is a historical initialization that occurs when the BPF descriptor is
+ * first opened. It does not imply selection of a buffer mode, so we don't
+ * allocate buffers here.
+ */
+void
+bpf_buffer_init(struct bpf_d *d)
+{
+
+ d->bd_bufsize = bpf_bufsize;
+}
+
+/*
+ * Allocate or resize buffers.
+ */
+int
+bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i)
+{
+ u_int size;
+
+ BPFD_LOCK(d);
+ if (d->bd_bif != NULL) {
+ BPFD_UNLOCK(d);
+ return (EINVAL);
+ }
+ size = *i;
+ if (size > bpf_maxbufsize)
+ *i = size = bpf_maxbufsize;
+ else if (size < BPF_MINBUFSIZE)
+ *i = size = BPF_MINBUFSIZE;
+ d->bd_bufsize = size;
+ BPFD_UNLOCK(d);
+ return (0);
+}
+
+/*
+ * Copy buffer storage to user space in read().
+ */
+int
+bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+ return (uiomove(buf, len, uio));
+}
diff --git a/sys/net/bpf_buffer.h b/sys/net/bpf_buffer.h
new file mode 100644
index 000000000000..82d0310b4d44
--- /dev/null
+++ b/sys/net/bpf_buffer.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_BUFFER_H_
+#define _NET_BPF_BUFFER_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void bpf_buffer_alloc(struct bpf_d *d);
+void bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len);
+void bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len);
+void bpf_buffer_free(struct bpf_d *d);
+void bpf_buffer_init(struct bpf_d *d);
+int bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i);
+int bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+ struct uio *uio);
+
+#endif /* !_NET_BPF_BUFFER_H_ */
diff --git a/sys/net/bpf_zerocopy.c b/sys/net/bpf_zerocopy.c
new file mode 100644
index 000000000000..896ad1da29f4
--- /dev/null
+++ b/sys/net/bpf_zerocopy.c
@@ -0,0 +1,510 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sf_buf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#include <machine/atomic.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpfdesc.h>
+#include <net/bpf_zerocopy.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
+/*
+ * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
+ * are mapped into the kernel address space using sf_bufs and used directly
+ * by BPF. Memory is wired since page faults cannot be tolerated in the
+ * contexts where the buffers are copied to (locks held, interrupt context,
+ * etc). Access to shared memory buffers is synchronized using a header on
+ * each buffer, allowing the number of system calls to go to zero as BPF
+ * reaches saturation (buffers filled as fast as they can be drained by the
+ * user process). Full details of the protocol for communicating between the
+ * user process and BPF may be found in bpf(4).
+ */
+
+/*
+ * Maximum number of pages per buffer. Since all BPF devices use two, the
+ * maximum per device is 2*BPF_MAX_PAGES. Resource limits on the number of
+ * sf_bufs may be an issue, so do not set this too high. On older systems,
+ * kernel address space limits may also be an issue.
+ */
+#define BPF_MAX_PAGES 512
+
+/*
+ * struct zbuf describes a memory buffer loaned by a user process to the
+ * kernel. We represent this as a series of pages managed using an array of
+ * sf_bufs. Even though the memory is contiguous in user space, it may not
+ * be mapped contiguously in the kernel (i.e., a set of physically
+ * non-contiguous pages in the direct map region) so we must implement
+ * scatter-gather copying. One significant mitigating factor is that on
+ * systems with a direct memory map, we can avoid TLB misses.
+ *
+ * At the front of the shared memory region is a bpf_zbuf_header, which
+ * contains shared control data to allow user space and the kernel to
+ * synchronize; this is included in zb_size, but not bd_bufsize, so that BPF
+ * knows that the space is not available.
+ */
+struct zbuf {
+ vm_offset_t zb_uaddr; /* User address, may be stale. */
+ size_t zb_size; /* Size of buffer, incl. header. */
+ u_int zb_numpages; /* Number of pages. */
+ struct sf_buf **zb_pages; /* Pages themselves. */
+ struct bpf_zbuf_header *zb_header; /* Shared header. */
+};
+
+/*
+ * Release a page we've previously wired.
+ */
+static void
+zbuf_page_free(vm_page_t pp)
+{
+
+ vm_page_lock_queues();
+ vm_page_unwire(pp, 0);
+ if (pp->wire_count == 0 && pp->object == NULL)
+ vm_page_free(pp);
+ vm_page_unlock_queues();
+}
+
+/*
+ * Free an sf_buf with attached page.
+ */
+static void
+zbuf_sfbuf_free(struct sf_buf *sf)
+{
+ vm_page_t pp;
+
+ pp = sf_buf_page(sf);
+ sf_buf_free(sf);
+ zbuf_page_free(pp);
+}
+
+/*
+ * Free a zbuf, including its page array, sf_bufs, and pages. Allow
+ * partially allocated zbufs to be freed so that this function may be used
+ * even during zbuf setup.
+ */
+static void
+zbuf_free(struct zbuf *zb)
+{
+ int i;
+
+ for (i = 0; i < zb->zb_numpages; i++) {
+ if (zb->zb_pages[i] != NULL)
+ zbuf_sfbuf_free(zb->zb_pages[i]);
+ }
+ free(zb->zb_pages, M_BPF);
+ free(zb, M_BPF);
+}
+
+/*
+ * Given a user pointer to a page of user memory, return an sf_buf for the
+ * page. Because we may be requesting quite a few sf_bufs, prefer failure to
+ * deadlock and use SFB_NOWAIT.
+ */
+static struct sf_buf *
+zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
+{
+ struct sf_buf *sf;
+ vm_page_t pp;
+
+ if (vm_fault_quick((caddr_t) uaddr, VM_PROT_READ | VM_PROT_WRITE) <
+ 0)
+ return (NULL);
+ pp = pmap_extract_and_hold(map->pmap, uaddr, VM_PROT_READ |
+ VM_PROT_WRITE);
+ if (pp == NULL)
+ return (NULL);
+ vm_page_lock_queues();
+ vm_page_wire(pp);
+ vm_page_unhold(pp);
+ vm_page_unlock_queues();
+ sf = sf_buf_alloc(pp, SFB_NOWAIT);
+ if (sf == NULL) {
+ zbuf_page_free(pp);
+ return (NULL);
+ }
+ return (sf);
+}
+
+/*
+ * Create a zbuf describing a range of user address space memory. Validate
+ * page alignment, size requirements, etc.
+ */
+static int
+zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
+ struct zbuf **zbp)
+{
+ struct zbuf *zb;
+ struct vm_map *map;
+ int error, i;
+
+ *zbp = NULL;
+
+ /*
+ * User address must be page-aligned.
+ */
+ if (uaddr & PAGE_MASK)
+ return (EINVAL);
+
+ /*
+ * Length must be an integer number of full pages.
+ */
+ if (len & PAGE_MASK)
+ return (EINVAL);
+
+ /*
+ * Length must not exceed per-buffer resource limit.
+ */
+ if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
+ return (EINVAL);
+
+ /*
+ * Allocate the buffer and set up each page with its own sf_buf.
+ */
+ error = 0;
+ zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
+ zb->zb_uaddr = uaddr;
+ zb->zb_size = len;
+ zb->zb_numpages = len / PAGE_SIZE;
+ zb->zb_pages = malloc(sizeof(struct sf_buf *) *
+ zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
+ map = &td->td_proc->p_vmspace->vm_map;
+ for (i = 0; i < zb->zb_numpages; i++) {
+ zb->zb_pages[i] = zbuf_sfbuf_get(map,
+ uaddr + (i * PAGE_SIZE));
+ if (zb->zb_pages[i] == NULL) {
+ error = EFAULT;
+ goto error;
+ }
+ }
+ zb->zb_header =
+ (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
+ bzero(zb->zb_header, sizeof(*zb->zb_header));
+ *zbp = zb;
+ return (0);
+
+error:
+ zbuf_free(zb);
+ return (error);
+}
+
+/*
+ * Copy bytes from a source into the specified zbuf. The caller is
+ * responsible for performing bounds checking, etc.
+ */
+void
+bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len)
+{
+ u_int count, page, poffset;
+ u_char *src_bytes;
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_append_bytes: not in zbuf mode"));
+ KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));
+
+ src_bytes = (u_char *)src;
+ zb = (struct zbuf *)buf;
+
+ /*
+ * Scatter-gather copy to user pages mapped into kernel address space
+ * using sf_bufs: copy up to a page at a time.
+ */
+ offset += sizeof(struct bpf_zbuf_header);
+ page = offset / PAGE_SIZE;
+ poffset = offset % PAGE_SIZE;
+ while (len > 0) {
+ KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
+ " page overflow (%d p %d np)\n", page, zb->zb_numpages));
+
+ count = min(len, PAGE_SIZE - poffset);
+ bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
+ poffset, count);
+ poffset += count;
+ if (poffset == PAGE_SIZE) {
+ poffset = 0;
+ page++;
+ }
+ KASSERT(poffset < PAGE_SIZE,
+ ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
+ poffset));
+ len -= count;
+ src_bytes += count;
+ }
+}
+
+/*
+ * Copy bytes from an mbuf chain to the specified zbuf: copying will be
+ * scatter-gather both from mbufs, which may be fragmented over memory, and
+ * to pages, which may not be contiguously mapped in kernel address space.
+ * As with bpf_zerocopy_append_bytes(), the caller is responsible for
+ * checking that this will not exceed the buffer limit.
+ */
+void
+bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len)
+{
+ u_int count, moffset, page, poffset;
+ const struct mbuf *m;
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_append_mbuf not in zbuf mode"));
+ KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));
+
+ m = (struct mbuf *)src;
+ zb = (struct zbuf *)buf;
+
+ /*
+ * Scatter-gather both from an mbuf chain and to a user page set
+ * mapped into kernel address space using sf_bufs. If we're lucky,
+ * each mbuf requires one copy operation, but if page alignment and
+ * mbuf alignment work out less well, we'll be doing two copies per
+ * mbuf.
+ */
+ offset += sizeof(struct bpf_zbuf_header);
+ page = offset / PAGE_SIZE;
+ poffset = offset % PAGE_SIZE;
+ moffset = 0;
+ while (len > 0) {
+ KASSERT(page < zb->zb_numpages,
+ ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
+ "np)\n", page, zb->zb_numpages));
+ KASSERT(m != NULL,
+ ("bpf_zerocopy_append_mbuf: end of mbuf chain"));
+
+ count = min(m->m_len - moffset, len);
+ count = min(count, PAGE_SIZE - poffset);
+ bcopy(mtod(m, u_char *) + moffset,
+ ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
+ count);
+ poffset += count;
+ if (poffset == PAGE_SIZE) {
+ poffset = 0;
+ page++;
+ }
+ KASSERT(poffset < PAGE_SIZE,
+ ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
+ poffset));
+ moffset += count;
+ if (moffset == m->m_len) {
+ m = m->m_next;
+ moffset = 0;
+ }
+ len -= count;
+ }
+}
+
+/*
+ * Notification from the BPF framework that a buffer has moved into the held
+ * slot on a descriptor. Zero-copy BPF will update the shared page to let
+ * the user process know.
+ */
+void
+bpf_zerocopy_bufheld(struct bpf_d *d)
+{
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_bufheld: not in zbuf mode"));
+
+ zb = (struct zbuf *)d->bd_hbuf;
+ KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
+ zb->zb_header->bzh_kernel_len = d->bd_hlen;
+ atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
+}
+
+/*
+ * Query from the BPF framework regarding whether the buffer currently in the
+ * held position can be moved to the free position, which can be indicated by
+ * the user process making their generation number equal to the kernel
+ * generation number.
+ */
+int
+bpf_zerocopy_canfreebuf(struct bpf_d *d)
+{
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_canfreebuf: not in zbuf mode"));
+
+ zb = (struct zbuf *)d->bd_hbuf;
+ if (zb == NULL)
+ return (0);
+ if (zb->zb_header->bzh_kernel_gen ==
+ atomic_load_acq_int(&zb->zb_header->bzh_user_gen))
+ return (1);
+ return (0);
+}
+
+/*
+ * Free zero copy buffers at request of descriptor.
+ */
+void
+bpf_zerocopy_free(struct bpf_d *d)
+{
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_free: not in zbuf mode"));
+
+ zb = (struct zbuf *)d->bd_sbuf;
+ if (zb != NULL)
+ zbuf_free(zb);
+ zb = (struct zbuf *)d->bd_hbuf;
+ if (zb != NULL)
+ zbuf_free(zb);
+ zb = (struct zbuf *)d->bd_fbuf;
+ if (zb != NULL)
+ zbuf_free(zb);
+}
+
+/*
+ * Ioctl to return the maximum buffer size.
+ */
+int
+bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));
+
+ *i = BPF_MAX_PAGES * PAGE_SIZE;
+ return (0);
+}
+
+/*
+ * Ioctl to force rotation of the two buffers, if there's any data available.
+ * This can be used by user space to implement timeouts when waiting for a
+ * buffer to fill.
+ */
+int
+bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+ struct bpf_zbuf *bz)
+{
+ struct zbuf *bzh;
+
+ bzero(bz, sizeof(*bz));
+ BPFD_LOCK(d);
+ if (d->bd_hbuf == NULL && d->bd_slen != 0) {
+ ROTATE_BUFFERS(d);
+ bzh = (struct zbuf *)d->bd_hbuf;
+ bz->bz_bufa = (void *)bzh->zb_uaddr;
+ bz->bz_buflen = d->bd_hlen;
+ }
+ BPFD_UNLOCK(d);
+ return (0);
+}
+
+/*
+ * Ioctl to configure zero-copy buffers -- may be done only once.
+ */
+int
+bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+ struct bpf_zbuf *bz)
+{
+ struct zbuf *zba, *zbb;
+ int error;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));
+
+ /*
+ * Must set both buffers. Cannot clear them.
+ */
+ if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
+ return (EINVAL);
+
+ /*
+ * Buffers must have a size greater than 0. Alignment and other size
+ * validity checking is done in zbuf_setup().
+ */
+ if (bz->bz_buflen == 0)
+ return (EINVAL);
+
+ /*
+ * Allocate new buffers.
+ */
+ error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
+ &zba);
+ if (error)
+ return (error);
+ error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
+ &zbb);
+ if (error) {
+ zbuf_free(zba);
+ return (error);
+ }
+
+ /*
+ * We only allow buffers to be installed once, so atomically check
+ * that no buffers are currently installed and install new buffers.
+ */
+ BPFD_LOCK(d);
+ if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
+ d->bd_bif != NULL) {
+ BPFD_UNLOCK(d);
+ zbuf_free(zba);
+ zbuf_free(zbb);
+ return (EINVAL);
+ }
+ d->bd_fbuf = (caddr_t)zbb;
+ d->bd_sbuf = (caddr_t)zba;
+ d->bd_slen = 0;
+ d->bd_hlen = 0;
+
+ /*
+ * We expose only the space left in the buffer after the size of the
+ * shared management region.
+ */
+ d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
+ BPFD_UNLOCK(d);
+ return (0);
+}
diff --git a/sys/net/bpf_zerocopy.h b/sys/net/bpf_zerocopy.h
new file mode 100644
index 000000000000..33d1f25041d8
--- /dev/null
+++ b/sys/net/bpf_zerocopy.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_ZEROCOPY_H_
+#define _NET_BPF_ZEROCOPY_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len);
+void bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len);
+void bpf_zerocopy_bufheld(struct bpf_d *);
+int bpf_zerocopy_canfreebuf(struct bpf_d *);
+void bpf_zerocopy_free(struct bpf_d *d);
+int bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d,
+ size_t *i);
+int bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+ struct bpf_zbuf *bz);
+int bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+ struct bpf_zbuf *bz);
+
+#endif /* !_NET_BPF_ZEROCOPY_H_ */
diff --git a/sys/net/bpfdesc.h b/sys/net/bpfdesc.h
index a46013edca43..ad9ab207dbf3 100644
--- a/sys/net/bpfdesc.h
+++ b/sys/net/bpfdesc.h
@@ -48,10 +48,11 @@
/*
* Descriptor associated with each open bpf file.
*/
+struct zbuf;
struct bpf_d {
LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */
/*
- * Buffer slots: two malloc buffers store the incoming packets.
+ * Buffer slots: two memory buffers store the incoming packets.
* The model has three slots. Sbuf is always occupied.
* sbuf (store) - Receive interrupt puts packets here.
* hbuf (hold) - When sbuf is full, put buffer here and
@@ -74,8 +75,8 @@ struct bpf_d {
#ifdef BPF_JITTER
bpf_jit_filter *bd_bfilter; /* binary filter code */
#endif
- u_long bd_rcount; /* number of packets received */
- u_long bd_dcount; /* number of packets dropped */
+ u_int64_t bd_rcount; /* number of packets received */
+ u_int64_t bd_dcount; /* number of packets dropped */
u_char bd_promisc; /* true if listening promiscuously */
u_char bd_state; /* idle, waiting, or timed out */
@@ -90,9 +91,14 @@ struct bpf_d {
struct mtx bd_mtx; /* mutex for this descriptor */
struct callout bd_callout; /* for BPF timeouts with select */
struct label *bd_label; /* MAC label for descriptor */
- u_long bd_fcount; /* number of packets which matched filter */
+ u_int64_t bd_fcount; /* number of packets which matched filter */
pid_t bd_pid; /* PID which created descriptor */
int bd_locked; /* true if descriptor is locked */
+ u_int bd_bufmode; /* Current buffer mode. */
+ u_int64_t bd_wcount; /* number of packets written */
+ u_int64_t bd_wfcount; /* number of packets that matched write filter */
+ u_int64_t bd_wdcount; /* number of packets dropped during a write */
+ u_int64_t bd_zcopy; /* number of zero copy operations */
};
/* Values for bd_state */
@@ -104,25 +110,21 @@ struct bpf_d {
#define BPFD_UNLOCK(bd) mtx_unlock(&(bd)->bd_mtx)
#define BPFD_LOCK_ASSERT(bd) mtx_assert(&(bd)->bd_mtx, MA_OWNED);
-/* Test whether a BPF is ready for read(). */
-#define bpf_ready(bd) \
- ((bd)->bd_hlen != 0 || \
- (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \
- (bd)->bd_slen != 0))
-
/*
* External representation of the bpf descriptor
*/
struct xbpf_d {
+ u_int bd_structsize; /* Size of this structure. */
u_char bd_promisc;
u_char bd_immediate;
+ u_char __bd_pad[6];
int bd_hdrcmplt;
int bd_direction;
int bd_feedback;
int bd_async;
- u_long bd_rcount;
- u_long bd_dcount;
- u_long bd_fcount;
+ u_int64_t bd_rcount;
+ u_int64_t bd_dcount;
+ u_int64_t bd_fcount;
int bd_sig;
int bd_slen;
int bd_hlen;
@@ -130,6 +132,16 @@ struct xbpf_d {
pid_t bd_pid;
char bd_ifname[IFNAMSIZ];
int bd_locked;
+ u_int64_t bd_wcount;
+ u_int64_t bd_wfcount;
+ u_int64_t bd_wdcount;
+ u_int64_t bd_zcopy;
+ int bd_bufmode;
+ /*
+ * Reserve four 64-bit unsigned integers for future expansion so we do
+ * not have to worry about breaking the ABI.
+ */
+ u_int64_t bd_spare[4];
};
#define BPFIF_LOCK(bif) mtx_lock(&(bif)->bif_mtx)