-rw-r--r-- | share/man/man4/bpf.4   | 255
-rw-r--r-- | sys/conf/files         |   2
-rw-r--r-- | sys/net/bpf.c          | 422
-rw-r--r-- | sys/net/bpf.h          |  66
-rw-r--r-- | sys/net/bpf_buffer.c   | 210
-rw-r--r-- | sys/net/bpf_buffer.h   |  50
-rw-r--r-- | sys/net/bpf_zerocopy.c | 510
-rw-r--r-- | sys/net/bpf_zerocopy.h |  53
-rw-r--r-- | sys/net/bpfdesc.h      |  38
9 files changed, 1469 insertions, 137 deletions
diff --git a/share/man/man4/bpf.4 b/share/man/man4/bpf.4
index bb278586fbb1..9116b2dfa7b8 100644
--- a/share/man/man4/bpf.4
+++ b/share/man/man4/bpf.4
@@ -1,3 +1,30 @@
+.\" Copyright (c) 2007 Seccuris Inc.
+.\" All rights reserved.
+.\"
+.\" This software was developed by Robert N. M. Watson under contract to
+.\" Seccuris Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
 .\" Copyright (c) 1990 The Regents of the University of California.
 .\" All rights reserved.
 .\"
@@ -61,19 +88,6 @@ Whenever a packet is received by an interface,
 all file descriptors listening on that interface apply their filter.
 Each descriptor that accepts the packet receives its own copy.
 .Pp
-Reads from these files return the next group of packets
-that have matched the filter.
-To improve performance, the buffer passed to read must be
-the same size as the buffers used internally by
-.Nm .
-This size is returned by the
-.Dv BIOCGBLEN
-ioctl (see below), and
-can be set with
-.Dv BIOCSBLEN .
-Note that an individual packet larger than this size is necessarily
-truncated.
-.Pp
 The packet filter will support any link level protocol that
 has fixed length headers.
 Currently, only Ethernet,
@@ -94,6 +108,165 @@ The writes are unbuffered,
 meaning only one packet can be processed per write.
 Currently, only writes to Ethernets and
 .Tn SLIP
 links are supported.
+.Sh BUFFER MODES
+.Nm
+devices deliver packet data to the application via memory buffers provided by
+the application.
+The buffer mode is set using the
+.Dv BIOCSETBUFMODE
+ioctl, and read using the
+.Dv BIOCGETBUFMODE
+ioctl.
+.Ss Buffered read mode
+By default,
+.Nm
+devices operate in the
+.Dv BPF_BUFMODE_BUFFER
+mode, in which packet data is copied explicitly from kernel to user memory
+using the
+.Xr read 2
+system call.
+The user process will declare a fixed buffer size that will be used both for
+sizing internal buffers and for all
+.Xr read 2
+operations on the file.
+This size is queried using the
+.Dv BIOCGBLEN
+ioctl, and is set using the
+.Dv BIOCSBLEN
+ioctl.
+Note that an individual packet larger than the buffer size is necessarily
+truncated.
+.Ss Zero-copy buffer mode
+.Nm
+devices may also operate in the
+.Dv BPF_BUFMODE_ZBUF
+mode, in which packet data is written directly into two user memory buffers
+by the kernel, avoiding both system call and copying overhead.
+Buffers are of fixed (and equal) size, page-aligned, and an even multiple of
+the page size.
+The maximum zero-copy buffer size is returned by the
+.Dv BIOCGETZMAX
+ioctl.
+Note that an individual packet larger than the buffer size is necessarily
+truncated.
+.Pp
+The user process registers two memory buffers using the
+.Dv BIOCSETZBUF
+ioctl, which accepts a
+.Vt struct bpf_zbuf
+pointer as an argument:
+.Bd -literal
+struct bpf_zbuf {
+	void *bz_bufa;
+	void *bz_bufb;
+	size_t bz_buflen;
+};
+.Ed
+.Pp
+.Vt bz_bufa
+is a pointer to the userspace address of the first buffer that will be
+filled, and
+.Vt bz_bufb
+is a pointer to the second buffer.
+.Nm
+will then cycle between the two buffers as they fill and are acknowledged.
+.Pp
+Each buffer begins with a fixed-length header to hold synchronization and
+data length information for the buffer:
+.Bd -literal
+struct bpf_zbuf_header {
+	volatile u_int bzh_kernel_gen;	/* Kernel generation number. */
+	volatile u_int bzh_kernel_len;	/* Length of data in the buffer. */
+	volatile u_int bzh_user_gen;	/* User generation number. */
+	/* ...padding for future use... */
+};
+.Ed
+.Pp
+The header structure of each buffer, including all padding, should be zeroed
+before it is configured using
+.Dv BIOCSETZBUF .
+Remaining space in the buffer will be used by the kernel to store packet
+data, laid out in the same format as with buffered read mode.
+.Pp
+The kernel and the user process follow a simple acknowledgement protocol via
+the buffer header to synchronize access to the buffer: when the header
+generation numbers,
+.Vt bzh_kernel_gen
+and
+.Vt bzh_user_gen ,
+hold the same value, the kernel owns the buffer, and when they differ,
+userspace owns the buffer.
+.Pp
+While the kernel owns the buffer, the contents are unstable and may change
+asynchronously; while the user process owns the buffer, its contents are
+stable and will not be changed until the buffer has been acknowledged.
+.Pp
+Initializing the buffer headers to all 0's before registering the buffer has
+the effect of assigning initial ownership of both buffers to the kernel.
+The kernel signals that a buffer has been assigned to userspace by modifying
+.Vt bzh_kernel_gen ,
+and userspace acknowledges the buffer and returns it to the kernel by setting
+the value of
+.Vt bzh_user_gen
+to the value of
+.Vt bzh_kernel_gen .
+.Pp
+In order to avoid caching and memory re-ordering effects, the user process
+must use atomic operations and memory barriers when checking for and
+acknowledging buffers:
+.Bd -literal
+#include <machine/atomic.h>
+
+/*
+ * Return ownership of a buffer to the kernel for reuse.
+ */
+static void
+buffer_acknowledge(struct bpf_zbuf_header *bzh)
+{
+
+	atomic_store_rel_int(&bzh->bzh_user_gen, bzh->bzh_kernel_gen);
+}
+
+/*
+ * Check whether a buffer has been assigned to userspace by the kernel.
+ * Return true if userspace owns the buffer, and false otherwise.
+ */
+static int
+buffer_check(struct bpf_zbuf_header *bzh)
+{
+
+	return (bzh->bzh_user_gen !=
+	    atomic_load_acq_int(&bzh->bzh_kernel_gen));
+}
+.Ed
+.Pp
+The user process may force the assignment of the next buffer, if any data
+is pending, to userspace using the
+.Dv BIOCROTZBUF
+ioctl.
+This allows the user process to retrieve data in a partially filled buffer
+before the buffer is full, such as following a timeout; the process must
+recheck for buffer ownership using the header generation numbers, as the
+buffer will not be assigned to userspace if no data was present.
+.Pp
+As in the buffered read mode,
+.Xr kqueue 2 ,
+.Xr poll 2 ,
+and
+.Xr select 2
+may be used to sleep awaiting the availability of a completed buffer.
+They will return a readable file descriptor when ownership of the next buffer
+is assigned to user space.
+.Pp
+In the current implementation, the kernel will assign ownership of at most
+one buffer at a time to the user process.
+The user process must acknowledge the current buffer in order to be
+notified that the next buffer is ready for processing.
+Programs should not rely on this as an invariant, as it may change in future
+versions; in particular, they must maintain their own notion of which buffer
+is "next" so that if both buffers are owned by userspace, they can be
+processed in the correct order.
 .Sh IOCTLS
 The
 .Xr ioctl 2
@@ -127,7 +300,7 @@ file.
 The (third) argument to
 .Xr ioctl 2
 should be a pointer to the type indicated.
-.Bl -tag -width BIOCGRTIMEOUT
+.Bl -tag -width BIOCGETBUFMODE
 .It Dv BIOCGBLEN
 .Pq Li u_int
 Returns the required buffer length for reads on
@@ -349,10 +522,55 @@ descriptor.
 This prevents the execution of
 ioctl commands which could change the underlying operating parameters of
 the device.
+.It Dv BIOCGETBUFMODE
+.It Dv BIOCSETBUFMODE
+.Pq Li u_int
+Get or set the current
+.Nm
+buffering mode; possible values are
+.Dv BPF_BUFMODE_BUFFER ,
+buffered read mode, and
+.Dv BPF_BUFMODE_ZBUF ,
+zero-copy buffer mode.
+.It Dv BIOCSETZBUF
+.Pq Li struct bpf_zbuf
+Set the current zero-copy buffer locations; buffer locations may be
+set only once zero-copy buffer mode has been selected, and prior to attaching
+to an interface.
+Buffers must be of identical size, page-aligned, and an integer multiple of
+pages in size.
+The three fields
+.Vt bz_bufa ,
+.Vt bz_bufb ,
+and
+.Vt bz_buflen
+must be filled out.
+If buffers have already been set for this device, the ioctl will fail.
+.It Dv BIOCGETZMAX
+.Pq Li size_t
+Get the largest individual zero-copy buffer size allowed.
+As two buffers are used in zero-copy buffer mode, the limit (in practice) is
+twice the returned size.
+As zero-copy buffers consume kernel address space, conservative selection of
+buffer size is suggested, especially when there are multiple
+.Nm
+descriptors in use on 32-bit systems.
+.It Dv BIOCROTZBUF
+Force ownership of the next buffer to be assigned to userspace, if any data
+is present in the buffer.
+If no data is present, the buffer will remain owned by the kernel.
+This allows consumers of zero-copy buffering to implement timeouts and
+retrieve partially filled buffers.
+In order to handle the case where no data is present in the buffer and
+therefore ownership is not assigned, the user process must check
+.Vt bzh_kernel_gen
+against
+.Vt bzh_user_gen .
 .El
 .Sh BPF HEADER
 The following structure is prepended to each packet returned by
-.Xr read 2 :
+.Xr read 2
+or via a zero-copy buffer:
 .Bd -literal
 struct bpf_hdr {
 	struct timeval bh_tstamp;	/* time stamp */
@@ -718,6 +936,9 @@ struct bpf_insn insns[] = {
 .Sh SEE ALSO
 .Xr tcpdump 1 ,
 .Xr ioctl 2 ,
+.Xr kqueue 2 ,
+.Xr poll 2 ,
+.Xr select 2 ,
 .Xr byteorder 3 ,
 .Xr ng_bpf 4 ,
 .Xr bpf 9
@@ -750,6 +971,10 @@ of Lawrence Berkeley Laboratory, implemented BPF in
 Summer 1990.
 Much of the design is due to
 .An Van Jacobson .
+.Pp
+Support for zero-copy buffers was added by
+.An Robert N. M. Watson
+under contract to Seccuris Inc.
 .Sh BUGS
 The read buffer must be of a fixed size (returned by the
 .Dv BIOCGBLEN
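The setup sequence implied by the new man page sections above can be illustrated with a minimal userspace sketch; the helper name is hypothetical, error handling is abbreviated, and a fixed four-page buffer size is assumed rather than negotiated against BIOCGETZMAX:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <net/if.h>
#include <net/bpf.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/*
 * Open a BPF device in zero-copy mode and register two page-aligned
 * buffers.  Buffer mode and buffers must be configured before BIOCSETIF.
 */
static int
zbuf_open(const char *ifname, struct bpf_zbuf *zb)
{
	u_int bufmode = BPF_BUFMODE_ZBUF;
	struct ifreq ifr;
	size_t zmax;
	int fd;

	if ((fd = open("/dev/bpf0", O_RDWR)) < 0)
		return (-1);
	if (ioctl(fd, BIOCSETBUFMODE, &bufmode) < 0)
		return (-1);
	if (ioctl(fd, BIOCGETZMAX, &zmax) < 0)
		return (-1);
	/*
	 * mmap(2) returns page-aligned, zero-filled memory, so the headers
	 * start as all 0's and the kernel initially owns both buffers.
	 */
	zb->bz_buflen = 4 * getpagesize();	/* Assumed <= zmax. */
	zb->bz_bufa = mmap(NULL, zb->bz_buflen, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	zb->bz_bufb = mmap(NULL, zb->bz_buflen, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (zb->bz_bufa == MAP_FAILED || zb->bz_bufb == MAP_FAILED)
		return (-1);
	if (ioctl(fd, BIOCSETZBUF, zb) < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	if (ioctl(fd, BIOCSETIF, &ifr) < 0)
		return (-1);
	return (fd);
}

A consumer would then select(2) on the descriptor, test the shared header with buffer_check() from the man page example, process records laid out as in buffered read mode, release the buffer with buffer_acknowledge(), alternate between bz_bufa and bz_bufb, and use BIOCROTZBUF to reclaim a partially filled buffer on timeout.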
diff --git a/sys/conf/files b/sys/conf/files
index 2bd8a2368b2c..eac57fa58ee3 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1633,8 +1633,10 @@ libkern/strtoul.c	standard
 libkern/strtouq.c	standard
 libkern/strvalid.c	standard
 net/bpf.c	standard
+net/bpf_buffer.c	optional bpf
 net/bpf_jitter.c	optional bpf_jitter
 net/bpf_filter.c	optional bpf | netgraph_bpf
+net/bpf_zerocopy.c	optional bpf
 net/bridgestp.c	optional bridge | if_bridge
 net/bsd_comp.c	optional ppp_bsdcomp
 net/ieee8023ad_lacp.c	optional lagg
diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index 49961754d690..433cc7aa4649 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -66,9 +66,11 @@ __FBSDID("$FreeBSD$");
 #include <net/if.h>
 #include <net/bpf.h>
+#include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
+#include <net/bpf_zerocopy.h>
 #include <net/bpfdesc.h>
 
 #include <netinet/in.h>
@@ -80,7 +82,7 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
-static MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
+MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
@@ -98,19 +100,17 @@ static LIST_HEAD(, bpf_if)	bpf_iflist;
 static struct mtx	bpf_mtx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
-static void	bpf_allocbufs(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_freed(struct bpf_d *);
-static void	bpf_mcopy(const void *, void *, size_t);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_insn *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
-static void	catchpacket(struct bpf_d *, u_char *, u_int,
-		    u_int, void (*)(const void *, void *, size_t),
+static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
+		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
 		    struct timeval *);
 static void	reset_d(struct bpf_d *);
 static int	 bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
@@ -123,15 +123,12 @@ static void	bpf_clone(void *, struct ucred *, char *, int,
 		    struct cdev **);
 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
-static int bpf_bufsize = 4096;
-SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
-    &bpf_bufsize, 0, "Default bpf buffer size");
-static int bpf_maxbufsize = BPF_MAXBUFSIZE;
-SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
-    &bpf_maxbufsize, 0, "Maximum bpf buffer size");
 static int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
+static int bpf_zerocopy_enable = 0;
+SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
    &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
 SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW,
     bpf_stats_sysctl, "bpf statistics portal");
 
@@ -158,6 +155,146 @@ static struct cdevsw bpf_cdevsw = {
 static struct filterops bpfread_filtops =
 	{ 1, NULL, filt_bpfdetach, filt_bpfread };
 
+/*
+ * Wrapper functions for various buffering methods.  If the set of buffer
+ * modes expands, we will probably want to introduce a switch data structure
+ * similar to protosw, etc.
+ */ +static void +bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_bytes(d, buf, offset, src, len)); + + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); + + default: + panic("bpf_buf_append_bytes"); + } +} + +static void +bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); + + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); + + default: + panic("bpf_buf_append_mbuf"); + } +} + +/* + * If the buffer mechanism has a way to decide that a held buffer can be made + * free, then it is exposed via the bpf_canfreebuf() interface. (1) is + * returned if the buffer can be discarded, (0) is returned if it cannot. + */ +static int +bpf_canfreebuf(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_canfreebuf(d)); + } + return (0); +} + +void +bpf_bufheld(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + bpf_zerocopy_bufheld(d); + break; + } +} + +static void +bpf_free(struct bpf_d *d) +{ + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_free(d)); + + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_free(d)); + + default: + panic("bpf_buf_free"); + } +} + +static int +bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_uiomove(d, buf, len, uio)); +} + +static int +bpf_ioctl_sblen(struct bpf_d *d, u_int *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_ioctl_sblen(d, i)); +} + +static int +bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_getzmax(td, d, i)); +} + +static int +bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); +} + +static int +bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); +} + +/* + * General BPF functions. + */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) @@ -412,7 +549,14 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) "bpf%d", dev2unit(dev)); MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO); dev->si_drv1 = d; - d->bd_bufsize = bpf_bufsize; + + /* + * For historical reasons, perform a one-time initialization call to + * the buffer routines, even though we're not yet committed to a + * particular buffer method. + */ + bpf_buffer_init(d); + d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_pid = td->td_proc->p_pid; @@ -459,18 +603,6 @@ bpfclose(struct cdev *dev, int flags, int fmt, struct thread *td) return (0); } - -/* - * Rotate the packet buffers in descriptor d. 
Move the store buffer - * into the hold slot, and the free buffer into the store slot. - * Zero the length of the new store buffer. - */ -#define ROTATE_BUFFERS(d) \ - (d)->bd_hbuf = (d)->bd_sbuf; \ - (d)->bd_hlen = (d)->bd_slen; \ - (d)->bd_sbuf = (d)->bd_fbuf; \ - (d)->bd_slen = 0; \ - (d)->bd_fbuf = NULL; /* * bpfread - read next chunk of packets from buffers */ @@ -490,6 +622,10 @@ bpfread(struct cdev *dev, struct uio *uio, int ioflag) BPFD_LOCK(d); d->bd_pid = curthread->td_proc->p_pid; + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { + BPFD_UNLOCK(d); + return (EOPNOTSUPP); + } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); @@ -567,7 +703,7 @@ bpfread(struct cdev *dev, struct uio *uio, int ioflag) * issues a read on the same fd at the same time? Don't want this * getting invalidated. */ - error = uiomove(d->bd_hbuf, d->bd_hlen, uio); + error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); d->bd_fbuf = d->bd_hbuf; @@ -613,6 +749,20 @@ bpf_timed_out(void *arg) } static int +bpf_ready(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + if (!bpf_canfreebuf(d) && d->bd_hlen != 0) + return (1); + if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && + d->bd_slen != 0) + return (1); + return (0); +} + +static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d = dev->si_drv1; @@ -622,25 +772,34 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) int error, hlen; d->bd_pid = curthread->td_proc->p_pid; - if (d->bd_bif == NULL) + d->bd_wcount++; + if (d->bd_bif == NULL) { + d->bd_wdcount++; return (ENXIO); + } ifp = d->bd_bif->bif_ifp; - if ((ifp->if_flags & IFF_UP) == 0) + if ((ifp->if_flags & IFF_UP) == 0) { + d->bd_wdcount++; return (ENETDOWN); + } - if (uio->uio_resid == 0) + if (uio->uio_resid == 0) { + d->bd_wdcount++; return (0); + } bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, &m, &dst, &hlen, d->bd_wfilter); - if (error) + if (error) { + d->bd_wdcount++; return (error); - + } + d->bd_wfcount++; if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; @@ -667,6 +826,8 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) #endif error = (*ifp->if_output)(ifp, m, &dst, NULL); + if (error) + d->bd_wdcount++; if (mc != NULL) { if (error == 0) @@ -697,6 +858,10 @@ reset_d(struct bpf_d *d) d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; + d->bd_wcount = 0; + d->bd_wfcount = 0; + d->bd_wdcount = 0; + d->bd_zcopy = 0; } /* @@ -721,6 +886,11 @@ reset_d(struct bpf_d *d) * BIOCSDIRECTION Set packet direction flag * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. + * BIOCSETZBUF Set current zero-copy buffer locations. + * BIOCGETZMAX Get maximum zero-copy buffer size. + * BIOCROTZBUF Force rotation of zero-copy buffer + * BIOCSETBUFMODE Set buffer mode. + * BIOCGETBUFMODE Get current buffer mode. */ /* ARGSUSED */ static int @@ -758,6 +928,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, case BIOCSRTIMEOUT: case BIOCIMMEDIATE: case TIOCGPGRP: + case BIOCROTZBUF: break; default: return (EPERM); @@ -810,17 +981,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, * Set buffer length. 
 	 */
 	case BIOCSBLEN:
-		if (d->bd_bif != NULL)
-			error = EINVAL;
-		else {
-			u_int size = *(u_int *)addr;
-
-			if (size > bpf_maxbufsize)
-				*(u_int *)addr = size = bpf_maxbufsize;
-			else if (size < BPF_MINBUFSIZE)
-				*(u_int *)addr = size = BPF_MINBUFSIZE;
-			d->bd_bufsize = size;
-		}
+		error = bpf_ioctl_sblen(d, (u_int *)addr);
 		break;
 
 	/*
@@ -945,6 +1106,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
 		{
 			struct bpf_stat *bs = (struct bpf_stat *)addr;
 
+			/* XXXCSJP overflow */
 			bs->bs_recv = d->bd_rcount;
 			bs->bs_drop = d->bd_dcount;
 			break;
@@ -1055,6 +1217,50 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
 	case BIOCGRSIG:
 		*(u_int *)addr = d->bd_sig;
 		break;
+
+	case BIOCGETBUFMODE:
+		*(u_int *)addr = d->bd_bufmode;
+		break;
+
+	case BIOCSETBUFMODE:
+		/*
+		 * Allow the buffering mode to be changed as long as we
+		 * haven't yet committed to a particular mode.  Our
+		 * definition of commitment, for now, is whether or not a
+		 * buffer has been allocated or an interface attached, since
+		 * that's the point where things get tricky.
+		 */
+		switch (*(u_int *)addr) {
+		case BPF_BUFMODE_BUFFER:
+			break;
+
+		case BPF_BUFMODE_ZBUF:
+			if (bpf_zerocopy_enable)
+				break;
+			/* FALLTHROUGH */
+
+		default:
+			return (EINVAL);
+		}
+
+		BPFD_LOCK(d);
+		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
+		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
+			BPFD_UNLOCK(d);
+			return (EBUSY);
+		}
+		d->bd_bufmode = *(u_int *)addr;
+		BPFD_UNLOCK(d);
+		break;
+
+	case BIOCGETZMAX:
+		return (bpf_ioctl_getzmax(td, d, (size_t *)addr));
+
+	case BIOCSETZBUF:
+		return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCROTZBUF:
+		return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr));
 	}
 	return (error);
 }
@@ -1155,13 +1361,31 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 		return (ENXIO);
 
 	bp = theywant->if_bpf;
+
 	/*
-	 * Allocate the packet buffers if we need to.
-	 * If we're already attached to requested interface,
-	 * just flush the buffer.
+	 * Behavior here depends on the buffering model.  If we're using
+	 * kernel memory buffers, then we can allocate them here.  If we're
+	 * using zero-copy, then the user process must have registered
+	 * buffers by the time we get here.  If not, return an error.
+	 *
+	 * XXXRW: There are locking issues here with multi-threaded use: what
+	 * if two threads try to set the interface at once?
 	 */
-	if (d->bd_sbuf == NULL)
-		bpf_allocbufs(d);
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		if (d->bd_sbuf == NULL)
+			bpf_buffer_alloc(d);
+		KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
+		break;
+
+	case BPF_BUFMODE_ZBUF:
+		if (d->bd_sbuf == NULL)
+			return (EINVAL);
+		break;
+
+	default:
+		panic("bpf_setif: bufmode %d", d->bd_bufmode);
+	}
 	if (bp != d->bd_bif) {
 		if (d->bd_bif)
 			/*
@@ -1305,37 +1529,14 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 #ifdef MAC
 		if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
-			catchpacket(d, pkt, pktlen, slen, bcopy, &tv);
+			catchpacket(d, pkt, pktlen, slen,
+			    bpf_append_bytes, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
 	BPFIF_UNLOCK(bp);
 }
 
-/*
- * Copy data from an mbuf chain into a buffer.  This code is derived
- * from m_copydata in sys/uipc_mbuf.c.
- */
-static void
-bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
-{
-	const struct mbuf *m;
-	u_int count;
-	u_char *dst;
-
-	m = src_arg;
-	dst = dst_arg;
-	while (len > 0) {
-		if (m == NULL)
-			panic("bpf_mcopy");
-		count = min(m->m_len, len);
-		bcopy(mtod(m, void *), dst, count);
-		m = m->m_next;
-		dst += count;
-		len -= count;
-	}
-}
-
 #define	BPF_CHECK_DIRECTION(d, m) \
 	if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \
 	    ((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL))
@@ -1385,7 +1586,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1440,7 +1641,7 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)&mb, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1453,19 +1654,34 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 /*
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
- * transfer.  bcopy is passed in to copy contiguous chunks, while
- * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
+ * transfer.  bpf_append_bytes is passed in to copy contiguous chunks, while
+ * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
  * pkt is really an mbuf.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
-    void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
+    void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
+    struct timeval *tv)
 {
-	struct bpf_hdr *hp;
+	struct bpf_hdr hdr;
 	int totlen, curlen;
 	int hdrlen = d->bd_bif->bif_hdrlen;
 	int do_wakeup = 0;
 
 	BPFD_LOCK_ASSERT(d);
+
+	/*
+	 * Detect whether user space has released a buffer back to us, and if
+	 * so, move it from being a hold buffer to a free buffer.  This may
+	 * not be the best place to do it (for example, we might only want to
+	 * run this check if we need the space), but for now it's a reliable
+	 * spot to do it.
+	 */
+	if (bpf_canfreebuf(d)) {
+		d->bd_fbuf = d->bd_hbuf;
+		d->bd_hbuf = NULL;
+		d->bd_hlen = 0;
+	}
+
 	/*
 	 * Figure out how many bytes to move.  If the packet is
 	 * greater or equal to the snapshot length, transfer that
@@ -1500,23 +1716,27 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		/*
-		 * Immediate mode is set, or the read timeout has
-		 * already expired during a select call.  A packet
-		 * arrived, so the reader should be woken up.
+		 * Immediate mode is set, or the read timeout has already
+		 * expired during a select call.  A packet arrived, so the
+		 * reader should be woken up.
 		 */
 		do_wakeup = 1;
 
 	/*
-	 * Append the bpf header.
+	 * Append the bpf header.  Note we append the actual header size, but
+	 * move forward the length of the header plus padding.
 	 */
-	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
-	hp->bh_tstamp = *tv;
-	hp->bh_datalen = pktlen;
-	hp->bh_hdrlen = hdrlen;
+	bzero(&hdr, sizeof(hdr));
+	hdr.bh_tstamp = *tv;
+	hdr.bh_datalen = pktlen;
+	hdr.bh_hdrlen = hdrlen;
+	hdr.bh_caplen = totlen - hdrlen;
+	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
+
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
-	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
+	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
 	d->bd_slen = curlen + totlen;
 
 	if (do_wakeup)
@@ -1524,41 +1744,19 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
 }
 
 /*
- * Initialize all nonzero fields of a descriptor.
- */
-static void
-bpf_allocbufs(struct bpf_d *d)
-{
-
-	KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL"));
-	KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL"));
-	KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL"));
-
-	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_slen = 0;
-	d->bd_hlen = 0;
-}
-
-/*
  * Free buffers currently in use by a descriptor.
  * Called on close.
  */
 static void
 bpf_freed(struct bpf_d *d)
 {
+
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it yet hasn't been marked
 	 * free.
 	 */
-	if (d->bd_sbuf != NULL) {
-		free(d->bd_sbuf, M_BPF);
-		if (d->bd_hbuf != NULL)
-			free(d->bd_hbuf, M_BPF);
-		if (d->bd_fbuf != NULL)
-			free(d->bd_fbuf, M_BPF);
-	}
+	bpf_free(d);
 	if (d->bd_rfilter) {
 		free((caddr_t)d->bd_rfilter, M_BPF);
#ifdef BPF_JITTER
@@ -1762,6 +1960,7 @@ bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
 
 	bzero(d, sizeof(*d));
 	BPFD_LOCK_ASSERT(bd);
+	d->bd_structsize = sizeof(*d);
 	d->bd_immediate = bd->bd_immediate;
 	d->bd_promisc = bd->bd_promisc;
 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
@@ -1779,6 +1978,11 @@ bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
 	strlcpy(d->bd_ifname,
 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
 	d->bd_locked = bd->bd_locked;
+	d->bd_wcount = bd->bd_wcount;
+	d->bd_wdcount = bd->bd_wdcount;
+	d->bd_wfcount = bd->bd_wfcount;
+	d->bd_zcopy = bd->bd_zcopy;
+	d->bd_bufmode = bd->bd_bufmode;
 }
 
 static int
diff --git a/sys/net/bpf.h b/sys/net/bpf.h
index 91ea0f6827d2..1d6f9db7415d 100644
--- a/sys/net/bpf.h
+++ b/sys/net/bpf.h
@@ -92,6 +92,27 @@ struct bpf_version {
 #define BPF_MAJOR_VERSION 1
 #define BPF_MINOR_VERSION 1
 
+/*
+ * Historically, BPF has supported a single buffering model, first using mbuf
+ * clusters in kernel, and later using malloc(9) buffers in kernel.  We now
+ * support multiple buffering modes, which may be queried and set using
+ * BIOCGETBUFMODE and BIOCSETBUFMODE.  So as to avoid handling the complexity
+ * of changing modes while sniffing packets, the mode becomes fixed once an
+ * interface has been attached to the BPF descriptor.
+ */
+#define	BPF_BUFMODE_BUFFER	1	/* Kernel buffers with read(). */
+#define	BPF_BUFMODE_ZBUF	2	/* Zero-copy buffers. */
+
+/*-
+ * Struct used by BIOCSETZBUF, BIOCROTZBUF: describes up to two zero-copy
+ * buffers as used by BPF.
+ */
+struct bpf_zbuf {
+	void *bz_bufa;		/* Location of 'a' zero-copy buffer. */
+	void *bz_bufb;		/* Location of 'b' zero-copy buffer. */
+	size_t bz_buflen;	/* Size of zero-copy buffers. */
+};
+
 #define	BIOCGBLEN	_IOR('B',102, u_int)
 #define	BIOCSBLEN	_IOWR('B',102, u_int)
 #define	BIOCSETF	_IOW('B',103, struct bpf_program)
@@ -116,6 +137,11 @@ struct bpf_version {
 #define	BIOCLOCK	_IO('B', 122)
 #define	BIOCSETWF	_IOW('B',123, struct bpf_program)
 #define	BIOCFEEDBACK	_IOW('B',124, u_int)
+#define	BIOCGETBUFMODE	_IOR('B',125, u_int)
+#define	BIOCSETBUFMODE	_IOW('B',126, u_int)
+#define	BIOCGETZMAX	_IOR('B',127, size_t)
+#define	BIOCROTZBUF	_IOR('B',128, struct bpf_zbuf)
+#define	BIOCSETZBUF	_IOW('B',129, struct bpf_zbuf)
 
 /* Obsolete */
 #define	BIOCGSEESENT	BIOCGDIRECTION
@@ -149,6 +175,24 @@ struct bpf_hdr {
 #endif
 
 /*
+ * When using zero-copy BPF buffers, a shared memory header is present
+ * allowing the kernel BPF implementation and user process to synchronize
+ * without using system calls.  This structure defines that header.  When
+ * accessing these fields, appropriate atomic operation and memory barriers
+ * are required in order not to see stale or out-of-order data; see bpf(4)
+ * for reference code to access these fields from userspace.
+ *
+ * The layout of this structure is critical, and must not be changed; it must
+ * fit in a single page on all architectures.
+ */
+struct bpf_zbuf_header {
+	volatile u_int	bzh_kernel_gen;	/* Kernel generation number. */
+	volatile u_int	bzh_kernel_len;	/* Length of data in the buffer. */
+	volatile u_int	bzh_user_gen;	/* User generation number. */
+	u_int _bzh_pad[5];
+};
+
+/*
  * Data-link level type codes.
  */
 #define	DLT_NULL	0	/* BSD loopback encapsulation */
@@ -761,6 +805,27 @@ struct bpf_dltlist {
 };
 
 #ifdef _KERNEL
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_BPF);
+#endif
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_bpf);
+#endif
+
+/*
+ * Rotate the packet buffers in descriptor d.  Move the store buffer into the
+ * hold slot, and the free buffer into the store slot.  Zero the length of the
+ * new store buffer.  Descriptor lock should be held.
+ */
+#define	ROTATE_BUFFERS(d)	do {					\
+	(d)->bd_hbuf = (d)->bd_sbuf;					\
+	(d)->bd_hlen = (d)->bd_slen;					\
+	(d)->bd_sbuf = (d)->bd_fbuf;					\
+	(d)->bd_slen = 0;						\
+	(d)->bd_fbuf = NULL;						\
+	bpf_bufheld(d);							\
+} while (0)
+
 /*
  * Descriptor associated with each attached hardware interface.
  */
@@ -773,6 +838,7 @@ struct bpf_if {
 	struct mtx	bif_mtx;	/* mutex for interface */
 };
 
+void	 bpf_bufheld(struct bpf_d *d);
 int	 bpf_validate(const struct bpf_insn *, int);
 void	 bpf_tap(struct bpf_if *, u_char *, u_int);
 void	 bpf_mtap(struct bpf_if *, struct mbuf *);
diff --git a/sys/net/bpf_buffer.c b/sys/net/bpf_buffer.c
new file mode 100644
index 000000000000..f07e9486cbd6
--- /dev/null
+++ b/sys/net/bpf_buffer.c
@@ -0,0 +1,210 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (c) 1990, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpf_buffer.h>
+#include <net/bpfdesc.h>
+
+/*
+ * Implement historical kernel memory buffering model for BPF: two malloc(9)
+ * kernel buffers are hung off of the descriptor.  The size is fixed prior to
+ * attaching to an ifnet, and cannot be changed after that.  read(2) simply
+ * copies the data to user space using uiomove(9).
+ */
+
+static int bpf_bufsize = 4096;
+SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
+    &bpf_bufsize, 0, "Default bpf buffer size");
+static int bpf_maxbufsize = BPF_MAXBUFSIZE;
+SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
+    &bpf_maxbufsize, 0, "Maximum bpf buffer size");
+
+void
+bpf_buffer_alloc(struct bpf_d *d)
+{
+
+	KASSERT(d->bd_fbuf == NULL, ("bpf_buffer_alloc: bd_fbuf != NULL"));
+	KASSERT(d->bd_sbuf == NULL, ("bpf_buffer_alloc: bd_sbuf != NULL"));
+	KASSERT(d->bd_hbuf == NULL, ("bpf_buffer_alloc: bd_hbuf != NULL"));
+
+	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+	d->bd_hbuf = NULL;
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+}
+
+/*
+ * Simple data copy to the current kernel buffer.
+ */
+void
+bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_char *src_bytes;
+
+	src_bytes = (u_char *)src;
+	bcopy(src_bytes, buf + offset, len);
+}
+
+/*
+ * Scatter-gather data copy from an mbuf chain to the current kernel buffer.
+ */
+void
+bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
+{
+	const struct mbuf *m;
+	u_char *dst;
+	u_int count;
+
+	m = (struct mbuf *)src;
+	dst = (u_char *)buf + offset;
+	while (len > 0) {
+		if (m == NULL)
+			panic("bpf_buffer_append_mbuf");
+		count = min(m->m_len, len);
+		bcopy(mtod(m, void *), dst, count);
+		m = m->m_next;
+		dst += count;
+		len -= count;
+	}
+}
+
+/*
+ * Free BPF kernel buffers on device close.
+ */
+void
+bpf_buffer_free(struct bpf_d *d)
+{
+
+	if (d->bd_sbuf != NULL)
+		free(d->bd_sbuf, M_BPF);
+	if (d->bd_hbuf != NULL)
+		free(d->bd_hbuf, M_BPF);
+	if (d->bd_fbuf != NULL)
+		free(d->bd_fbuf, M_BPF);
+
+#ifdef INVARIANTS
+	d->bd_sbuf = d->bd_hbuf = d->bd_fbuf = (caddr_t)~0;
+#endif
+}
+
+/*
+ * This is a historical initialization that occurs when the BPF descriptor is
+ * first opened.  It does not imply selection of a buffer mode, so we don't
+ * allocate buffers here.
+ */
+void
+bpf_buffer_init(struct bpf_d *d)
+{
+
+	d->bd_bufsize = bpf_bufsize;
+}
+
+/*
+ * Allocate or resize buffers.
+ */
+int
+bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i)
+{
+	u_int size;
+
+	BPFD_LOCK(d);
+	if (d->bd_bif != NULL) {
+		BPFD_UNLOCK(d);
+		return (EINVAL);
+	}
+	size = *i;
+	if (size > bpf_maxbufsize)
+		*i = size = bpf_maxbufsize;
+	else if (size < BPF_MINBUFSIZE)
+		*i = size = BPF_MINBUFSIZE;
+	d->bd_bufsize = size;
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Copy buffer storage to user space in read().
+ */
+int
+bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+	return (uiomove(buf, len, uio));
+}
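For contrast with the zero-copy path, the userspace side of the buffered read mode implemented above can be sketched as follows; the handler callback is hypothetical and error handling is abbreviated. BIOCSBLEN must be issued before the descriptor is attached to an interface (bpf_buffer_ioctl_sblen() above returns EINVAL otherwise), and the possibly clamped size must be reused for read(2):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <stdlib.h>
#include <unistd.h>

static void
buffered_read_loop(int fd, void (*handler)(const struct bpf_hdr *))
{
	u_int blen = 32768;	/* May be clamped by net.bpf.maxbufsize. */
	ssize_t n;
	char *buf, *p;

	if (ioctl(fd, BIOCSBLEN, &blen) < 0)
		return;
	if ((buf = malloc(blen)) == NULL)
		return;
	/*
	 * Each read(2) returns a group of packets, each preceded by a
	 * bpf_hdr and padded so that records are word-aligned.
	 */
	while ((n = read(fd, buf, blen)) > 0) {
		for (p = buf; p < buf + n;) {
			const struct bpf_hdr *bh = (const void *)p;

			handler(bh);
			p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
		}
	}
	free(buf);
}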
diff --git a/sys/net/bpf_buffer.h b/sys/net/bpf_buffer.h
new file mode 100644
index 000000000000..82d0310b4d44
--- /dev/null
+++ b/sys/net/bpf_buffer.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_BUFFER_H_
+#define	_NET_BPF_BUFFER_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void	bpf_buffer_alloc(struct bpf_d *d);
+void	bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_buffer_free(struct bpf_d *d);
+void	bpf_buffer_init(struct bpf_d *d);
+int	bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i);
+int	bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+	    struct uio *uio);
+
+#endif /* !_NET_BPF_BUFFER_H_ */
diff --git a/sys/net/bpf_zerocopy.c b/sys/net/bpf_zerocopy.c
new file mode 100644
index 000000000000..896ad1da29f4
--- /dev/null
+++ b/sys/net/bpf_zerocopy.c
@@ -0,0 +1,510 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sf_buf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#include <machine/atomic.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpfdesc.h>
+#include <net/bpf_zerocopy.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
+/*
+ * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
+ * are mapped into the kernel address space using sf_bufs and used directly
+ * by BPF.  Memory is wired since page faults cannot be tolerated in the
+ * contexts where the buffers are copied to (locks held, interrupt context,
+ * etc).  Access to shared memory buffers is synchronized using a header on
+ * each buffer, allowing the number of system calls to go to zero as BPF
+ * reaches saturation (buffers filled as fast as they can be drained by the
+ * user process).  Full details of the protocol for communicating between the
+ * user process and BPF may be found in bpf(4).
+ */
+
+/*
+ * Maximum number of pages per buffer.  Since all BPF devices use two, the
+ * maximum per device is 2*BPF_MAX_PAGES.  Resource limits on the number of
+ * sf_bufs may be an issue, so do not set this too high.  On older systems,
+ * kernel address space limits may also be an issue.
+ */
+#define	BPF_MAX_PAGES	512
+
+/*
+ * struct zbuf describes a memory buffer loaned by a user process to the
+ * kernel.  We represent this as a series of pages managed using an array of
+ * sf_bufs.  Even though the memory is contiguous in user space, it may not
+ * be mapped contiguously in the kernel (i.e., a set of physically
+ * non-contiguous pages in the direct map region) so we must implement
+ * scatter-gather copying.  One significant mitigating factor is that on
+ * systems with a direct memory map, we can avoid TLB misses.
+ *
+ * At the front of the shared memory region is a bpf_zbuf_header, which
+ * contains shared control data to allow user space and the kernel to
+ * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF
+ * knows that the space is not available.
+ */
+struct zbuf {
+	vm_offset_t	 zb_uaddr;	/* User address, may be stale. */
+	size_t		 zb_size;	/* Size of buffer, incl. header. */
+	u_int		 zb_numpages;	/* Number of pages. */
+	struct sf_buf	**zb_pages;	/* Pages themselves. */
+	struct bpf_zbuf_header	*zb_header;	/* Shared header. */
+};
+
+/*
+ * Release a page we've previously wired.
+ */
+static void
+zbuf_page_free(vm_page_t pp)
+{
+
+	vm_page_lock_queues();
+	vm_page_unwire(pp, 0);
+	if (pp->wire_count == 0 && pp->object == NULL)
+		vm_page_free(pp);
+	vm_page_unlock_queues();
+}
+
+/*
+ * Free an sf_buf with attached page.
+ */
+static void
+zbuf_sfbuf_free(struct sf_buf *sf)
+{
+	vm_page_t pp;
+
+	pp = sf_buf_page(sf);
+	sf_buf_free(sf);
+	zbuf_page_free(pp);
+}
+
+/*
+ * Free a zbuf, including its page array, sbufs, and pages.  Allow partially
+ * allocated zbufs to be freed so that it may be used even during a zbuf
+ * setup.
+ */
+static void
+zbuf_free(struct zbuf *zb)
+{
+	int i;
+
+	for (i = 0; i < zb->zb_numpages; i++) {
+		if (zb->zb_pages[i] != NULL)
+			zbuf_sfbuf_free(zb->zb_pages[i]);
+	}
+	free(zb->zb_pages, M_BPF);
+	free(zb, M_BPF);
+}
+
+/*
+ * Given a user pointer to a page of user memory, return an sf_buf for the
+ * page.  Because we may be requesting quite a few sf_bufs, prefer failure to
+ * deadlock and use SFB_NOWAIT.
+ */
+static struct sf_buf *
+zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
+{
+	struct sf_buf *sf;
+	vm_page_t pp;
+
+	if (vm_fault_quick((caddr_t) uaddr, VM_PROT_READ | VM_PROT_WRITE) <
+	    0)
+		return (NULL);
+	pp = pmap_extract_and_hold(map->pmap, uaddr, VM_PROT_READ |
+	    VM_PROT_WRITE);
+	if (pp == NULL)
+		return (NULL);
+	vm_page_lock_queues();
+	vm_page_wire(pp);
+	vm_page_unhold(pp);
+	vm_page_unlock_queues();
+	sf = sf_buf_alloc(pp, SFB_NOWAIT);
+	if (sf == NULL) {
+		zbuf_page_free(pp);
+		return (NULL);
+	}
+	return (sf);
+}
+
+/*
+ * Create a zbuf describing a range of user address space memory.  Validate
+ * page alignment, size requirements, etc.
+ */
+static int
+zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
+    struct zbuf **zbp)
+{
+	struct zbuf *zb;
+	struct vm_map *map;
+	int error, i;
+
+	*zbp = NULL;
+
+	/*
+	 * User address must be page-aligned.
+	 */
+	if (uaddr & PAGE_MASK)
+		return (EINVAL);
+
+	/*
+	 * Length must be an integer number of full pages.
+	 */
+	if (len & PAGE_MASK)
+		return (EINVAL);
+
+	/*
+	 * Length must not exceed per-buffer resource limit.
+	 */
+	if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
+		return (EINVAL);
+
+	/*
+	 * Allocate the buffer and set up each page with its own sf_buf.
+	 */
+	error = 0;
+	zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
+	zb->zb_uaddr = uaddr;
+	zb->zb_size = len;
+	zb->zb_numpages = len / PAGE_SIZE;
+	zb->zb_pages = malloc(sizeof(struct sf_buf *) *
+	    zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
+	map = &td->td_proc->p_vmspace->vm_map;
+	for (i = 0; i < zb->zb_numpages; i++) {
+		zb->zb_pages[i] = zbuf_sfbuf_get(map,
+		    uaddr + (i * PAGE_SIZE));
+		if (zb->zb_pages[i] == NULL) {
+			error = EFAULT;
+			goto error;
+		}
+	}
+	zb->zb_header =
+	    (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
+	bzero(zb->zb_header, sizeof(*zb->zb_header));
+	*zbp = zb;
+	return (0);
+
+error:
+	zbuf_free(zb);
+	return (error);
+}
+
+/*
+ * Copy bytes from a source into the specified zbuf.  The caller is
+ * responsible for performing bounds checking, etc.
+ */
+void
+bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_int count, page, poffset;
+	u_char *src_bytes;
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_append_bytes: not in zbuf mode"));
+	KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));
+
+	src_bytes = (u_char *)src;
+	zb = (struct zbuf *)buf;
+
+	/*
+	 * Scatter-gather copy to user pages mapped into kernel address space
+	 * using sf_bufs: copy up to a page at a time.
+	 */
+	offset += sizeof(struct bpf_zbuf_header);
+	page = offset / PAGE_SIZE;
+	poffset = offset % PAGE_SIZE;
+	while (len > 0) {
+		KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
+		    " page overflow (%d p %d np)\n", page, zb->zb_numpages));
+
+		count = min(len, PAGE_SIZE - poffset);
+		bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
+		    poffset, count);
+		poffset += count;
+		if (poffset == PAGE_SIZE) {
+			poffset = 0;
+			page++;
+		}
+		KASSERT(poffset < PAGE_SIZE,
+		    ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
+		    poffset));
+		len -= count;
+		src_bytes += count;
+	}
+}
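A worked example of the page-chunking arithmetic above, as a standalone sketch (assumes 4 KB pages and the 32-byte bpf_zbuf_header defined in bpf.h): an append at buffer offset 4000 becomes post-header offset 4032, so 64 bytes land at the end of page 0 and the remaining 36 bytes spill into page 1.

#include <stdio.h>

#define	PAGE_SIZE	4096
#define	ZBUF_HDRLEN	32	/* sizeof(struct bpf_zbuf_header) */

int
main(void)
{
	unsigned int offset = 4000 + ZBUF_HDRLEN, len = 100;
	unsigned int count, page, poffset;

	page = offset / PAGE_SIZE;
	poffset = offset % PAGE_SIZE;
	while (len > 0) {
		/* Copy at most to the end of the current page. */
		count = len < PAGE_SIZE - poffset ? len : PAGE_SIZE - poffset;
		printf("copy %u bytes into page %u at offset %u\n",
		    count, page, poffset);
		poffset += count;
		if (poffset == PAGE_SIZE) {
			poffset = 0;
			page++;
		}
		len -= count;
	}
	return (0);
}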
+/*
+ * Copy bytes from an mbuf chain to the specified zbuf: copying will be
+ * scatter-gather both from mbufs, which may be fragmented over memory, and
+ * to pages, which may not be contiguously mapped in kernel address space.
+ * As with bpf_zerocopy_append_bytes(), the caller is responsible for
+ * checking that this will not exceed the buffer limit.
+ */
+void
+bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_int count, moffset, page, poffset;
+	const struct mbuf *m;
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_append_mbuf not in zbuf mode"));
+	KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));
+
+	m = (struct mbuf *)src;
+	zb = (struct zbuf *)buf;
+
+	/*
+	 * Scatter-gather both from an mbuf chain and to a user page set
+	 * mapped into kernel address space using sf_bufs.  If we're lucky,
+	 * each mbuf requires one copy operation, but if page alignment and
+	 * mbuf alignment work out less well, we'll be doing two copies per
+	 * mbuf.
+	 */
+	offset += sizeof(struct bpf_zbuf_header);
+	page = offset / PAGE_SIZE;
+	poffset = offset % PAGE_SIZE;
+	moffset = 0;
+	while (len > 0) {
+		KASSERT(page < zb->zb_numpages,
+		    ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
+		    "np)\n", page, zb->zb_numpages));
+		KASSERT(m != NULL,
+		    ("bpf_zerocopy_append_mbuf: end of mbuf chain"));
+
+		count = min(m->m_len - moffset, len);
+		count = min(count, PAGE_SIZE - poffset);
+		bcopy(mtod(m, u_char *) + moffset,
+		    ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
+		    count);
+		poffset += count;
+		if (poffset == PAGE_SIZE) {
+			poffset = 0;
+			page++;
+		}
+		KASSERT(poffset < PAGE_SIZE,
+		    ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
+		    poffset));
+		moffset += count;
+		if (moffset == m->m_len) {
+			m = m->m_next;
+			moffset = 0;
+		}
+		len -= count;
+	}
+}
+
+/*
+ * Notification from the BPF framework that a buffer has moved into the held
+ * slot on a descriptor.  Zero-copy BPF will update the shared page to let
+ * the user process know.
+ */
+void
+bpf_zerocopy_bufheld(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_bufheld: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_hbuf;
+	KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
+	zb->zb_header->bzh_kernel_len = d->bd_hlen;
+	atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
+}
+
+/*
+ * Query from the BPF framework regarding whether the buffer currently in the
+ * held position can be moved to the free position, which can be indicated by
+ * the user process making their generation number equal to the kernel
+ * generation number.
+ */
+int
+bpf_zerocopy_canfreebuf(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_canfreebuf: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (zb == NULL)
+		return (0);
+	if (zb->zb_header->bzh_kernel_gen ==
+	    atomic_load_acq_int(&zb->zb_header->bzh_user_gen))
+		return (1);
+	return (0);
+}
+
+/*
+ * Free zero-copy buffers at request of descriptor.
+ */
+void
+bpf_zerocopy_free(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_free: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_sbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+	zb = (struct zbuf *)d->bd_fbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+}
+
+/*
+ * Ioctl to return the maximum buffer size.
+ */
+int
+bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));
+
+	*i = BPF_MAX_PAGES * PAGE_SIZE;
+	return (0);
+}
+
+/*
+ * Ioctl to force rotation of the two buffers, if there's any data available.
+ * This can be used by user space to implement timeouts when waiting for a
+ * buffer to fill.
+ */
+int
+bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *bzh;
+
+	bzero(bz, sizeof(*bz));
+	BPFD_LOCK(d);
+	if (d->bd_hbuf == NULL && d->bd_slen != 0) {
+		ROTATE_BUFFERS(d);
+		bzh = (struct zbuf *)d->bd_hbuf;
+		bz->bz_bufa = (void *)bzh->zb_uaddr;
+		bz->bz_buflen = d->bd_hlen;
+	}
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Ioctl to configure zero-copy buffers -- may be done only once.
+ */
+int
+bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zba, *zbb;
+	int error;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));
+
+	/*
+	 * Must set both buffers.  Cannot clear them.
+	 */
+	if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
+		return (EINVAL);
+
+	/*
+	 * Buffers must have a size greater than 0.  Alignment and other size
+	 * validity checking is done in zbuf_setup().
+	 */
+	if (bz->bz_buflen == 0)
+		return (EINVAL);
+
+	/*
+	 * Allocate new buffers.
+	 */
+	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
+	    &zba);
+	if (error)
+		return (error);
+	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
+	    &zbb);
+	if (error) {
+		zbuf_free(zba);
+		return (error);
+	}
+
+	/*
+	 * We only allow buffers to be installed once, so atomically check
+	 * that no buffers are currently installed and install new buffers.
+	 */
+	BPFD_LOCK(d);
+	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
+	    d->bd_bif != NULL) {
+		BPFD_UNLOCK(d);
+		zbuf_free(zba);
+		zbuf_free(zbb);
+		return (EINVAL);
+	}
+	d->bd_fbuf = (caddr_t)zbb;
+	d->bd_sbuf = (caddr_t)zba;
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+
+	/*
+	 * We expose only the space left in the buffer after the size of the
+	 * shared management region.
+	 */
+	d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
+	BPFD_UNLOCK(d);
+	return (0);
+}
diff --git a/sys/net/bpf_zerocopy.h b/sys/net/bpf_zerocopy.h
new file mode 100644
index 000000000000..33d1f25041d8
--- /dev/null
+++ b/sys/net/bpf_zerocopy.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_BPF_ZEROCOPY_H_ +#define _NET_BPF_ZEROCOPY_H_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +void bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len); +void bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len); +void bpf_zerocopy_bufheld(struct bpf_d *); +int bpf_zerocopy_canfreebuf(struct bpf_d *); +void bpf_zerocopy_free(struct bpf_d *d); +int bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, + size_t *i); +int bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, + struct bpf_zbuf *bz); +int bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d, + struct bpf_zbuf *bz); + +#endif /* !_NET_BPF_ZEROCOPY_H_ */ diff --git a/sys/net/bpfdesc.h b/sys/net/bpfdesc.h index a46013edca43..ad9ab207dbf3 100644 --- a/sys/net/bpfdesc.h +++ b/sys/net/bpfdesc.h @@ -48,10 +48,11 @@ /* * Descriptor associated with each open bpf file. */ +struct zbuf; struct bpf_d { LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ /* - * Buffer slots: two malloc buffers store the incoming packets. + * Buffer slots: two memory buffers store the incoming packets. * The model has three slots. Sbuf is always occupied. * sbuf (store) - Receive interrupt puts packets here. * hbuf (hold) - When sbuf is full, put buffer here and @@ -74,8 +75,8 @@ struct bpf_d { #ifdef BPF_JITTER bpf_jit_filter *bd_bfilter; /* binary filter code */ #endif - u_long bd_rcount; /* number of packets received */ - u_long bd_dcount; /* number of packets dropped */ + u_int64_t bd_rcount; /* number of packets received */ + u_int64_t bd_dcount; /* number of packets dropped */ u_char bd_promisc; /* true if listening promiscuously */ u_char bd_state; /* idle, waiting, or timed out */ @@ -90,9 +91,14 @@ struct bpf_d { struct mtx bd_mtx; /* mutex for this descriptor */ struct callout bd_callout; /* for BPF timeouts with select */ struct label *bd_label; /* MAC label for descriptor */ - u_long bd_fcount; /* number of packets which matched filter */ + u_int64_t bd_fcount; /* number of packets which matched filter */ pid_t bd_pid; /* PID which created descriptor */ int bd_locked; /* true if descriptor is locked */ + u_int bd_bufmode; /* Current buffer mode. */ + u_int64_t bd_wcount; /* number of packets written */ + u_int64_t bd_wfcount; /* number of packets that matched write filter */ + u_int64_t bd_wdcount; /* number of packets dropped during a write */ + u_int64_t bd_zcopy; /* number of zero copy operations */ }; /* Values for bd_state */ @@ -104,25 +110,21 @@ struct bpf_d { #define BPFD_UNLOCK(bd) mtx_unlock(&(bd)->bd_mtx) #define BPFD_LOCK_ASSERT(bd) mtx_assert(&(bd)->bd_mtx, MA_OWNED); -/* Test whether a BPF is ready for read(). 
 */
-#define	bpf_ready(bd)						\
-	((bd)->bd_hlen != 0 ||					\
-	 (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \
-	  (bd)->bd_slen != 0))
-
 /*
  * External representation of the bpf descriptor
  */
 struct xbpf_d {
+	u_int		bd_structsize;	/* Size of this structure. */
 	u_char		bd_promisc;
 	u_char		bd_immediate;
+	u_char		__bd_pad[6];
 	int		bd_hdrcmplt;
 	int		bd_direction;
 	int		bd_feedback;
 	int		bd_async;
-	u_long		bd_rcount;
-	u_long		bd_dcount;
-	u_long		bd_fcount;
+	u_int64_t	bd_rcount;
+	u_int64_t	bd_dcount;
+	u_int64_t	bd_fcount;
 	int		bd_sig;
 	int		bd_slen;
 	int		bd_hlen;
@@ -130,6 +132,16 @@ struct xbpf_d {
 	pid_t		bd_pid;
 	char		bd_ifname[IFNAMSIZ];
 	int		bd_locked;
+	u_int64_t	bd_wcount;
+	u_int64_t	bd_wfcount;
+	u_int64_t	bd_wdcount;
+	u_int64_t	bd_zcopy;
+	int		bd_bufmode;
+	/*
+	 * Allocate four 64-bit unsigned integers for future expansion so we
+	 * do not have to worry about breaking the ABI.
+	 */
+	u_int64_t	bd_spare[4];
 };
 
 #define	BPFIF_LOCK(bif)		mtx_lock(&(bif)->bif_mtx)
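The bd_structsize field added above lets userspace walk the records returned by the net.bpf.stats sysctl without depending on the kernel's exact struct xbpf_d size. A hedged sketch of such a consumer follows (error handling abbreviated, and assuming net.bpf.stats returns a packed array of xbpf_d records, as the bpfstats_fill_xbpf() routine above suggests):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/bpfdesc.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct xbpf_d *xbd;
	size_t len, off;
	char *buf;

	if (sysctlbyname("net.bpf.stats", NULL, &len, NULL, 0) < 0)
		return (1);
	if ((buf = malloc(len)) == NULL)
		return (1);
	if (sysctlbyname("net.bpf.stats", buf, &len, NULL, 0) < 0)
		return (1);
	/* Step by bd_structsize so future growth of xbpf_d is tolerated. */
	for (off = 0; off + sizeof(*xbd) <= len; off += xbd->bd_structsize) {
		xbd = (struct xbpf_d *)(buf + off);
		printf("pid %d if %s recv %ju drop %ju zcopy %ju\n",
		    (int)xbd->bd_pid, xbd->bd_ifname,
		    (uintmax_t)xbd->bd_rcount, (uintmax_t)xbd->bd_dcount,
		    (uintmax_t)xbd->bd_zcopy);
	}
	free(buf);
	return (0);
}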