diff options
Diffstat (limited to 'usr.sbin/bhyve/net_backends.c')
| -rw-r--r-- | usr.sbin/bhyve/net_backends.c | 502 | 
1 files changed, 502 insertions, 0 deletions
| diff --git a/usr.sbin/bhyve/net_backends.c b/usr.sbin/bhyve/net_backends.c new file mode 100644 index 000000000000..95909d1f8ea2 --- /dev/null +++ b/usr.sbin/bhyve/net_backends.c @@ -0,0 +1,502 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file implements multiple network backends (tap, netmap, ...), + * to be used by network frontends such as virtio-net and e1000. + * The API to access the backend (e.g. send/receive packets, negotiate + * features) is exported by net_backends.h. + */ + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/uio.h> + +#include <net/if.h> +#include <net/if_tap.h> + +#include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <pthread.h> +#include <pthread_np.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include "config.h" +#include "debug.h" +#include "iov.h" +#include "mevent.h" +#include "net_backends.h" +#include "net_backends_priv.h" +#include "pci_emul.h" + +#define	NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size) + +void +tap_cleanup(struct net_backend *be) +{ +	struct tap_priv *priv = NET_BE_PRIV(be); + +	if (priv->mevp) { +		mevent_delete(priv->mevp); +	} +	if (be->fd != -1) { +		close(be->fd); +		be->fd = -1; +	} +} + +static int +tap_init(struct net_backend *be, const char *devname, +    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) +{ +	struct tap_priv *priv = NET_BE_PRIV(be); +	char tbuf[80]; +	int opt = 1, up = IFF_UP; + +#ifndef WITHOUT_CAPSICUM +	cap_rights_t rights; +#endif + +	if (cb == NULL) { +		EPRINTLN("TAP backend requires non-NULL callback"); +		return (-1); +	} + +	strcpy(tbuf, "/dev/"); +	strlcat(tbuf, devname, sizeof(tbuf)); + +	be->fd = open(tbuf, O_RDWR); +	if (be->fd == -1) { +		EPRINTLN("open of tap device %s failed", tbuf); +		goto error; +	} + +	/* +	 * Set non-blocking and register for read +	 * notifications with the event loop +	 */ +	if (ioctl(be->fd, FIONBIO, &opt) < 0) { +		EPRINTLN("tap device O_NONBLOCK failed"); +		goto error; +	} + +	if (strncmp("ngd", be->prefix, 3) && +	    ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { +		EPRINTLN("tap device link up failed"); +		goto error; +	} + +#ifndef WITHOUT_CAPSICUM +	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); +	if (caph_rights_limit(be->fd, &rights) == -1) +		errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + +	memset(priv->bbuf, 0, sizeof(priv->bbuf)); +	priv->bbuflen = 0; + +	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); +	if (priv->mevp == NULL) { +		EPRINTLN("Could not register event"); +		goto error; +	} + +	return (0); + +error: +	tap_cleanup(be); +	return (-1); +} + +/* + * Called to send a buffer chain out to the tap device + */ +ssize_t +tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ +	return (writev(be->fd, iov, iovcnt)); +} + +ssize_t +tap_peek_recvlen(struct net_backend *be) +{ +	struct tap_priv *priv = NET_BE_PRIV(be); +	ssize_t ret; + +	if (priv->bbuflen > 0) { +		/* +		 * We already have a packet in the bounce buffer. +		 * Just return its length. +		 */ +		return priv->bbuflen; +	} + +	/* +	 * Read the next packet (if any) into the bounce buffer, so +	 * that we get to know its length and we can return that +	 * to the caller. +	 */ +	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); +	if (ret < 0 && errno == EWOULDBLOCK) { +		return (0); +	} + +	if (ret > 0) +		priv->bbuflen = ret; + +	return (ret); +} + +ssize_t +tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ +	struct tap_priv *priv = NET_BE_PRIV(be); +	ssize_t ret; + +	if (priv->bbuflen > 0) { +		/* +		 * A packet is available in the bounce buffer, so +		 * we read it from there. +		 */ +		ret = buf_to_iov(priv->bbuf, priv->bbuflen, +		    iov, iovcnt, 0); + +		/* Mark the bounce buffer as empty. */ +		priv->bbuflen = 0; + +		return (ret); +	} + +	ret = readv(be->fd, iov, iovcnt); +	if (ret < 0 && errno == EWOULDBLOCK) { +		return (0); +	} + +	return (ret); +} + +void +tap_recv_enable(struct net_backend *be) +{ +	struct tap_priv *priv = NET_BE_PRIV(be); + +	mevent_enable(priv->mevp); +} + +void +tap_recv_disable(struct net_backend *be) +{ +	struct tap_priv *priv = NET_BE_PRIV(be); + +	mevent_disable(priv->mevp); +} + +uint64_t +tap_get_cap(struct net_backend *be __unused) +{ + +	return (0); /* no capabilities for now */ +} + +int +tap_set_cap(struct net_backend *be __unused, uint64_t features, +    unsigned vnet_hdr_len) +{ + +	return ((features || vnet_hdr_len) ? -1 : 0); +} + +static struct net_backend tap_backend = { +	.prefix = "tap", +	.priv_size = sizeof(struct tap_priv), +	.init = tap_init, +	.cleanup = tap_cleanup, +	.send = tap_send, +	.peek_recvlen = tap_peek_recvlen, +	.recv = tap_recv, +	.recv_enable = tap_recv_enable, +	.recv_disable = tap_recv_disable, +	.get_cap = tap_get_cap, +	.set_cap = tap_set_cap, +}; + +/* A clone of the tap backend, with a different prefix. */ +static struct net_backend vmnet_backend = { +	.prefix = "vmnet", +	.priv_size = sizeof(struct tap_priv), +	.init = tap_init, +	.cleanup = tap_cleanup, +	.send = tap_send, +	.peek_recvlen = tap_peek_recvlen, +	.recv = tap_recv, +	.recv_enable = tap_recv_enable, +	.recv_disable = tap_recv_disable, +	.get_cap = tap_get_cap, +	.set_cap = tap_set_cap, +}; + +/* A clone of the tap backend, with a different prefix. */ +static struct net_backend ngd_backend = { +	.prefix = "ngd", +	.priv_size = sizeof(struct tap_priv), +	.init = tap_init, +	.cleanup = tap_cleanup, +	.send = tap_send, +	.peek_recvlen = tap_peek_recvlen, +	.recv = tap_recv, +	.recv_enable = tap_recv_enable, +	.recv_disable = tap_recv_disable, +	.get_cap = tap_get_cap, +	.set_cap = tap_set_cap, +}; + +DATA_SET(net_backend_set, tap_backend); +DATA_SET(net_backend_set, vmnet_backend); +DATA_SET(net_backend_set, ngd_backend); + +int +netbe_legacy_config(nvlist_t *nvl, const char *opts) +{ +	char *backend, *cp; + +	if (opts == NULL) +		return (0); + +	cp = strchr(opts, ','); +	if (cp == NULL) { +		set_config_value_node(nvl, "backend", opts); +		return (0); +	} +	backend = strndup(opts, cp - opts); +	set_config_value_node(nvl, "backend", backend); +	free(backend); +	return (pci_parse_legacy_config(nvl, cp + 1)); +} + +/* + * Initialize a backend and attach to the frontend. + * This is called during frontend initialization. + *  @ret is a pointer to the backend to be initialized + *  @devname is the backend-name as supplied on the command line, + * 	e.g. -s 2:0,frontend-name,backend-name[,other-args] + *  @cb is the receive callback supplied by the frontend, + *	and it is invoked in the event loop when a receive + *	event is generated in the hypervisor, + *  @param is a pointer to the frontend, and normally used as + *	the argument for the callback. + */ +int +netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb, +    void *param) +{ +	struct net_backend **pbe, *nbe, *tbe = NULL; +	const char *value, *type; +	char *devname; +	int err; + +	value = get_config_value_node(nvl, "backend"); +	if (value == NULL) { +		return (-1); +	} +	devname = strdup(value); + +	/* +	 * Use the type given by configuration if exists; otherwise +	 * use the prefix of the backend as the type. +	 */ +	type = get_config_value_node(nvl, "type"); +	if (type == NULL) +		type = devname; + +	/* +	 * Find the network backend that matches the user-provided +	 * device name. net_backend_set is built using a linker set. +	 */ +	SET_FOREACH(pbe, net_backend_set) { +		if (strncmp(type, (*pbe)->prefix, +		    strlen((*pbe)->prefix)) == 0) { +			tbe = *pbe; +			assert(tbe->init != NULL); +			assert(tbe->cleanup != NULL); +			assert(tbe->send != NULL); +			assert(tbe->recv != NULL); +			assert(tbe->get_cap != NULL); +			assert(tbe->set_cap != NULL); +			break; +		} +	} + +	*ret = NULL; +	if (tbe == NULL) { +		free(devname); +		return (EINVAL); +	} + +	nbe = calloc(1, NET_BE_SIZE(tbe)); +	*nbe = *tbe;	/* copy the template */ +	nbe->fd = -1; +	nbe->sc = param; +	nbe->be_vnet_hdr_len = 0; +	nbe->fe_vnet_hdr_len = 0; + +	/* Initialize the backend. */ +	err = nbe->init(nbe, devname, nvl, cb, param); +	if (err) { +		free(devname); +		free(nbe); +		return (err); +	} + +	*ret = nbe; +	free(devname); + +	return (0); +} + +void +netbe_cleanup(struct net_backend *be) +{ + +	if (be != NULL) { +		be->cleanup(be); +		free(be); +	} +} + +uint64_t +netbe_get_cap(struct net_backend *be) +{ + +	assert(be != NULL); +	return (be->get_cap(be)); +} + +int +netbe_set_cap(struct net_backend *be, uint64_t features, +	      unsigned vnet_hdr_len) +{ +	int ret; + +	assert(be != NULL); + +	/* There are only three valid lengths, i.e., 0, 10 and 12. */ +	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN +		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) +		return (-1); + +	be->fe_vnet_hdr_len = vnet_hdr_len; + +	ret = be->set_cap(be, features, vnet_hdr_len); +	assert(be->be_vnet_hdr_len == 0 || +	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len); + +	return (ret); +} + +ssize_t +netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + +	return (be->send(be, iov, iovcnt)); +} + +ssize_t +netbe_peek_recvlen(struct net_backend *be) +{ + +	return (be->peek_recvlen(be)); +} + +/* + * Try to read a packet from the backend, without blocking. + * If no packets are available, return 0. In case of success, return + * the length of the packet just read. Return -1 in case of errors. + */ +ssize_t +netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + +	return (be->recv(be, iov, iovcnt)); +} + +/* + * Read a packet from the backend and discard it. + * Returns the size of the discarded packet or zero if no packet was available. + * A negative error code is returned in case of read error. + */ +ssize_t +netbe_rx_discard(struct net_backend *be) +{ +	/* +	 * MP note: the dummybuf is only used to discard frames, +	 * so there is no need for it to be per-vtnet or locked. +	 * We only make it large enough for TSO-sized segment. +	 */ +	static uint8_t dummybuf[65536 + 64]; +	struct iovec iov; + +	iov.iov_base = dummybuf; +	iov.iov_len = sizeof(dummybuf); + +	return netbe_recv(be, &iov, 1); +} + +void +netbe_rx_disable(struct net_backend *be) +{ + +	return be->recv_disable(be); +} + +void +netbe_rx_enable(struct net_backend *be) +{ + +	return be->recv_enable(be); +} + +size_t +netbe_get_vnet_hdr_len(struct net_backend *be) +{ + +	return (be->be_vnet_hdr_len); +} | 
