Diffstat (limited to 'sys/net/route/fib_algo.c')
-rw-r--r--  sys/net/route/fib_algo.c  2077
1 file changed, 2077 insertions, 0 deletions
diff --git a/sys/net/route/fib_algo.c b/sys/net/route/fib_algo.c
new file mode 100644
index 000000000000..ca635ad8a7b0
--- /dev/null
+++ b/sys/net/route/fib_algo.c
@@ -0,0 +1,2077 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/sbuf.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/stdarg.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/queue.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/fib_algo.h>
+
+/*
+ * Fib lookup framework.
+ *
+ * This framework enables accelerated longest-prefix-match lookups for the
+ * routing tables by adding the ability to dynamically attach/detach lookup
+ * algorithm implementations to/from the datapath.
+ *
+ * flm - fib lookup module - implementation of a particular lookup algorithm
+ * fd - fib data - instance of an flm bound to a specific routing table
+ *
+ * This file provides the main framework functionality.
+ *
+ * The following are the features provided by the framework:
+ *
+ * 1) nexthop abstraction -> provides transparent referencing, indexing
+ *   and efficient idx->ptr mappings for nexthops and nexthop groups.
+ * 2) Routing table synchronisation
+ * 3) dataplane attachment points
+ * 4) automatic algorithm selection based on the provided preference.
+ *
+ *
+ * DATAPATH
+ * For each supported address family, there is an allocated array of fib_dp
+ * structures, indexed by fib number. Each array entry contains a callback
+ * function and its argument. This function will be called with a
+ * family-specific lookup key, scope and the provided argument. This array
+ * gets re-created every time a new algo instance is created. Please take a
+ * look at the replace_rtables_family() function for more details.
+ *
+ */
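The datapath contract above is easiest to see from the consumer side. Below is a minimal illustrative sketch, not part of this commit, of how a per-family lookup function in the spirit of fib4_lookup() might drive the per-fib {func, arg} pair; the V_inet_dp pointer and the flm_lookup_key layout are assumptions based on net/route/fib_algo.h.

static inline struct nhop_object *
example_fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid)
{
	/* Per-fib {callback, argument} pair installed by this framework */
	struct fib_dp *dp = &V_inet_dp[fibnum];
	struct flm_lookup_key key = { .addr4 = dst };

	/* Epoch protection keeps dp and the returned nexthop stable */
	NET_EPOCH_ASSERT();

	/* Algorithm-specific longest-prefix-match lookup */
	return (dp->f(dp->arg, key, scopeid));
}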
+SYSCTL_DECL(_net_route);
+SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "Fib algorithm lookups");
+
+/* Algorithm sync policy */
+
+/* Time interval to bucket updates */
+VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
+#define	V_update_bucket_time_ms	VNET(update_bucket_time_ms)
+SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
+    &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");
+
+/* Minimum update rate to delay sync */
+VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
+#define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
+SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
+    &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
+
+/* Max allowed delay to sync */
+VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
+#define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
+SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
+    &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
+
+
+#ifdef INET6
+VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
+#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "IPv6 longest prefix match lookups");
+#endif
+#ifdef INET
+VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
+#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "IPv4 longest prefix match lookups");
+#endif
+
+/* Fib instance counter */
+static uint32_t fib_gen = 0;
+
+struct nhop_ref_table {
+	uint32_t	count;
+	int32_t		refcnt[0];
+};
+
+enum fib_callout_action {
+	FDA_NONE,	/* No callout scheduled */
+	FDA_REBUILD,	/* Asks to rebuild algo instance */
+	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
+	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
+};
+
+struct fib_sync_status {
+	struct timeval		diverge_time;	/* ts when diverged */
+	uint32_t		num_changes;	/* number of changes since sync */
+	uint32_t		bucket_changes;	/* num changes within the current bucket */
+	uint64_t		bucket_id;	/* 50ms bucket # */
+	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
+};
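Note how nhop_ref_table above ends in a zero-length refcnt[] member: the reference counters live in the same allocation as the header, sized at instance setup time (see try_setup_fd_instance() below). A self-contained userland sketch of that allocation pattern, with hypothetical names:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct example_ref_table {
	uint32_t count;
	int32_t refcnt[0];		/* counters follow the header */
};

int
main(void)
{
	uint32_t number_nhops = 16;
	/* One allocation: header plus number_nhops trailing counters */
	size_t size = sizeof(struct example_ref_table) +
	    number_nhops * sizeof(int32_t);
	struct example_ref_table *t = calloc(1, size);

	if (t == NULL)
		return (1);
	t->refcnt[3]++;			/* slots are directly indexable */
	printf("%zu bytes for %u refcount slots\n", size, number_nhops);
	free(t);
	return (0);
}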
+/*
+ * Data structure for the fib lookup instance tied to a particular rib.
+ */
+struct fib_data {
+	uint32_t	number_nhops;	/* current # of nhops */
+	uint8_t		hit_nhops;	/* true if out of nhop limit */
+	uint8_t		init_done;	/* true if init is completed */
+	uint32_t	fd_dead:1;	/* Scheduled for deletion */
+	uint32_t	fd_linked:1;	/* true if linked */
+	uint32_t	fd_need_rebuild:1;	/* true if rebuild scheduled */
+	uint32_t	fd_batch:1;	/* true if batched notification scheduled */
+	uint8_t		fd_family;	/* family */
+	uint32_t	fd_fibnum;	/* fibnum */
+	uint32_t	fd_failed_rebuilds;	/* stat: failed rebuilds */
+	uint32_t	fd_gen;		/* instance gen# */
+	struct callout	fd_callout;	/* rebuild callout */
+	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
+	void		*fd_algo_data;	/* algorithm data */
+	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
+	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
+	struct rib_head		*fd_rh;		/* RIB table we're attached to */
+	struct rib_subscription	*fd_rs;		/* storing table subscription */
+	struct fib_dp		fd_dp;		/* fib datapath data */
+	struct vnet		*fd_vnet;	/* vnet fib belongs to */
+	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
+	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
+	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync */
+	uint32_t		fd_num_changes;	/* number of changes since last callout */
+	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
+};
+
+static bool rebuild_fd(struct fib_data *fd, const char *reason);
+static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
+static void handle_fd_callout(void *_data);
+static void destroy_fd_instance_epoch(epoch_context_t ctx);
+static bool is_idx_free(struct fib_data *fd, uint32_t index);
+static void set_algo_fixed(struct rib_head *rh);
+static bool is_algo_fixed(struct rib_head *rh);
+
+static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
+static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
+
+static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
+    struct fib_lookup_module *orig_flm);
+static void fib_unref_algo(struct fib_lookup_module *flm);
+static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
+
+struct mtx fib_mtx;
+#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
+#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
+#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
+
+MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
+
+/* Algorithm has to be this percent better than the current to switch */
+#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
+/* Schedule algo re-evaluation X seconds after a change */
+#define	ALGO_EVAL_DELAY_MS	30000
+/* Force algo re-evaluation after X changes */
+#define	ALGO_EVAL_NUM_ROUTES	100
+/* Try to setup algorithm X times */
+#define	FIB_MAX_TRIES		32
+/* Max amount of supported nexthops */
+#define	FIB_MAX_NHOPS		262144
+#define	FIB_CALLOUT_DELAY_MS	50
+
+
+/* Debug */
+static int flm_debug_level = LOG_NOTICE;
+SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
+    &flm_debug_level, 0, "debuglevel");
+#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
+#ifndef	LOG_DEBUG2
+#define	LOG_DEBUG2	8
+#endif
+
+#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
+#define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
+	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
+}
+#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...)
\ + printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\ + print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__) +#define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) \ + printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__) +#define RH_PRINTF(_l, _rh, _fmt, ...) if (_PASS_MSG(_l)) { \ + _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\ +} +#define FD_PRINTF(_l, _fd, _fmt, ...) FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__) +#define _FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) { \ + _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \ + _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__); \ +} +#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2 +#define FD_PRINTF_LOG_DEBUG2 _FD_PRINTF +#else +#define FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...) +#endif +#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG +#define FD_PRINTF_LOG_DEBUG _FD_PRINTF +#else +#define FD_PRINTF_LOG_DEBUG() +#endif +#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO +#define FD_PRINTF_LOG_INFO _FD_PRINTF +#else +#define FD_PRINTF_LOG_INFO() +#endif +#define FD_PRINTF_LOG_NOTICE _FD_PRINTF +#define FD_PRINTF_LOG_ERR _FD_PRINTF +#define FD_PRINTF_LOG_WARNING _FD_PRINTF + + +/* List of all registered lookup algorithms */ +static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list); + +/* List of all fib lookup instances in the vnet */ +VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list); +#define V_fib_data_list VNET(fib_data_list) + +/* Datastructure for storing non-transient fib lookup module failures */ +struct fib_error { + int fe_family; + uint32_t fe_fibnum; /* failed rtable */ + struct fib_lookup_module *fe_flm; /* failed module */ + TAILQ_ENTRY(fib_error) entries;/* list of all errored entries */ +}; +VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list); +#define V_fib_error_list VNET(fib_error_list) + +/* Per-family array of fibnum -> {func, arg} mappings used in datapath */ +struct fib_dp_header { + struct epoch_context fdh_epoch_ctx; + uint32_t fdh_num_tables; + struct fib_dp fdh_idx[0]; +}; + +/* + * Tries to add new non-transient algorithm error to the list of + * errors. + * Returns true on success. + */ +static bool +flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum) +{ + struct fib_error *fe; + + fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO); + if (fe == NULL) + return (false); + fe->fe_flm = flm; + fe->fe_family = flm->flm_family; + fe->fe_fibnum = fibnum; + + FIB_MOD_LOCK(); + /* Avoid duplicates by checking if error already exists first */ + if (flm_error_check(flm, fibnum)) { + FIB_MOD_UNLOCK(); + free(fe, M_TEMP); + return (true); + } + TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries); + FIB_MOD_UNLOCK(); + + return (true); +} + +/* + * True if non-transient error has been registered for @flm in @fibnum. + */ +static bool +flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum) +{ + const struct fib_error *fe; + + TAILQ_FOREACH(fe, &V_fib_error_list, entries) { + if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum)) + return (true); + } + + return (false); +} + +/* + * Clear all errors of algo specified by @flm. 
+ */ +static void +fib_error_clear_flm(struct fib_lookup_module *flm) +{ + struct fib_error *fe, *fe_tmp; + + FIB_MOD_LOCK_ASSERT(); + + TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { + if (fe->fe_flm == flm) { + TAILQ_REMOVE(&V_fib_error_list, fe, entries); + free(fe, M_TEMP); + } + } +} + +/* + * Clears all errors in current VNET. + */ +static void +fib_error_clear(void) +{ + struct fib_error *fe, *fe_tmp; + + FIB_MOD_LOCK_ASSERT(); + + TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { + TAILQ_REMOVE(&V_fib_error_list, fe, entries); + free(fe, M_TEMP); + } +} + +static const char * +print_op_result(enum flm_op_result result) +{ + switch (result) { + case FLM_SUCCESS: + return "success"; + case FLM_REBUILD: + return "rebuild"; + case FLM_BATCH: + return "batch"; + case FLM_ERROR: + return "error"; + } + + return "unknown"; +} + +static const char * +print_family(int family) +{ + + if (family == AF_INET) + return ("inet"); + else if (family == AF_INET6) + return ("inet6"); + else + return ("unknown"); +} + +/* + * Debug function used by lookup algorithms. + * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) " + */ +void +fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) +{ + char buf[128]; + va_list ap; + + if (level > flm_debug_level) + return; + + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, + fd->fd_gen, func, "%s", buf); +} + +/* + * Outputs list of algorithms supported by the provided address family. + */ +static int +print_algos_sysctl(struct sysctl_req *req, int family) +{ + struct fib_lookup_module *flm; + struct sbuf sbuf; + int error, count = 0; + + error = sysctl_wire_old_buffer(req, 0); + if (error == 0) { + sbuf_new_for_sysctl(&sbuf, NULL, 512, req); + TAILQ_FOREACH(flm, &all_algo_list, entries) { + if (flm->flm_family == family) { + if (count++ > 0) + sbuf_cat(&sbuf, ", "); + sbuf_cat(&sbuf, flm->flm_name); + } + } + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + } + return (error); +} + +#ifdef INET6 +static int +print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS) +{ + + return (print_algos_sysctl(req, AF_INET6)); +} +SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms"); +#endif + +#ifdef INET +static int +print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS) +{ + + return (print_algos_sysctl(req, AF_INET)); +} +SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms"); +#endif + +/* + * Calculate delay between repeated failures. + * Returns current delay in milliseconds. 
+ */
+static uint32_t
+callout_calc_delay_ms(struct fib_data *fd)
+{
+	uint32_t shift;
+
+	if (fd->fd_failed_rebuilds > 10)
+		shift = 10;
+	else
+		shift = fd->fd_failed_rebuilds;
+
+	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
+}
+
+static void
+schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
+{
+
+	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
+	fd->fd_callout_action = action;
+	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
+	    handle_fd_callout, fd, 0);
+}
+
+static void
+schedule_fd_rebuild(struct fib_data *fd, const char *reason)
+{
+
+	RIB_WLOCK_ASSERT(fd->fd_rh);
+
+	if (!fd->fd_need_rebuild) {
+		fd->fd_need_rebuild = true;
+		/* Stop batch updates */
+		fd->fd_batch = false;
+
+		/*
+		 * Potentially re-schedules pending callout
+		 * initiated by schedule_algo_eval.
+		 */
+		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
+		    reason, fd->fd_failed_rebuilds);
+		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
+	}
+}
+
+static void
+sync_rib_gen(struct fib_data *fd)
+{
+	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
+	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
+}
+
+static int64_t
+get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
+{
+	int64_t diff = 0;
+
+	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
+	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;
+
+	return (diff);
+}
+
+static void
+add_tv_diff_ms(struct timeval *tv, int ms)
+{
+	tv->tv_sec += ms / 1000;
+	ms = ms % 1000;
+	if (ms * 1000 + tv->tv_usec < 1000000)
+		tv->tv_usec += ms * 1000;
+	else {
+		tv->tv_sec += 1;
+		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
+	}
+}
+
+/*
+ * Marks the time when algo state diverges from the rib state.
+ */
+static void
+mark_diverge_time(struct fib_data *fd)
+{
+	struct fib_sync_status *fd_ss = &fd->fd_ss;
+
+	getmicrouptime(&fd_ss->diverge_time);
+	fd_ss->bucket_id = 0;
+	fd_ss->bucket_changes = 0;
+}
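callout_calc_delay_ms() above implements plain exponential backoff: the 50ms base delay doubles with every consecutive failed rebuild, and the shift is capped at 10 (~51.2s). A standalone sketch of the resulting schedule (EXAMPLE_CALLOUT_DELAY_MS mirrors FIB_CALLOUT_DELAY_MS):

#include <stdint.h>
#include <stdio.h>

#define	EXAMPLE_CALLOUT_DELAY_MS	50

static uint32_t
example_calc_delay_ms(uint32_t failed_rebuilds)
{
	uint32_t shift = (failed_rebuilds > 10) ? 10 : failed_rebuilds;

	return ((1 << shift) * EXAMPLE_CALLOUT_DELAY_MS);
}

int
main(void)
{
	/* 0 failures -> 50ms, 1 -> 100ms, 2 -> 200ms, ..., 10+ -> 51200ms */
	for (uint32_t f = 0; f <= 12; f++)
		printf("failures=%u delay=%ums\n", f, example_calc_delay_ms(f));
	return (0);
}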
+/*
+ * Calculates and updates the next algorithm sync time, based on the current activity.
+ *
+ * The intent is to provide a reasonable balance between the update
+ * latency and efficient batching when changing a large number of routes.
+ *
+ * The high-level algorithm is as follows:
+ * 1) all changes are bucketed in 50ms intervals
+ * 2) if the number of changes within the bucket exceeds the threshold,
+ *    the update gets delayed, up to the maximum delay threshold.
+ */
+static void
+update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
+{
+	uint32_t bucket_id, new_delay = 0;
+	struct timeval tv;
+
+	/* Fetch all variables at once to ensure consistent reads */
+	uint32_t bucket_time_ms = V_update_bucket_time_ms;
+	uint32_t threshold_rate = V_bucket_change_threshold_rate;
+	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
+
+	if (bucket_time_ms == 0)
+		bucket_time_ms = 50;
+	/* calculate per-bucket threshold rate */
+	threshold_rate = threshold_rate * bucket_time_ms / 1000;
+
+	getmicrouptime(&tv);
+
+	struct fib_sync_status *fd_ss = &fd->fd_ss;
+
+	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
+
+	if (fd_ss->bucket_id == bucket_id) {
+		fd_ss->bucket_changes++;
+		if (fd_ss->bucket_changes == threshold_rate) {
+			new_delay = (bucket_id + 2) * bucket_time_ms;
+			if (new_delay <= max_delay_ms) {
+				FD_PRINTF(LOG_DEBUG, fd,
+				    "hit threshold of %u routes, delay update, "
+				    "bucket: %u, total delay: %u",
+				    threshold_rate, bucket_id + 1, new_delay);
+			} else {
+				new_delay = 0;
+				FD_PRINTF(LOG_DEBUG, fd,
+				    "maximum sync delay (%u ms) reached", max_delay_ms);
+			}
+		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
+			new_delay = bucket_time_ms;
+	} else {
+		fd_ss->bucket_id = bucket_id;
+		fd_ss->bucket_changes = 1;
+	}
+
+	if (new_delay > 0) {
+		/* Calculated time has been updated */
+		struct timeval new_tv = fd_ss->diverge_time;
+		add_tv_diff_ms(&new_tv, new_delay);
+
+		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
+		schedule_callout(fd, action, delay_ms);
+	}
+}
+
+static void
+update_algo_state(struct fib_data *fd)
+{
+
+	RIB_WLOCK_ASSERT(fd->fd_rh);
+
+	if (fd->fd_batch || fd->fd_need_rebuild) {
+		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
+		update_rebuild_delay(fd, action);
+		return;
+	}
+
+	if (fd->fd_num_changes++ == 0) {
+		/* Start callout to consider switch */
+		if (!callout_pending(&fd->fd_callout))
+			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
+	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
+		/* Reset callout to exec immediately */
+		if (fd->fd_callout_action == FDA_EVAL)
+			schedule_callout(fd, FDA_EVAL, 1);
+	}
+}
+
+static bool
+need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
+{
+	struct nhop_object *nh;
+
+	/* Sync addition/removal of interface routes */
+	switch (rc->rc_cmd) {
+	case RTM_ADD:
+		nh = rc->rc_nh_new;
+		if (!NH_IS_NHGRP(nh)) {
+			if (!(nh->nh_flags & NHF_GATEWAY))
+				return (true);
+			if (nhop_get_rtflags(nh) & RTF_STATIC)
+				return (true);
+		}
+		break;
+	case RTM_DELETE:
+		nh = rc->rc_nh_old;
+		if (!NH_IS_NHGRP(nh)) {
+			if (!(nh->nh_flags & NHF_GATEWAY))
+				return (true);
+			if (nhop_get_rtflags(nh) & RTF_STATIC)
+				return (true);
+		}
+		break;
+	}
+
+	return (false);
+}
+
+static bool
+apply_rtable_changes(struct fib_data *fd)
+{
+	enum flm_op_result result;
+	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
+
+	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
+
+	if (result == FLM_SUCCESS) {
+		sync_rib_gen(fd);
+		for (int i = 0; i < q->count; i++)
+			if (q->entries[i].nh_old)
+				fib_unref_nhop(fd, q->entries[i].nh_old);
+		q->count = 0;
+	}
+	fd->fd_batch = false;
+
+	return (result == FLM_SUCCESS);
+}
+
+static bool
+fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
+{
+	int plen = 0;
+
+	switch (fd->fd_family) {
+#ifdef INET
+	case AF_INET:
+		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
+		break;
+#endif
+#ifdef INET6 + case AF_INET6: + rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid); + break; +#endif + } + + ce->plen = plen; + ce->nh_old = rc->rc_nh_old; + ce->nh_new = rc->rc_nh_new; + if (ce->nh_new != NULL) { + if (fib_ref_nhop(fd, ce->nh_new) == 0) + return (false); + } + + return (true); +} + +static bool +queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc) +{ + struct fib_change_queue *q = &fd->fd_ss.fd_change_queue; + + if (q->count >= q->size) { + uint32_t q_size; + + if (q->size == 0) + q_size = 256; /* ~18k memory */ + else + q_size = q->size * 2; + + size_t size = q_size * sizeof(struct fib_change_entry); + void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO); + if (a == NULL) { + FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements", + q_size); + return (false); + } + q->entries = a; + q->size = q_size; + } + + return (fill_change_entry(fd, &q->entries[q->count++], rc)); +} + +/* + * Rib subscription handler. Checks if the algorithm is ready to + * receive updates, handles nexthop refcounting and passes change + * data to the algorithm callback. + */ +static void +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct fib_data *fd = (struct fib_data *)_data; + enum flm_op_result result; + + RIB_WLOCK_ASSERT(rnh); + + /* + * There is a small gap between subscribing for route changes + * and initiating rtable dump. Avoid receiving route changes + * prior to finishing rtable dump by checking `init_done`. + */ + if (!fd->init_done) + return; + + bool immediate_sync = need_immediate_sync(fd, rc); + + /* Consider scheduling algorithm re-evaluation */ + update_algo_state(fd); + + /* + * If algo requested rebuild, stop sending updates by default. + * This simplifies nexthop refcount handling logic. + */ + if (fd->fd_need_rebuild) { + if (immediate_sync) + rebuild_fd(fd, "rtable change type enforced sync"); + return; + } + + /* + * Algo requested updates to be delivered in batches. + * Add the current change to the queue and return. + */ + if (fd->fd_batch) { + if (immediate_sync) { + if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd)) + rebuild_fd(fd, "batch sync failed"); + } else { + if (!queue_rtable_change(fd, rc)) + schedule_fd_rebuild(fd, "batch queue failed"); + } + return; + } + + /* + * Maintain guarantee that every nexthop returned by the dataplane + * lookup has > 0 refcount, so can be safely referenced within current + * epoch. + */ + if (rc->rc_nh_new != NULL) { + if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { + /* ran out of indexes */ + schedule_fd_rebuild(fd, "ran out of nhop indexes"); + return; + } + } + + result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); + + switch (result) { + case FLM_SUCCESS: + sync_rib_gen(fd); + /* Unref old nexthop on success */ + if (rc->rc_nh_old != NULL) + fib_unref_nhop(fd, rc->rc_nh_old); + break; + case FLM_BATCH: + + /* + * Algo asks to batch the changes. + */ + if (queue_rtable_change(fd, rc)) { + if (!immediate_sync) { + fd->fd_batch = true; + mark_diverge_time(fd); + update_rebuild_delay(fd, FDA_BATCH); + break; + } + if (apply_rtable_changes(fd)) + break; + } + FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild"); + + case FLM_REBUILD: + + /* + * Algo is not able to apply the update. + * Schedule algo rebuild. 
+		 */
+		if (!immediate_sync) {
+			mark_diverge_time(fd);
+			schedule_fd_rebuild(fd, "algo requested rebuild");
+			break;
+		}
+
+		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
+		rebuild_fd(fd, "rtable change type enforced sync");
+		break;
+	case FLM_ERROR:
+
+		/*
+		 * Algo reported a non-recoverable error.
+		 * Record the error and schedule rebuild, which will
+		 * trigger best algo selection.
+		 */
+		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
+		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
+			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
+		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
+	}
+}
+
+static void
+estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
+{
+
+	if (old_fd == NULL) {
+		// TODO: read from rtable
+		fd->number_nhops = 16;
+		return;
+	}
+
+	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
+		fd->number_nhops = 2 * old_fd->number_nhops;
+	else
+		fd->number_nhops = old_fd->number_nhops;
+}
+
+struct walk_cbdata {
+	struct fib_data		*fd;
+	flm_dump_t		*func;
+	enum flm_op_result	result;
+};
+
+/*
+ * Handler called after all rtentries have been dumped.
+ * Performs post-dump framework checks and calls
+ * algo:flm_dump_end_cb().
+ *
+ * Updates walk_cbdata result.
+ */
+static void
+sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
+{
+	struct walk_cbdata *w = (struct walk_cbdata *)_data;
+	struct fib_data *fd = w->fd;
+
+	RIB_WLOCK_ASSERT(w->fd->fd_rh);
+
+	if (rnh->rib_dying) {
+		w->result = FLM_ERROR;
+		return;
+	}
+
+	if (fd->hit_nhops) {
+		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
+		    fd->nh_ref_table->count);
+		if (w->result == FLM_SUCCESS)
+			w->result = FLM_REBUILD;
+		return;
+	}
+
+	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
+		return;
+
+	/* Post-dump hook, dump successful */
+	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
+
+	if (w->result == FLM_SUCCESS) {
+		/* Mark init as done to allow routing updates */
+		fd->init_done = 1;
+	}
+}
+
+/*
+ * Callback for each entry in rib.
+ * Calls algo:flm_dump_rib_item_cb func as a part of initial
+ * route table synchronisation.
+ */
+static int
+sync_algo_cb(struct rtentry *rt, void *_data)
+{
+	struct walk_cbdata *w = (struct walk_cbdata *)_data;
+
+	RIB_WLOCK_ASSERT(w->fd->fd_rh);
+
+	if (w->result == FLM_SUCCESS && w->func) {
+
+		/*
+		 * Reference nexthops to maintain guarantee that
+		 * each nexthop returned by datapath has > 0 references
+		 * and can be safely referenced within current epoch.
+		 */
+		struct nhop_object *nh = rt_get_raw_nhop(rt);
+		if (fib_ref_nhop(w->fd, nh) != 0)
+			w->result = w->func(rt, w->fd->fd_algo_data);
+		else
+			w->result = FLM_REBUILD;
+	}
+
+	return (0);
+}
+
+/*
+ * Dump all routing table state to the algo instance.
+ */
+static enum flm_op_result
+sync_algo(struct fib_data *fd)
+{
+	struct walk_cbdata w = {
+		.fd = fd,
+		.func = fd->fd_flm->flm_dump_rib_item_cb,
+		.result = FLM_SUCCESS,
+	};
+
+	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
+
+	FD_PRINTF(LOG_INFO, fd,
+	    "initial dump completed (rtable version: %d), result: %s",
+	    fd->fd_rh->rnh_gen, print_op_result(w.result));
+
+	return (w.result);
+}
+
+/*
+ * Schedules epoch-backed @fd instance deletion.
+ * * Unlinks @fd from the list of active algo instances.
+ * * Removes rib subscription.
+ * * Stops callout.
+ * * Schedules actual deletion.
+ *
+ * Assumes @fd is already unlinked from the datapath.
+ */ +static int +schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout) +{ + bool is_dead; + + NET_EPOCH_ASSERT(); + RIB_WLOCK_ASSERT(fd->fd_rh); + + FIB_MOD_LOCK(); + is_dead = fd->fd_dead; + if (!is_dead) + fd->fd_dead = true; + if (fd->fd_linked) { + TAILQ_REMOVE(&V_fib_data_list, fd, entries); + fd->fd_linked = false; + } + FIB_MOD_UNLOCK(); + if (is_dead) + return (0); + + FD_PRINTF(LOG_INFO, fd, "DETACH"); + + if (fd->fd_rs != NULL) + rib_unsubscribe_locked(fd->fd_rs); + + /* + * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls + * will be executed, hence no _new_ callout schedules will happen. + */ + callout_stop(&fd->fd_callout); + + fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx); + + return (0); +} + +/* + * Wipe all fd instances from the list matching rib specified by @rh. + * If @keep_first is set, remove all but the first record. + */ +static void +fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout) +{ + struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); + struct fib_data *fd, *fd_tmp; + struct epoch_tracker et; + + FIB_MOD_LOCK(); + TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) { + if (fd->fd_rh == rh) { + if (keep_first) { + keep_first = false; + continue; + } + TAILQ_REMOVE(&V_fib_data_list, fd, entries); + fd->fd_linked = false; + TAILQ_INSERT_TAIL(&tmp_head, fd, entries); + } + } + FIB_MOD_UNLOCK(); + + /* Pass 2: remove each entry */ + NET_EPOCH_ENTER(et); + TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { + if (!in_callout) + RIB_WLOCK(fd->fd_rh); + schedule_destroy_fd_instance(fd, in_callout); + if (!in_callout) + RIB_WUNLOCK(fd->fd_rh); + } + NET_EPOCH_EXIT(et); +} + +void +fib_destroy_rib(struct rib_head *rh) +{ + + /* + * rnh has `is_dying` flag set, so setup of new fd's will fail at + * sync_algo() stage, preventing new entries to be added to the list + * of active algos. Remove all existing entries for the particular rib. + */ + fib_cleanup_algo(rh, false, false); +} + +/* + * Finalises fd destruction by freeing all fd resources. + */ +static void +destroy_fd_instance(struct fib_data *fd) +{ + + FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd); + + /* Call destroy callback first */ + if (fd->fd_algo_data != NULL) + fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); + + /* Nhop table */ + if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) { + for (int i = 0; i < fd->number_nhops; i++) { + if (!is_idx_free(fd, i)) { + FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p", + i, fd->nh_idx[i]); + nhop_free_any(fd->nh_idx[i]); + } + } + free(fd->nh_idx, M_RTABLE); + } + if (fd->nh_ref_table != NULL) + free(fd->nh_ref_table, M_RTABLE); + + if (fd->fd_ss.fd_change_queue.entries != NULL) + free(fd->fd_ss.fd_change_queue.entries, M_TEMP); + + fib_unref_algo(fd->fd_flm); + + free(fd, M_RTABLE); +} + +/* + * Epoch callback indicating fd is safe to destroy + */ +static void +destroy_fd_instance_epoch(epoch_context_t ctx) +{ + struct fib_data *fd; + + fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); + + CURVNET_SET(fd->fd_vnet); + destroy_fd_instance(fd); + CURVNET_RESTORE(); +} + +/* + * Tries to setup fd instance. + * - Allocates fd/nhop table + * - Runs algo:flm_init_cb algo init + * - Subscribes fd to the rib + * - Runs rtable dump + * - Adds instance to the list of active instances. + * + * Returns: operation result. Fills in @pfd with resulting fd on success. 
+ * + */ +static enum flm_op_result +try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, + struct fib_data *old_fd, struct fib_data **pfd) +{ + struct fib_data *fd; + size_t size; + enum flm_op_result result; + + /* Allocate */ + fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (fd == NULL) { + *pfd = NULL; + RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure"); + return (FLM_REBUILD); + } + *pfd = fd; + + estimate_nhop_scale(old_fd, fd); + + fd->fd_rh = rh; + fd->fd_family = rh->rib_family; + fd->fd_fibnum = rh->rib_fibnum; + callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0); + fd->fd_vnet = curvnet; + fd->fd_flm = flm; + + FIB_MOD_LOCK(); + flm->flm_refcount++; + fd->fd_gen = ++fib_gen; + FIB_MOD_UNLOCK(); + + FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd); + + /* Allocate nhidx -> nhop_ptr table */ + size = fd->number_nhops * sizeof(void *); + fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); + if (fd->nh_idx == NULL) { + FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size); + return (FLM_REBUILD); + } + + /* Allocate nhop index refcount table */ + size = sizeof(struct nhop_ref_table); + size += fd->number_nhops * sizeof(uint32_t); + fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); + if (fd->nh_ref_table == NULL) { + FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size); + return (FLM_REBUILD); + } + FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops); + + /* Okay, we're ready for algo init */ + void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL; + result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); + if (result != FLM_SUCCESS) { + FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name); + return (result); + } + + /* Try to subscribe */ + if (flm->flm_change_rib_item_cb != NULL) { + fd->fd_rs = rib_subscribe_locked(fd->fd_rh, + handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE); + if (fd->fd_rs == NULL) { + FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes"); + return (FLM_REBUILD); + } + } + + /* Dump */ + result = sync_algo(fd); + if (result != FLM_SUCCESS) { + FD_PRINTF(LOG_INFO, fd, "rib sync failed"); + return (result); + } + FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully."); + + FIB_MOD_LOCK(); + /* + * Insert fd in the beginning of a list, to maintain invariant + * that first matching entry for the AF/fib is always the active + * one. + */ + TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries); + fd->fd_linked = true; + FIB_MOD_UNLOCK(); + + return (FLM_SUCCESS); +} + +/* + * Sets up algo @flm for table @rh and links it to the datapath. 
+ * + */ +static enum flm_op_result +setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, + struct fib_data *orig_fd, struct fib_data **pfd, bool attach) +{ + struct fib_data *prev_fd, *new_fd; + enum flm_op_result result; + + NET_EPOCH_ASSERT(); + RIB_WLOCK_ASSERT(rh); + + prev_fd = orig_fd; + new_fd = NULL; + for (int i = 0; i < FIB_MAX_TRIES; i++) { + result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd); + + if ((result == FLM_SUCCESS) && attach) { + if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp)) + sync_rib_gen(new_fd); + else + result = FLM_REBUILD; + } + + if ((prev_fd != NULL) && (prev_fd != orig_fd)) { + schedule_destroy_fd_instance(prev_fd, false); + prev_fd = NULL; + } + + RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i, + print_op_result(result)); + + if (result == FLM_REBUILD) { + prev_fd = new_fd; + new_fd = NULL; + continue; + } + + break; + } + + if (result != FLM_SUCCESS) { + RH_PRINTF(LOG_WARNING, rh, + "%s algo instance setup failed, failures=%d", flm->flm_name, + orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0); + /* update failure count */ + FIB_MOD_LOCK(); + if (orig_fd != NULL) + orig_fd->fd_failed_rebuilds++; + FIB_MOD_UNLOCK(); + + /* Ban algo on non-recoverable error */ + if (result == FLM_ERROR) + flm_error_add(flm, rh->rib_fibnum); + + if ((prev_fd != NULL) && (prev_fd != orig_fd)) + schedule_destroy_fd_instance(prev_fd, false); + if (new_fd != NULL) { + schedule_destroy_fd_instance(new_fd, false); + new_fd = NULL; + } + } + + *pfd = new_fd; + return (result); +} + +/* + * Tries to sync algo with the current rtable state, either + * by executing batch update or rebuilding. + * Returns true on success. + */ +static bool +execute_callout_action(struct fib_data *fd) +{ + enum fib_callout_action action = fd->fd_callout_action; + struct fib_lookup_module *flm_new = NULL; + bool result = true; + + NET_EPOCH_ASSERT(); + RIB_WLOCK_ASSERT(fd->fd_rh); + + fd->fd_need_rebuild = false; + fd->fd_batch = false; + fd->fd_num_changes = 0; + + /* First, check if we're still OK to use this algo */ + if (!is_algo_fixed(fd->fd_rh)) + flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); + if (flm_new != NULL) + action = FDA_REBUILD; + + if (action == FDA_BATCH) { + /* Try to sync */ + if (!apply_rtable_changes(fd)) + action = FDA_REBUILD; + } + + if (action == FDA_REBUILD) + result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); + if (flm_new != NULL) + fib_unref_algo(flm_new); + + return (result); +} + +/* + * Callout for all scheduled fd-related work. + * - Checks if the current algo is still the best algo + * - Synchronises algo instance to the rtable (batch usecase) + * - Creates a new instance of an algo for af/fib if desired. + */ +static void +handle_fd_callout(void *_data) +{ + struct fib_data *fd = (struct fib_data *)_data; + struct epoch_tracker et; + + FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action); + + NET_EPOCH_ENTER(et); + CURVNET_SET(fd->fd_vnet); + execute_callout_action(fd); + CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); +} + +/* + * Tries to create new algo instance based on @fd data. + * Returns true on success. 
+ */ +static bool +rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new) +{ + struct fib_data *fd_new, *fd_tmp = NULL; + bool result; + + if (flm_new == fd->fd_flm) + fd_tmp = fd; + else + FD_PRINTF(LOG_INFO, fd, "switching algo to %s", flm_new->flm_name); + + result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); + if (result != FLM_SUCCESS) { + FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); + return (false); + } + FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); + + /* Remove old instance */ + schedule_destroy_fd_instance(fd, true); + + return (true); +} + +static bool +rebuild_fd(struct fib_data *fd, const char *reason) +{ + struct fib_lookup_module *flm_new = NULL; + bool result; + + if (!is_algo_fixed(fd->fd_rh)) + flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); + + FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason); + result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); + if (flm_new != NULL) + fib_unref_algo(flm_new); + + if (!result) { + FD_PRINTF(LOG_ERR, fd, "sync rebuild failed"); + schedule_fd_rebuild(fd, "sync rebuild failed"); + } + + return (result); +} + +/* + * Finds algo by name/family. + * Returns referenced algo or NULL. + */ +static struct fib_lookup_module * +fib_find_algo(const char *algo_name, int family) +{ + struct fib_lookup_module *flm; + + FIB_MOD_LOCK(); + TAILQ_FOREACH(flm, &all_algo_list, entries) { + if ((strcmp(flm->flm_name, algo_name) == 0) && + (family == flm->flm_family)) { + flm->flm_refcount++; + FIB_MOD_UNLOCK(); + return (flm); + } + } + FIB_MOD_UNLOCK(); + + return (NULL); +} + +static void +fib_unref_algo(struct fib_lookup_module *flm) +{ + + FIB_MOD_LOCK(); + flm->flm_refcount--; + FIB_MOD_UNLOCK(); +} + +static int +set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) +{ + struct fib_lookup_module *flm = NULL; + struct fib_data *fd = NULL; + char old_algo_name[32], algo_name[32]; + struct rib_head *rh = NULL; + enum flm_op_result result; + struct epoch_tracker et; + int error; + + /* Fetch current algo/rib for af/family */ + FIB_MOD_LOCK(); + TAILQ_FOREACH(fd, &V_fib_data_list, entries) { + if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) + break; + } + if (fd == NULL) { + FIB_MOD_UNLOCK(); + return (ENOENT); + } + rh = fd->fd_rh; + strlcpy(old_algo_name, fd->fd_flm->flm_name, + sizeof(old_algo_name)); + FIB_MOD_UNLOCK(); + + strlcpy(algo_name, old_algo_name, sizeof(algo_name)); + error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); + if (error != 0 || req->newptr == NULL) + return (error); + + if (strcmp(algo_name, old_algo_name) == 0) + return (0); + + /* New algorithm name is different */ + flm = fib_find_algo(algo_name, family); + if (flm == NULL) { + RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); + return (ESRCH); + } + + fd = NULL; + NET_EPOCH_ENTER(et); + RIB_WLOCK(rh); + result = setup_fd_instance(flm, rh, NULL, &fd, true); + RIB_WUNLOCK(rh); + NET_EPOCH_EXIT(et); + fib_unref_algo(flm); + if (result != FLM_SUCCESS) + return (EINVAL); + + /* Disable automated jumping between algos */ + FIB_MOD_LOCK(); + set_algo_fixed(rh); + FIB_MOD_UNLOCK(); + /* Remove old instance(s) */ + fib_cleanup_algo(rh, true, false); + + /* Drain cb so user can unload the module after userret if so desired */ + NET_EPOCH_DRAIN_CALLBACKS(); + + return (0); +} + +#ifdef INET +static int +set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + + return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, 
req)); +} +SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, + CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, + set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo"); +#endif + +#ifdef INET6 +static int +set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + + return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req)); +} +SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, + CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, + set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo"); +#endif + +static struct nhop_object * +dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + return (NULL); +} + +static void +destroy_fdh_epoch(epoch_context_t ctx) +{ + struct fib_dp_header *fdh; + + fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx); + free(fdh, M_RTABLE); +} + +static struct fib_dp_header * +alloc_fib_dp_array(uint32_t num_tables, bool waitok) +{ + size_t sz; + struct fib_dp_header *fdh; + + sz = sizeof(struct fib_dp_header); + sz += sizeof(struct fib_dp) * num_tables; + fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); + if (fdh != NULL) { + fdh->fdh_num_tables = num_tables; + /* + * Set dummy lookup function ptr always returning NULL, so + * we can delay algo init. + */ + for (uint32_t i = 0; i < num_tables; i++) + fdh->fdh_idx[i].f = dummy_lookup; + } + return (fdh); +} + +static struct fib_dp_header * +get_fib_dp_header(struct fib_dp *dp) +{ + + return (__containerof((void *)dp, struct fib_dp_header, fdh_idx)); +} + +/* + * Replace per-family index pool @pdp with a new one which + * contains updated callback/algo data from @fd. + * Returns true on success. + */ +static bool +replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp) +{ + struct fib_dp_header *new_fdh, *old_fdh; + + NET_EPOCH_ASSERT(); + + FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p", + curvnet, dp->f, dp->arg); + + FIB_MOD_LOCK(); + old_fdh = get_fib_dp_header(*pdp); + + if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) { + /* + * Function is the same, data pointer needs update. + * Perform in-line replace without reallocation. 
+ */ + old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg; + FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh); + FIB_MOD_UNLOCK(); + return (true); + } + + new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false); + FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh); + if (new_fdh == NULL) { + FIB_MOD_UNLOCK(); + FD_PRINTF(LOG_WARNING, fd, "error attaching datapath"); + return (false); + } + + memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], + old_fdh->fdh_num_tables * sizeof(struct fib_dp)); + /* Update relevant data structure for @fd */ + new_fdh->fdh_idx[fd->fd_fibnum] = *dp; + + /* Ensure memcpy() writes have completed */ + atomic_thread_fence_rel(); + /* Set new datapath pointer */ + *pdp = &new_fdh->fdh_idx[0]; + FIB_MOD_UNLOCK(); + FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh); + + fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); + + return (true); +} + +static struct fib_dp ** +get_family_dp_ptr(int family) +{ + switch (family) { +#ifdef INET + case AF_INET: + return (&V_inet_dp); +#endif +#ifdef INET6 + case AF_INET6: + return (&V_inet6_dp); +#endif + } + return (NULL); +} + +/* + * Make datapath use fib instance @fd + */ +bool +fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp) +{ + struct fib_dp **pdp; + + pdp = get_family_dp_ptr(fd->fd_family); + return (replace_rtables_family(pdp, fd, dp)); +} + +/* + * Grow datapath pointers array. + * Called from sysctl handler on growing number of routing tables. + */ +static void +grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) +{ + struct fib_dp_header *new_fdh, *old_fdh = NULL; + + new_fdh = alloc_fib_dp_array(new_num_tables, true); + + FIB_MOD_LOCK(); + if (*pdp != NULL) { + old_fdh = get_fib_dp_header(*pdp); + memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], + old_fdh->fdh_num_tables * sizeof(struct fib_dp)); + } + + /* Wait till all writes completed */ + atomic_thread_fence_rel(); + + *pdp = &new_fdh->fdh_idx[0]; + FIB_MOD_UNLOCK(); + + if (old_fdh != NULL) + fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); +} + +/* + * Grows per-AF arrays of datapath pointers for each supported family. + * Called from fibs resize sysctl handler. + */ +void +fib_grow_rtables(uint32_t new_num_tables) +{ + +#ifdef INET + grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables); +#endif +#ifdef INET6 + grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables); +#endif +} + +void +fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) +{ + + bzero(rinfo, sizeof(struct rib_rtable_info)); + rinfo->num_prefixes = rh->rnh_prefixes; + rinfo->num_nhops = nhops_get_count(rh); +#ifdef ROUTE_MPATH + rinfo->num_nhgrp = nhgrp_get_count(rh); +#endif +} + +/* + * Updates pointer to the algo data for the @fd. + */ +void +fib_set_algo_ptr(struct fib_data *fd, void *algo_data) +{ + RIB_WLOCK_ASSERT(fd->fd_rh); + + fd->fd_algo_data = algo_data; +} + +/* + * Calls @callback with @ctx after the end of a current epoch. + */ +void +fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx) +{ + NET_EPOCH_CALL(callback, ctx); +} + +/* + * Accessor to get rib instance @fd is attached to. 
+ */
+struct rib_head *
+fib_get_rh(struct fib_data *fd)
+{
+
+	return (fd->fd_rh);
+}
+
+/*
+ * Accessor to export idx->nhop array
+ */
+struct nhop_object **
+fib_get_nhop_array(struct fib_data *fd)
+{
+
+	return (fd->nh_idx);
+}
+
+static uint32_t
+get_nhop_idx(struct nhop_object *nh)
+{
+#ifdef ROUTE_MPATH
+	if (NH_IS_NHGRP(nh))
+		return (nhgrp_get_idx((struct nhgrp_object *)nh));
+	else
+#endif
+		return (nhop_get_idx(nh));
+}
+
+uint32_t
+fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
+{
+
+	return (get_nhop_idx(nh));
+}
+
+static bool
+is_idx_free(struct fib_data *fd, uint32_t index)
+{
+
+	return (fd->nh_ref_table->refcnt[index] == 0);
+}
+
+static uint32_t
+fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
+{
+	uint32_t idx = get_nhop_idx(nh);
+
+	if (idx >= fd->number_nhops) {
+		fd->hit_nhops = 1;
+		return (0);
+	}
+
+	if (is_idx_free(fd, idx)) {
+		nhop_ref_any(nh);
+		fd->nh_idx[idx] = nh;
+		fd->nh_ref_table->count++;
+		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
+	}
+	fd->nh_ref_table->refcnt[idx]++;
+
+	return (idx);
+}
+
+struct nhop_release_data {
+	struct nhop_object	*nh;
+	struct epoch_context	ctx;
+};
+
+static void
+release_nhop_epoch(epoch_context_t ctx)
+{
+	struct nhop_release_data *nrd;
+
+	nrd = __containerof(ctx, struct nhop_release_data, ctx);
+	nhop_free_any(nrd->nh);
+	free(nrd, M_TEMP);
+}
+
+/*
+ * Delays nexthop refcount release.
+ * Datapath may have the datastructures not updated yet, so the old
+ * nexthop may still be returned till the end of current epoch. Delay
+ * refcount removal, as we may be removing the last instance, which will
+ * trigger nexthop deletion, rendering returned nexthop invalid.
+ */
+static void
+fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
+{
+	struct nhop_release_data *nrd;
+
+	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
+	if (nrd != NULL) {
+		nrd->nh = nh;
+		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
+	} else {
+		/*
+		 * Unable to allocate memory. Leak nexthop to maintain guarantee
+		 * that each nhop can be referenced.
+		 */
+		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
+	}
+}
+
+static void
+fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
+{
+	uint32_t idx = get_nhop_idx(nh);
+
+	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
+	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
+
+	fd->nh_ref_table->refcnt[idx]--;
+	if (fd->nh_ref_table->refcnt[idx] == 0) {
+		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
+		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
+	}
+}
+
+static void
+set_algo_fixed(struct rib_head *rh)
+{
+	switch (rh->rib_family) {
+#ifdef INET
+	case AF_INET:
+		V_algo_fixed_inet = true;
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		V_algo_fixed_inet6 = true;
+		break;
+#endif
+	}
+}
+
+static bool
+is_algo_fixed(struct rib_head *rh)
+{
+
+	switch (rh->rib_family) {
+#ifdef INET
+	case AF_INET:
+		return (V_algo_fixed_inet);
+#endif
+#ifdef INET6
+	case AF_INET6:
+		return (V_algo_fixed_inet6);
+#endif
+	}
+	return (false);
+}
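The accessors and refcounting above exist so that algorithms can store compact 32-bit nexthop indexes instead of pointers. A hypothetical lookup callback, sketched against the dummy_lookup() signature used elsewhere in this file, translating an index back through the fd's idx->ptr array:

struct example_algo_data {
	struct nhop_object **nh_idx;	/* cached fib_get_nhop_array(fd) */
	uint32_t default_idx;		/* compact index kept by the algo */
};

static struct nhop_object *
example_algo_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct example_algo_data *d = algo_data;

	/*
	 * ...LPM search producing a nexthop index; fib_ref_nhop()
	 * guarantees that any index handed to the algo maps to a nexthop
	 * holding a positive refcount for the duration of the epoch.
	 */
	return (d->nh_idx[d->default_idx]);
}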
+/*
+ * Runs the check on what would be the best algo for rib @rh, assuming
+ * that the current algo is the one specified by @orig_flm. Note that
+ * it can be NULL for the initial selection.
+ *
+ * Returns referenced new algo or NULL if the current one is the best.
+ */
+static struct fib_lookup_module *
+fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
+{
+	uint8_t preference, curr_preference = 0, best_preference = 0;
+	struct fib_lookup_module *flm, *best_flm = NULL;
+	struct rib_rtable_info rinfo;
+	int candidate_algos = 0;
+
+	fib_get_rtable_info(rh, &rinfo);
+
+	FIB_MOD_LOCK();
+	TAILQ_FOREACH(flm, &all_algo_list, entries) {
+		if (flm->flm_family != rh->rib_family)
+			continue;
+		candidate_algos++;
+		preference = flm->flm_get_pref(&rinfo);
+		if (preference > best_preference) {
+			if (!flm_error_check(flm, rh->rib_fibnum)) {
+				best_preference = preference;
+				best_flm = flm;
+			}
+		}
+		if (flm == orig_flm)
+			curr_preference = preference;
+	}
+	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
+		best_flm->flm_refcount++;
+	else
+		best_flm = NULL;
+	FIB_MOD_UNLOCK();
+
+	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
+	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
+	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
+	    best_preference);
+
+	return (best_flm);
+}
+
+/*
+ * Called when a new route table is created.
+ * Selects, allocates and attaches fib algo for the table.
+ */
+static bool
+fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
+{
+	struct fib_lookup_module *flm;
+	struct fib_data *fd = NULL;
+	enum flm_op_result result;
+	struct epoch_tracker et;
+
+	flm = fib_check_best_algo(rh, NULL);
+	if (flm == NULL) {
+		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
+		return (false);
+	}
+	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
+
+	NET_EPOCH_ENTER(et);
+	RIB_WLOCK(rh);
+	result = setup_fd_instance(flm, rh, NULL, &fd, false);
+	RIB_WUNLOCK(rh);
+	NET_EPOCH_EXIT(et);
+
+	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
+	if (result == FLM_SUCCESS)
+		*dp = fd->fd_dp;
+	else
+		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
+
+	fib_unref_algo(flm);
+
+	return (result == FLM_SUCCESS);
+}
+
+/*
+ * Sets up fib algo instances for the non-initialized RIBs in the @family.
+ * Allocates a temporary datapath index to amortize datapath index updates
+ * with large @num_tables.
+ */
+void
+fib_setup_family(int family, uint32_t num_tables)
+{
+	struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
+	if (new_fdh == NULL) {
+		ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
+		return;
+	}
+
+	for (int i = 0; i < num_tables; i++) {
+		struct rib_head *rh = rt_tables_get_rnh(i, family);
+		if (rh->rib_algo_init)
+			continue;
+		if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
+			continue;
+
+		rh->rib_algo_init = true;
+	}
+
+	FIB_MOD_LOCK();
+	struct fib_dp **pdp = get_family_dp_ptr(family);
+	struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);
+
+	/* Update the items not touched by the new init, from the old data pointer */
+	for (int i = 0; i < num_tables; i++) {
+		if (new_fdh->fdh_idx[i].f == dummy_lookup)
+			new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
+	}
+
+	/* Ensure all index writes have completed */
+	atomic_thread_fence_rel();
+	/* Set new datapath pointer */
+	*pdp = &new_fdh->fdh_idx[0];
+
+	FIB_MOD_UNLOCK();
+
+	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
+}
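fib_check_best_algo() above polls every registered module's flm_get_pref() callback with the current table size, and the rebuild paths switch only when a candidate beats the current preference by BEST_DIFF_PERCENT (about 5% of the 0..255 scale). A hypothetical preference callback that favours small tables might look like:

static uint8_t
example_get_pref(const struct rib_rtable_info *rinfo)
{
	/* 255 = ideal fit; degrade as the table grows */
	if (rinfo->num_prefixes < 10)
		return (255);
	else if (rinfo->num_prefixes < 100000)
		return (127);
	return (1);
}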
+/*
+ * Registers fib lookup module within the subsystem.
+ */
+int
+fib_module_register(struct fib_lookup_module *flm)
+{
+
+	FIB_MOD_LOCK();
+	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
+	    print_family(flm->flm_family));
+	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
+	FIB_MOD_UNLOCK();
+
+	return (0);
+}
+
+/*
+ * Tries to unregister fib lookup module.
+ *
+ * Returns 0 on success, EBUSY if module is still used
+ * by some of the tables.
+ */
+int
+fib_module_unregister(struct fib_lookup_module *flm)
+{
+
+	FIB_MOD_LOCK();
+	if (flm->flm_refcount > 0) {
+		FIB_MOD_UNLOCK();
+		return (EBUSY);
+	}
+	fib_error_clear_flm(flm);
+	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
+	    print_family(flm->flm_family));
+	TAILQ_REMOVE(&all_algo_list, flm, entries);
+	FIB_MOD_UNLOCK();
+
+	return (0);
+}
+
+void
+vnet_fib_init(void)
+{
+
+	TAILQ_INIT(&V_fib_data_list);
+}
+
+void
+vnet_fib_destroy(void)
+{
+
+	FIB_MOD_LOCK();
+	fib_error_clear();
+	FIB_MOD_UNLOCK();
+}
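To tie the pieces together, here is a sketch of how a lookup module could describe itself to this framework. Only callbacks referenced in this file are listed, and all example_* callbacks are hypothetical:

static struct fib_lookup_module flm_example = {
	.flm_name = "example-lpm",
	.flm_family = AF_INET,
	.flm_init_cb = example_init,		/* allocate algo state */
	.flm_destroy_cb = example_destroy,	/* free algo state */
	.flm_dump_rib_item_cb = example_dump_rib_item, /* initial rib sync */
	.flm_dump_end_cb = example_dump_end,	/* hands back fib_dp {f, arg} */
	.flm_change_rib_item_cb = example_change_rib_item,
	.flm_get_pref = example_get_pref,
};

/*
 * Typically invoked from a module event handler:
 * fib_module_register(&flm_example) on MOD_LOAD, and
 * fib_module_unregister(&flm_example) on MOD_UNLOAD (which may
 * return EBUSY while any table still references the module).
 */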
