diff options
-rw-r--r-- | sbin/ipfw/ipfw.8 | 30 | ||||
-rw-r--r-- | sbin/ipfw/ipfw2.h | 1 | ||||
-rw-r--r-- | sbin/ipfw/nat64lsn.c | 124 | ||||
-rw-r--r-- | sys/conf/files | 4 | ||||
-rw-r--r-- | sys/modules/ipfw_nat64/Makefile | 2 | ||||
-rw-r--r-- | sys/netinet6/ip_fw_nat64.h | 45 | ||||
-rw-r--r-- | sys/netpfil/ipfw/nat64/nat64lsn.c | 2520 | ||||
-rw-r--r-- | sys/netpfil/ipfw/nat64/nat64lsn.h | 425 | ||||
-rw-r--r-- | sys/netpfil/ipfw/nat64/nat64lsn_control.c | 434 |
9 files changed, 1766 insertions, 1819 deletions
diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8 index 31448aff92bb..f02ec3e148cd 100644 --- a/sbin/ipfw/ipfw.8 +++ b/sbin/ipfw/ipfw.8 @@ -1,7 +1,7 @@ .\" .\" $FreeBSD$ .\" -.Dd March 18, 2019 +.Dd March 19, 2019 .Dt IPFW 8 .Os .Sh NAME @@ -3300,6 +3300,7 @@ See .Sx SYSCTL VARIABLES for more info. .Sh IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION +.Ss Stateful translation .Nm supports in-kernel IPv6/IPv4 network address and protocol translation. Stateful NAT64 translation allows IPv6-only clients to contact IPv4 servers @@ -3317,7 +3318,8 @@ to be able use stateful NAT64 translator. Stateful NAT64 uses a bunch of memory for several types of objects. When IPv6 client initiates connection, NAT64 translator creates a host entry in the states table. -Each host entry has a number of ports group entries allocated on demand. +Each host entry uses preallocated IPv4 alias entry. +Each alias entry has a number of ports group entries allocated on demand. Ports group entries contains connection state entries. There are several options to control limits and lifetime for these objects. .Pp @@ -3337,6 +3339,11 @@ First time an original packet is handled and consumed by translator, and then it is handled again as translated packet. This behavior can be changed by sysctl variable .Va net.inet.ip.fw.nat64_direct_output . +Also translated packet can be tagged using +.Cm tag +rule action, and then matched by +.Cm tagged +opcode to avoid loops and extra overhead. .Pp The stateful NAT64 configuration command is the following: .Bd -ragged -offset indent @@ -3364,15 +3371,16 @@ to represent IPv4 addresses. This IPv6 prefix should be configured in DNS64. The translator implementation follows RFC6052, that restricts the length of prefixes to one of following: 32, 40, 48, 56, 64, or 96. The Well-Known IPv6 Prefix 64:ff9b:: must be 96 bits long. -.It Cm max_ports Ar number -Maximum number of ports reserved for upper level protocols to one IPv6 client. 
-All reserved ports are divided into chunks between supported protocols. -The number of connections from one IPv6 client is limited by this option. -Note that closed TCP connections still remain in the list of connections until -.Cm tcp_close_age -interval will not expire. -Default value is -.Ar 2048 . +The special +.Ar ::/length +prefix can be used to handle several IPv6 prefixes with one NAT64 instance. +The NAT64 instance will determine a destination IPv4 address from prefix +.Ar length . +.It Cm states_chunks Ar number +The number of states chunks in single ports group. +Each ports group by default can keep 64 state entries in single chunk. +The above value affects the maximum number of states that can be associated with single IPv4 alias address and port. +The value must be power of 2, and up to 128. .It Cm host_del_age Ar seconds The number of seconds until the host entry for a IPv6 client will be deleted and all its resources will be released due to inactivity. diff --git a/sbin/ipfw/ipfw2.h b/sbin/ipfw/ipfw2.h index ff6990ae1c06..2b562734d15f 100644 --- a/sbin/ipfw/ipfw2.h +++ b/sbin/ipfw/ipfw2.h @@ -278,6 +278,7 @@ enum tokens { TOK_AGG_LEN, TOK_AGG_COUNT, TOK_MAX_PORTS, + TOK_STATES_CHUNKS, TOK_JMAXLEN, TOK_PORT_RANGE, TOK_HOST_DEL_AGE, diff --git a/sbin/ipfw/nat64lsn.c b/sbin/ipfw/nat64lsn.c index c6a892572818..4a6d7a7914c3 100644 --- a/sbin/ipfw/nat64lsn.c +++ b/sbin/ipfw/nat64lsn.c @@ -87,68 +87,70 @@ nat64lsn_print_states(void *buf) char sflags[4], *sf, *proto; ipfw_obj_header *oh; ipfw_obj_data *od; - ipfw_nat64lsn_stg *stg; - ipfw_nat64lsn_state *ste; + ipfw_nat64lsn_stg_v1 *stg; + ipfw_nat64lsn_state_v1 *ste; uint64_t next_idx; int i, sz; oh = (ipfw_obj_header *)buf; od = (ipfw_obj_data *)(oh + 1); - stg = (ipfw_nat64lsn_stg *)(od + 1); + stg = (ipfw_nat64lsn_stg_v1 *)(od + 1); sz = od->head.length - sizeof(*od); next_idx = 0; while (sz > 0 && next_idx != 0xFF) { - next_idx = stg->next_idx; + next_idx = stg->next.index; sz -= sizeof(*stg); if 
(stg->count == 0) { stg++; continue; } - switch (stg->proto) { - case IPPROTO_TCP: - proto = "TCP"; - break; - case IPPROTO_UDP: - proto = "UDP"; - break; - case IPPROTO_ICMPV6: - proto = "ICMPv6"; - break; - } - inet_ntop(AF_INET6, &stg->host6, s, sizeof(s)); + /* + * NOTE: addresses are in network byte order, + * ports are in host byte order. + */ inet_ntop(AF_INET, &stg->alias4, a, sizeof(a)); - ste = (ipfw_nat64lsn_state *)(stg + 1); + ste = (ipfw_nat64lsn_state_v1 *)(stg + 1); for (i = 0; i < stg->count && sz > 0; i++) { sf = sflags; + inet_ntop(AF_INET6, &ste->host6, s, sizeof(s)); inet_ntop(AF_INET, &ste->daddr, f, sizeof(f)); - if (stg->proto == IPPROTO_TCP) { + switch (ste->proto) { + case IPPROTO_TCP: + proto = "TCP"; if (ste->flags & 0x02) *sf++ = 'S'; if (ste->flags & 0x04) *sf++ = 'E'; if (ste->flags & 0x01) *sf++ = 'F'; + break; + case IPPROTO_UDP: + proto = "UDP"; + break; + case IPPROTO_ICMP: + proto = "ICMPv6"; + break; } *sf = '\0'; - switch (stg->proto) { + switch (ste->proto) { case IPPROTO_TCP: case IPPROTO_UDP: printf("%s:%d\t%s:%d\t%s\t%s\t%d\t%s:%d\n", s, ste->sport, a, ste->aport, proto, sflags, ste->idle, f, ste->dport); break; - case IPPROTO_ICMPV6: + case IPPROTO_ICMP: printf("%s\t%s\t%s\t\t%d\t%s\n", s, a, proto, ste->idle, f); break; default: printf("%s\t%s\t%d\t\t%d\t%s\n", - s, a, stg->proto, ste->idle, f); + s, a, ste->proto, ste->idle, f); } ste++; sz -= sizeof(*ste); } - stg = (ipfw_nat64lsn_stg *)ste; + stg = (ipfw_nat64lsn_stg_v1 *)ste; } return (next_idx); } @@ -174,6 +176,7 @@ nat64lsn_states_cb(ipfw_nat64lsn_cfg *cfg, const char *name, uint8_t set) err(EX_OSERR, NULL); do { oh = (ipfw_obj_header *)buf; + oh->opheader.version = 1; /* Force using ov new API */ od = (ipfw_obj_data *)(oh + 1); nat64lsn_fill_ntlv(&oh->ntlv, cfg->name, set); od->head.type = IPFW_TLV_OBJDATA; @@ -363,12 +366,8 @@ nat64lsn_parse_int(const char *arg, const char *desc) static struct _s_x nat64newcmds[] = { { "prefix6", TOK_PREFIX6 }, - { "agg_len", 
TOK_AGG_LEN }, /* not yet */ - { "agg_count", TOK_AGG_COUNT }, /* not yet */ - { "port_range", TOK_PORT_RANGE }, /* not yet */ { "jmaxlen", TOK_JMAXLEN }, { "prefix4", TOK_PREFIX4 }, - { "max_ports", TOK_MAX_PORTS }, { "host_del_age", TOK_HOST_DEL_AGE }, { "pg_del_age", TOK_PG_DEL_AGE }, { "tcp_syn_age", TOK_TCP_SYN_AGE }, @@ -376,10 +375,13 @@ static struct _s_x nat64newcmds[] = { { "tcp_est_age", TOK_TCP_EST_AGE }, { "udp_age", TOK_UDP_AGE }, { "icmp_age", TOK_ICMP_AGE }, + { "states_chunks",TOK_STATES_CHUNKS }, { "log", TOK_LOG }, { "-log", TOK_LOGOFF }, { "allow_private", TOK_PRIVATE }, { "-allow_private", TOK_PRIVATEOFF }, + /* for compatibility with old configurations */ + { "max_ports", TOK_MAX_PORTS }, /* unused */ { NULL, 0 } }; @@ -436,42 +438,17 @@ nat64lsn_create(const char *name, uint8_t set, int ac, char **av) nat64lsn_parse_prefix(*av, AF_INET6, &cfg->prefix6, &cfg->plen6); if (ipfw_check_nat64prefix(&cfg->prefix6, - cfg->plen6) != 0) + cfg->plen6) != 0 && + !IN6_IS_ADDR_UNSPECIFIED(&cfg->prefix6)) errx(EX_USAGE, "Bad prefix6 %s", *av); ac--; av++; break; -#if 0 - case TOK_AGG_LEN: - NEED1("Aggregation prefix len required"); - cfg->agg_prefix_len = nat64lsn_parse_int(*av, opt); - ac--; av++; - break; - case TOK_AGG_COUNT: - NEED1("Max per-prefix count required"); - cfg->agg_prefix_max = nat64lsn_parse_int(*av, opt); - ac--; av++; - break; - case TOK_PORT_RANGE: - NEED1("port range x[:y] required"); - if ((p = strchr(*av, ':')) == NULL) - cfg->min_port = (uint16_t)nat64lsn_parse_int( - *av, opt); - else { - *p++ = '\0'; - cfg->min_port = (uint16_t)nat64lsn_parse_int( - *av, opt); - cfg->max_port = (uint16_t)nat64lsn_parse_int( - p, opt); - } - ac--; av++; - break; case TOK_JMAXLEN: NEED1("job queue length required"); cfg->jmaxlen = nat64lsn_parse_int(*av, opt); ac--; av++; break; -#endif case TOK_MAX_PORTS: NEED1("Max per-user ports required"); cfg->max_ports = nat64lsn_parse_int(*av, opt); @@ -519,6 +496,12 @@ nat64lsn_create(const char *name, 
uint8_t set, int ac, char **av) *av, opt); ac--; av++; break; + case TOK_STATES_CHUNKS: + NEED1("number of chunks required"); + cfg->states_chunks = (uint8_t)nat64lsn_parse_int( + *av, opt); + ac--; av++; + break; case TOK_LOG: cfg->flags |= NAT64_LOG; break; @@ -630,6 +613,12 @@ nat64lsn_config(const char *name, uint8_t set, int ac, char **av) *av, opt); ac--; av++; break; + case TOK_STATES_CHUNKS: + NEED1("number of chunks required"); + cfg->states_chunks = (uint8_t)nat64lsn_parse_int( + *av, opt); + ac--; av++; + break; case TOK_LOG: cfg->flags |= NAT64_LOG; break; @@ -789,31 +778,24 @@ nat64lsn_show_cb(ipfw_nat64lsn_cfg *cfg, const char *name, uint8_t set) printf("nat64lsn %s prefix4 %s/%u", cfg->name, abuf, cfg->plen4); inet_ntop(AF_INET6, &cfg->prefix6, abuf, sizeof(abuf)); printf(" prefix6 %s/%u", abuf, cfg->plen6); -#if 0 - printf("agg_len %u agg_count %u ", cfg->agg_prefix_len, - cfg->agg_prefix_max); - if (cfg->min_port != NAT64LSN_PORT_MIN || - cfg->max_port != NAT64LSN_PORT_MAX) - printf(" port_range %u:%u", cfg->min_port, cfg->max_port); - if (cfg->jmaxlen != NAT64LSN_JMAXLEN) - printf(" jmaxlen %u ", cfg->jmaxlen); -#endif - if (cfg->max_ports != NAT64LSN_MAX_PORTS) - printf(" max_ports %u", cfg->max_ports); - if (cfg->nh_delete_delay != NAT64LSN_HOST_AGE) + if (co.verbose || cfg->states_chunks > 1) + printf(" states_chunks %u", cfg->states_chunks); + if (co.verbose || cfg->nh_delete_delay != NAT64LSN_HOST_AGE) printf(" host_del_age %u", cfg->nh_delete_delay); - if (cfg->pg_delete_delay != NAT64LSN_PG_AGE) + if (co.verbose || cfg->pg_delete_delay != NAT64LSN_PG_AGE) printf(" pg_del_age %u ", cfg->pg_delete_delay); - if (cfg->st_syn_ttl != NAT64LSN_TCP_SYN_AGE) + if (co.verbose || cfg->st_syn_ttl != NAT64LSN_TCP_SYN_AGE) printf(" tcp_syn_age %u", cfg->st_syn_ttl); - if (cfg->st_close_ttl != NAT64LSN_TCP_FIN_AGE) + if (co.verbose || cfg->st_close_ttl != NAT64LSN_TCP_FIN_AGE) printf(" tcp_close_age %u", cfg->st_close_ttl); - if (cfg->st_estab_ttl != 
NAT64LSN_TCP_EST_AGE) + if (co.verbose || cfg->st_estab_ttl != NAT64LSN_TCP_EST_AGE) printf(" tcp_est_age %u", cfg->st_estab_ttl); - if (cfg->st_udp_ttl != NAT64LSN_UDP_AGE) + if (co.verbose || cfg->st_udp_ttl != NAT64LSN_UDP_AGE) printf(" udp_age %u", cfg->st_udp_ttl); - if (cfg->st_icmp_ttl != NAT64LSN_ICMP_AGE) + if (co.verbose || cfg->st_icmp_ttl != NAT64LSN_ICMP_AGE) printf(" icmp_age %u", cfg->st_icmp_ttl); + if (co.verbose || cfg->jmaxlen != NAT64LSN_JMAXLEN) + printf(" jmaxlen %u ", cfg->jmaxlen); if (cfg->flags & NAT64_LOG) printf(" log"); if (cfg->flags & NAT64_ALLOW_PRIVATE) diff --git a/sys/conf/files b/sys/conf/files index 45968c43852c..ed982409534f 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4398,9 +4398,9 @@ netpfil/ipfw/nat64/nat64clat.c optional inet inet6 ipfirewall \ netpfil/ipfw/nat64/nat64clat_control.c optional inet inet6 ipfirewall \ ipfirewall_nat64 netpfil/ipfw/nat64/nat64lsn.c optional inet inet6 ipfirewall \ - ipfirewall_nat64 + ipfirewall_nat64 compile-with "${NORMAL_C} -I$S/contrib/ck/include" netpfil/ipfw/nat64/nat64lsn_control.c optional inet inet6 ipfirewall \ - ipfirewall_nat64 + ipfirewall_nat64 compile-with "${NORMAL_C} -I$S/contrib/ck/include" netpfil/ipfw/nat64/nat64stl.c optional inet inet6 ipfirewall \ ipfirewall_nat64 netpfil/ipfw/nat64/nat64stl_control.c optional inet inet6 ipfirewall \ diff --git a/sys/modules/ipfw_nat64/Makefile b/sys/modules/ipfw_nat64/Makefile index ee2ad7da15af..037215a71481 100644 --- a/sys/modules/ipfw_nat64/Makefile +++ b/sys/modules/ipfw_nat64/Makefile @@ -8,4 +8,6 @@ SRCS+= nat64clat.c nat64clat_control.c SRCS+= nat64lsn.c nat64lsn_control.c SRCS+= nat64stl.c nat64stl_control.c +CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include + .include <bsd.kmod.mk> diff --git a/sys/netinet6/ip_fw_nat64.h b/sys/netinet6/ip_fw_nat64.h index 47c0a70d167f..40e3441132e1 100644 --- a/sys/netinet6/ip_fw_nat64.h +++ b/sys/netinet6/ip_fw_nat64.h @@ -122,7 +122,7 @@ typedef struct _ipfw_nat64clat_cfg { /* * NAT64LSN 
default configuration values */ -#define NAT64LSN_MAX_PORTS 2048 /* Max number of ports per host */ +#define NAT64LSN_MAX_PORTS 2048 /* Unused */ #define NAT64LSN_JMAXLEN 2048 /* Max outstanding requests. */ #define NAT64LSN_TCP_SYN_AGE 10 /* State's TTL after SYN received. */ #define NAT64LSN_TCP_EST_AGE (2 * 3600) /* TTL for established connection */ @@ -135,16 +135,20 @@ typedef struct _ipfw_nat64clat_cfg { typedef struct _ipfw_nat64lsn_cfg { char name[64]; /* NAT name */ uint32_t flags; - uint32_t max_ports; /* Max ports per client */ - uint32_t agg_prefix_len; /* Prefix length to count */ - uint32_t agg_prefix_max; /* Max hosts per agg prefix */ + + uint32_t max_ports; /* Unused */ + uint32_t agg_prefix_len; /* Unused */ + uint32_t agg_prefix_max; /* Unused */ + struct in_addr prefix4; uint16_t plen4; /* Prefix length */ uint16_t plen6; /* Prefix length */ struct in6_addr prefix6; /* NAT64 prefix */ uint32_t jmaxlen; /* Max jobqueue length */ - uint16_t min_port; /* Min port group # to use */ - uint16_t max_port; /* Max port group # to use */ + + uint16_t min_port; /* Unused */ + uint16_t max_port; /* Unused */ + uint16_t nh_delete_delay;/* Stale host delete delay */ uint16_t pg_delete_delay;/* Stale portgroup delete delay */ uint16_t st_syn_ttl; /* TCP syn expire */ @@ -153,7 +157,7 @@ typedef struct _ipfw_nat64lsn_cfg { uint16_t st_udp_ttl; /* UDP expire */ uint16_t st_icmp_ttl; /* ICMP expire */ uint8_t set; /* Named instance set [0..31] */ - uint8_t spare; + uint8_t states_chunks; /* Number of states chunks per PG */ } ipfw_nat64lsn_cfg; typedef struct _ipfw_nat64lsn_state { @@ -177,5 +181,30 @@ typedef struct _ipfw_nat64lsn_stg { uint32_t spare2; } ipfw_nat64lsn_stg; -#endif /* _NETINET6_IP_FW_NAT64_H_ */ +typedef struct _ipfw_nat64lsn_state_v1 { + struct in6_addr host6; /* Bound IPv6 host */ + struct in_addr daddr; /* Remote IPv4 address */ + uint16_t dport; /* Remote destination port */ + uint16_t aport; /* Local alias port */ + uint16_t sport; /* 
Source port */ + uint16_t spare; + uint16_t idle; /* Last used time */ + uint8_t flags; /* State flags */ + uint8_t proto; /* protocol */ +} ipfw_nat64lsn_state_v1; +typedef struct _ipfw_nat64lsn_stg_v1 { + union nat64lsn_pgidx { + uint64_t index; + struct { + uint8_t chunk; /* states chunk */ + uint8_t proto; /* protocol */ + uint16_t port; /* base port */ + in_addr_t addr; /* alias address */ + }; + } next; /* next state index */ + struct in_addr alias4; /* IPv4 alias address */ + uint32_t count; /* Number of states */ +} ipfw_nat64lsn_stg_v1; + +#endif /* _NETINET6_IP_FW_NAT64_H_ */ diff --git a/sys/netpfil/ipfw/nat64/nat64lsn.c b/sys/netpfil/ipfw/nat64/nat64lsn.c index 1ddeaafc7dce..af88fd1622c5 100644 --- a/sys/netpfil/ipfw/nat64/nat64lsn.c +++ b/sys/netpfil/ipfw/nat64/nat64lsn.c @@ -33,16 +33,17 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/counter.h> +#include <sys/ck.h> +#include <sys/epoch.h> #include <sys/errno.h> +#include <sys/hash.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/module.h> #include <sys/rmlock.h> -#include <sys/rwlock.h> #include <sys/socket.h> -#include <sys/queue.h> #include <sys/syslog.h> #include <sys/sysctl.h> @@ -71,17 +72,22 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN"); -static void nat64lsn_periodic(void *data); -#define PERIODIC_DELAY 4 -static uint8_t nat64lsn_proto_map[256]; -uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO]; +static epoch_t nat64lsn_epoch; +#define NAT64LSN_EPOCH_ENTER(et) epoch_enter_preempt(nat64lsn_epoch, &(et)) +#define NAT64LSN_EPOCH_EXIT(et) epoch_exit_preempt(nat64lsn_epoch, &(et)) +#define NAT64LSN_EPOCH_WAIT() epoch_wait_preempt(nat64lsn_epoch) +#define NAT64LSN_EPOCH_ASSERT() MPASS(in_epoch(nat64lsn_epoch)) +#define NAT64LSN_EPOCH_CALL(c, f) epoch_call(nat64lsn_epoch, (c), (f)) -#define NAT64_FLAG_FIN 0x01 /* FIN was seen */ -#define NAT64_FLAG_SYN 0x02 /* First syn in->out */ 
-#define NAT64_FLAG_ESTAB 0x04 /* Packet with Ack */ -#define NAT64_FLAGS_TCP (NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN) +static uma_zone_t nat64lsn_host_zone; +static uma_zone_t nat64lsn_pgchunk_zone; +static uma_zone_t nat64lsn_pg_zone; +static uma_zone_t nat64lsn_aliaslink_zone; +static uma_zone_t nat64lsn_state_zone; +static uma_zone_t nat64lsn_job_zone; -#define NAT64_FLAG_RDR 0x80 /* Port redirect */ +static void nat64lsn_periodic(void *data); +#define PERIODIC_DELAY 4 #define NAT64_LOOKUP(chain, cmd) \ (struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1) /* @@ -91,25 +97,33 @@ uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO]; enum nat64lsn_jtype { JTYPE_NEWHOST = 1, JTYPE_NEWPORTGROUP, - JTYPE_DELPORTGROUP, + JTYPE_DESTROY, }; struct nat64lsn_job_item { - TAILQ_ENTRY(nat64lsn_job_item) next; + STAILQ_ENTRY(nat64lsn_job_item) entries; enum nat64lsn_jtype jtype; - struct nat64lsn_host *nh; - struct nat64lsn_portgroup *pg; - void *spare_idx; - struct in6_addr haddr; - uint8_t nat_proto; - uint8_t done; - int needs_idx; - int delcount; - unsigned int fhash; /* Flow hash */ - uint32_t aaddr; /* Last used address (net) */ - struct mbuf *m; - struct ipfw_flow_id f_id; - uint64_t delmask[NAT64LSN_PGPTRNMASK]; + + union { + struct { /* used by JTYPE_NEWHOST, JTYPE_NEWPORTGROUP */ + struct mbuf *m; + struct nat64lsn_host *host; + struct nat64lsn_state *state; + uint32_t src6_hval; + uint32_t state_hval; + struct ipfw_flow_id f_id; + in_addr_t faddr; + uint16_t port; + uint8_t proto; + uint8_t done; + }; + struct { /* used by JTYPE_DESTROY */ + struct nat64lsn_hosts_slist hosts; + struct nat64lsn_pg_slist portgroups; + struct nat64lsn_pgchunk *pgchunk; + struct epoch_context epoch_ctx; + }; + }; }; static struct mtx jmtx; @@ -118,143 +132,311 @@ static struct mtx jmtx; #define JQUEUE_LOCK() mtx_lock(&jmtx) #define JQUEUE_UNLOCK() mtx_unlock(&jmtx) +static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, + struct nat64lsn_job_item *ji); +static int 
nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, + struct nat64lsn_job_item *ji); +static struct nat64lsn_job_item *nat64lsn_create_job( + struct nat64lsn_cfg *cfg, int jtype); static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); -static void nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg, - struct nat64lsn_job_head *jhead, int jlen); - -static struct nat64lsn_job_item *nat64lsn_create_job(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, int jtype); -static int nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr, - int needs_idx); -static int nat64lsn_request_host(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, struct mbuf **pm); +static void nat64lsn_job_destroy(epoch_context_t ctx); +static void nat64lsn_destroy_host(struct nat64lsn_host *host); +static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg); + static int nat64lsn_translate4(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, struct mbuf **pm); + const struct ipfw_flow_id *f_id, struct mbuf **mp); static int nat64lsn_translate6(struct nat64lsn_cfg *cfg, - struct ipfw_flow_id *f_id, struct mbuf **pm); - -static int alloc_portgroup(struct nat64lsn_job_item *ji); -static void destroy_portgroup(struct nat64lsn_portgroup *pg); -static void destroy_host6(struct nat64lsn_host *nh); -static int alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); + struct ipfw_flow_id *f_id, struct mbuf **mp); +static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, + struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags); + +#define NAT64_BIT_TCP_FIN 0 /* FIN was seen */ +#define NAT64_BIT_TCP_SYN 1 /* First syn in->out */ +#define NAT64_BIT_TCP_ESTAB 2 /* Packet with Ack */ +#define NAT64_BIT_READY_IPV4 6 /* state is ready for translate4 */ +#define NAT64_BIT_STALE 7 /* state is going to be expired */ + +#define NAT64_FLAG_FIN (1 << NAT64_BIT_TCP_FIN) +#define 
NAT64_FLAG_SYN (1 << NAT64_BIT_TCP_SYN) +#define NAT64_FLAG_ESTAB (1 << NAT64_BIT_TCP_ESTAB) +#define NAT64_FLAGS_TCP (NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN) -static int attach_portgroup(struct nat64lsn_cfg *cfg, - struct nat64lsn_job_item *ji); -static int attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); +#define NAT64_FLAG_READY (1 << NAT64_BIT_READY_IPV4) +#define NAT64_FLAG_STALE (1 << NAT64_BIT_STALE) +static inline uint8_t +convert_tcp_flags(uint8_t flags) +{ + uint8_t result; -/* XXX tmp */ -static uma_zone_t nat64lsn_host_zone; -static uma_zone_t nat64lsn_pg_zone; -static uma_zone_t nat64lsn_pgidx_zone; + result = flags & (TH_FIN|TH_SYN); + result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */ + result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */ -static unsigned int nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg, - struct nat64lsn_host *nh); + return (result); +} -#define I6_hash(x) (djb_hash((const unsigned char *)(x), 16)) -#define I6_first(_ph, h) (_ph)[h] -#define I6_next(x) (x)->next -#define I6_val(x) (&(x)->addr) -#define I6_cmp(a, b) IN6_ARE_ADDR_EQUAL(a, b) -#define I6_lock(a, b) -#define I6_unlock(a, b) +static void +nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family, + struct nat64lsn_state *state) +{ -#define I6HASH_FIND(_cfg, _res, _a) \ - CHT_FIND(_cfg->ih, _cfg->ihsize, I6_, _res, _a) -#define I6HASH_INSERT(_cfg, _i) \ - CHT_INSERT_HEAD(_cfg->ih, _cfg->ihsize, I6_, _i) -#define I6HASH_REMOVE(_cfg, _res, _tmp, _a) \ - CHT_REMOVE(_cfg->ih, _cfg->ihsize, I6_, _res, _tmp, _a) + memset(plog, 0, sizeof(*plog)); + plog->length = PFLOG_REAL_HDRLEN; + plog->af = family; + plog->action = PF_NAT; + plog->dir = PF_IN; + plog->rulenr = htonl(state->ip_src); + plog->subrulenr = htonl((uint32_t)(state->aport << 16) | + (state->proto << 8) | (state->ip_dst & 0xff)); + plog->ruleset[0] = '\0'; + strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname)); + ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m); +} 
-#define I6HASH_FOREACH_SAFE(_cfg, _x, _tmp, _cb, _arg) \ - CHT_FOREACH_SAFE(_cfg->ih, _cfg->ihsize, I6_, _x, _tmp, _cb, _arg) +#define HVAL(p, n, s) jenkins_hash32((const uint32_t *)(p), (n), (s)) +#define HOST_HVAL(c, a) HVAL((a),\ + sizeof(struct in6_addr) / sizeof(uint32_t), (c)->hash_seed) +#define HOSTS(c, v) ((c)->hosts_hash[(v) & ((c)->hosts_hashsize - 1)]) + +#define ALIASLINK_HVAL(c, f) HVAL(&(f)->dst_ip6,\ + sizeof(struct in6_addr) * 2 / sizeof(uint32_t), (c)->hash_seed) +#define ALIAS_BYHASH(c, v) \ + ((c)->aliases[(v) & ((1 << (32 - (c)->plen4)) - 1)]) +static struct nat64lsn_aliaslink* +nat64lsn_get_aliaslink(struct nat64lsn_cfg *cfg __unused, + struct nat64lsn_host *host, const struct ipfw_flow_id *f_id __unused) +{ -#define HASH_IN4(x) djb_hash((const unsigned char *)(x), 8) + /* + * We can implement some different algorithms how + * select an alias address. + * XXX: for now we use first available. + */ + return (CK_SLIST_FIRST(&host->aliases)); +} -static unsigned -djb_hash(const unsigned char *h, const int len) +#define STATE_HVAL(c, d) HVAL((d), 2, (c)->hash_seed) +#define STATE_HASH(h, v) \ + ((h)->states_hash[(v) & ((h)->states_hashsize - 1)]) +#define STATES_CHUNK(p, v) \ + ((p)->chunks_count == 1 ? 
(p)->states : \ + ((p)->states_chunk[CHUNK_BY_FADDR(p, v)])) + +#ifdef __LP64__ +#define FREEMASK_FFSLL(pg, faddr) \ + ffsll(*FREEMASK_CHUNK((pg), (faddr))) +#define FREEMASK_BTR(pg, faddr, bit) \ + ck_pr_btr_64(FREEMASK_CHUNK((pg), (faddr)), (bit)) +#define FREEMASK_BTS(pg, faddr, bit) \ + ck_pr_bts_64(FREEMASK_CHUNK((pg), (faddr)), (bit)) +#define FREEMASK_ISSET(pg, faddr, bit) \ + ISSET64(*FREEMASK_CHUNK((pg), (faddr)), (bit)) +#define FREEMASK_COPY(pg, n, out) \ + (out) = ck_pr_load_64(FREEMASK_CHUNK((pg), (n))) +#else +static inline int +freemask_ffsll(uint32_t *freemask) { - unsigned int result = 0; int i; - for (i = 0; i < len; i++) - result = 33 * result ^ h[i]; - - return (result); + if ((i = ffsl(freemask[0])) != 0) + return (i); + if ((i = ffsl(freemask[1])) != 0) + return (i + 32); + return (0); } - -/* -static size_t -bitmask_size(size_t num, int *level) +#define FREEMASK_FFSLL(pg, faddr) \ + freemask_ffsll(FREEMASK_CHUNK((pg), (faddr))) +#define FREEMASK_BTR(pg, faddr, bit) \ + ck_pr_btr_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32) +#define FREEMASK_BTS(pg, faddr, bit) \ + ck_pr_bts_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32) +#define FREEMASK_ISSET(pg, faddr, bit) \ + ISSET32(*(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32), (bit) % 32) +#define FREEMASK_COPY(pg, n, out) \ + (out) = ck_pr_load_32(FREEMASK_CHUNK((pg), (n))) | \ + ((uint64_t)ck_pr_load_32(FREEMASK_CHUNK((pg), (n)) + 1) << 32) +#endif /* !__LP64__ */ + + +#define NAT64LSN_TRY_PGCNT 32 +static struct nat64lsn_pg* +nat64lsn_get_pg(uint32_t *chunkmask, uint32_t *pgmask, + struct nat64lsn_pgchunk **chunks, struct nat64lsn_pg **pgptr, + uint32_t *pgidx, in_addr_t faddr) { - size_t x; - int c; + struct nat64lsn_pg *pg, *oldpg; + uint32_t idx, oldidx; + int cnt; + + cnt = 0; + /* First try last used PG */ + oldpg = pg = ck_pr_load_ptr(pgptr); + idx = oldidx = ck_pr_load_32(pgidx); + /* If pgidx is out of range, reset it to the first pgchunk */ + if 
(!ISSET32(*chunkmask, idx / 32)) + idx = 0; + do { + ck_pr_fence_load(); + if (pg != NULL && FREEMASK_BITCOUNT(pg, faddr) > 0) { + /* + * If last used PG has not free states, + * try to update pointer. + * NOTE: it can be already updated by jobs handler, + * thus we use CAS operation. + */ + if (cnt > 0) + ck_pr_cas_ptr(pgptr, oldpg, pg); + return (pg); + } + /* Stop if idx is out of range */ + if (!ISSET32(*chunkmask, idx / 32)) + break; + + if (ISSET32(pgmask[idx / 32], idx % 32)) + pg = ck_pr_load_ptr( + &chunks[idx / 32]->pgptr[idx % 32]); + else + pg = NULL; - for (c = 0, x = num; num > 1; num /= 64, c++) - ; + idx++; + } while (++cnt < NAT64LSN_TRY_PGCNT); - return (x); + /* If pgidx is out of range, reset it to the first pgchunk */ + if (!ISSET32(*chunkmask, idx / 32)) + idx = 0; + ck_pr_cas_32(pgidx, oldidx, idx); + return (NULL); } -static void -bitmask_prepare(uint64_t *pmask, size_t bufsize, int level) +static struct nat64lsn_state* +nat64lsn_get_state6to4(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host, + const struct ipfw_flow_id *f_id, uint32_t hval, in_addr_t faddr, + uint16_t port, uint8_t proto) { - size_t x, z; + struct nat64lsn_aliaslink *link; + struct nat64lsn_state *state; + struct nat64lsn_pg *pg; + int i, offset; + + NAT64LSN_EPOCH_ASSERT(); + + /* Check that we already have state for given arguments */ + CK_SLIST_FOREACH(state, &STATE_HASH(host, hval), entries) { + if (state->proto == proto && state->ip_dst == faddr && + state->sport == port && state->dport == f_id->dst_port) + return (state); + } - memset(pmask, 0xFF, bufsize); - for (x = 0, z = 1; level > 1; x += z, z *= 64, level--) - ; - pmask[x] ~= 0x01; -} -*/ + link = nat64lsn_get_aliaslink(cfg, host, f_id); + if (link == NULL) + return (NULL); -static void -nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family, - uint32_t n, uint32_t sn) -{ + switch (proto) { + case IPPROTO_TCP: + pg = nat64lsn_get_pg( + &link->alias->tcp_chunkmask, link->alias->tcp_pgmask, + 
link->alias->tcp, &link->alias->tcp_pg, + &link->alias->tcp_pgidx, faddr); + break; + case IPPROTO_UDP: + pg = nat64lsn_get_pg( + &link->alias->udp_chunkmask, link->alias->udp_pgmask, + link->alias->udp, &link->alias->udp_pg, + &link->alias->udp_pgidx, faddr); + break; + case IPPROTO_ICMP: + pg = nat64lsn_get_pg( + &link->alias->icmp_chunkmask, link->alias->icmp_pgmask, + link->alias->icmp, &link->alias->icmp_pg, + &link->alias->icmp_pgidx, faddr); + break; + default: + panic("%s: wrong proto %d", __func__, proto); + } + if (pg == NULL) + return (NULL); - memset(plog, 0, sizeof(*plog)); - plog->length = PFLOG_REAL_HDRLEN; - plog->af = family; - plog->action = PF_NAT; - plog->dir = PF_IN; - plog->rulenr = htonl(n); - plog->subrulenr = htonl(sn); - plog->ruleset[0] = '\0'; - strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname)); - ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m); + /* Check that PG has some free states */ + state = NULL; + i = FREEMASK_BITCOUNT(pg, faddr); + while (i-- > 0) { + offset = FREEMASK_FFSLL(pg, faddr); + if (offset == 0) { + /* + * We lost the race. + * No more free states in this PG. + */ + break; + } + + /* Lets try to atomically grab the state */ + if (FREEMASK_BTR(pg, faddr, offset - 1)) { + state = &STATES_CHUNK(pg, faddr)->state[offset - 1]; + /* Initialize */ + state->flags = proto != IPPROTO_TCP ? 0 : + convert_tcp_flags(f_id->_flags); + state->proto = proto; + state->aport = pg->base_port + offset - 1; + state->dport = f_id->dst_port; + state->sport = port; + state->ip6_dst = f_id->dst_ip6; + state->ip_dst = faddr; + state->ip_src = link->alias->addr; + state->hval = hval; + state->host = host; + SET_AGE(state->timestamp); + + /* Insert new state into host's hash table */ + HOST_LOCK(host); + CK_SLIST_INSERT_HEAD(&STATE_HASH(host, hval), + state, entries); + host->states_count++; + /* + * XXX: In case if host is going to be expired, + * reset NAT64LSN_DEADHOST flag. 
+ */ + host->flags &= ~NAT64LSN_DEADHOST; + HOST_UNLOCK(host); + NAT64STAT_INC(&cfg->base.stats, screated); + /* Mark the state as ready for translate4 */ + ck_pr_fence_store(); + ck_pr_bts_32(&state->flags, NAT64_BIT_READY_IPV4); + break; + } + } + return (state); } + /* * Inspects icmp packets to see if the message contains different * packet header so we need to alter @addr and @port. */ static int -inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, uint32_t *addr, +inspect_icmp_mbuf(struct mbuf **mp, uint8_t *proto, uint32_t *addr, uint16_t *port) { + struct icmp *icmp; struct ip *ip; - struct tcphdr *tcp; - struct udphdr *udp; - struct icmphdr *icmp; int off; - uint8_t proto; + uint8_t inner_proto; - ip = mtod(*m, struct ip *); /* Outer IP header */ + ip = mtod(*mp, struct ip *); /* Outer IP header */ off = (ip->ip_hl << 2) + ICMP_MINLEN; - if ((*m)->m_len < off) - *m = m_pullup(*m, off); - if (*m == NULL) + if ((*mp)->m_len < off) + *mp = m_pullup(*mp, off); + if (*mp == NULL) return (ENOMEM); - ip = mtod(*m, struct ip *); /* Outer IP header */ - icmp = L3HDR(ip, struct icmphdr *); + ip = mtod(*mp, struct ip *); /* Outer IP header */ + icmp = L3HDR(ip, struct icmp *); switch (icmp->icmp_type) { case ICMP_ECHO: case ICMP_ECHOREPLY: /* Use icmp ID as distinguisher */ - *port = ntohs(*((uint16_t *)(icmp + 1))); + *port = ntohs(icmp->icmp_id); return (0); case ICMP_UNREACH: case ICMP_TIMXCEED: @@ -266,90 +448,133 @@ inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, uint32_t *addr, * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits * of ULP header. 
*/ - if ((*m)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN) + if ((*mp)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN) return (EINVAL); - if ((*m)->m_len < off + sizeof(struct ip) + ICMP_MINLEN) - *m = m_pullup(*m, off + sizeof(struct ip) + ICMP_MINLEN); - if (*m == NULL) + if ((*mp)->m_len < off + sizeof(struct ip) + ICMP_MINLEN) + *mp = m_pullup(*mp, off + sizeof(struct ip) + ICMP_MINLEN); + if (*mp == NULL) return (ENOMEM); - ip = mtodo(*m, off); /* Inner IP header */ - proto = ip->ip_p; + ip = mtodo(*mp, off); /* Inner IP header */ + inner_proto = ip->ip_p; off += ip->ip_hl << 2; /* Skip inner IP header */ *addr = ntohl(ip->ip_src.s_addr); - if ((*m)->m_len < off + ICMP_MINLEN) - *m = m_pullup(*m, off + ICMP_MINLEN); - if (*m == NULL) + if ((*mp)->m_len < off + ICMP_MINLEN) + *mp = m_pullup(*mp, off + ICMP_MINLEN); + if (*mp == NULL) return (ENOMEM); - switch (proto) { + switch (inner_proto) { case IPPROTO_TCP: - tcp = mtodo(*m, off); - *nat_proto = NAT_PROTO_TCP; - *port = ntohs(tcp->th_sport); - return (0); case IPPROTO_UDP: - udp = mtodo(*m, off); - *nat_proto = NAT_PROTO_UDP; - *port = ntohs(udp->uh_sport); + /* Copy source port from the header */ + *port = ntohs(*((uint16_t *)mtodo(*mp, off))); + *proto = inner_proto; return (0); case IPPROTO_ICMP: /* * We will translate only ICMP errors for our ICMP * echo requests. 
*/ - icmp = mtodo(*m, off); + icmp = mtodo(*mp, off); if (icmp->icmp_type != ICMP_ECHO) return (EOPNOTSUPP); - *port = ntohs(*((uint16_t *)(icmp + 1))); + *port = ntohs(icmp->icmp_id); return (0); }; return (EOPNOTSUPP); } -static inline uint8_t -convert_tcp_flags(uint8_t flags) +static struct nat64lsn_state* +nat64lsn_get_state4to6(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias, + in_addr_t faddr, uint16_t port, uint8_t proto) { - uint8_t result; + struct nat64lsn_state *state; + struct nat64lsn_pg *pg; + int chunk_idx, pg_idx, state_idx; - result = flags & (TH_FIN|TH_SYN); - result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */ - result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */ + NAT64LSN_EPOCH_ASSERT(); - return (result); + if (port < NAT64_MIN_PORT) + return (NULL); + /* + * Alias keeps 32 pgchunks for each protocol. + * Each pgchunk has 32 pointers to portgroup. + * Each portgroup has 64 states for ports. + */ + port -= NAT64_MIN_PORT; + chunk_idx = port / 2048; + + port -= chunk_idx * 2048; + pg_idx = port / 64; + state_idx = port % 64; + + /* + * First check in proto_chunkmask that we have allocated PG chunk. + * Then check in proto_pgmask that we have valid PG pointer. 
+ */ + pg = NULL; + switch (proto) { + case IPPROTO_TCP: + if (ISSET32(alias->tcp_chunkmask, chunk_idx) && + ISSET32(alias->tcp_pgmask[chunk_idx], pg_idx)) { + pg = alias->tcp[chunk_idx]->pgptr[pg_idx]; + break; + } + return (NULL); + case IPPROTO_UDP: + if (ISSET32(alias->udp_chunkmask, chunk_idx) && + ISSET32(alias->udp_pgmask[chunk_idx], pg_idx)) { + pg = alias->udp[chunk_idx]->pgptr[pg_idx]; + break; + } + return (NULL); + case IPPROTO_ICMP: + if (ISSET32(alias->icmp_chunkmask, chunk_idx) && + ISSET32(alias->icmp_pgmask[chunk_idx], pg_idx)) { + pg = alias->icmp[chunk_idx]->pgptr[pg_idx]; + break; + } + return (NULL); + default: + panic("%s: wrong proto %d", __func__, proto); + } + if (pg == NULL) + return (NULL); + + if (FREEMASK_ISSET(pg, faddr, state_idx)) + return (NULL); + + state = &STATES_CHUNK(pg, faddr)->state[state_idx]; + ck_pr_fence_load(); + if (ck_pr_load_32(&state->flags) & NAT64_FLAG_READY) + return (state); + return (NULL); } -static NAT64NOINLINE int -nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, - struct mbuf **pm) +static int +nat64lsn_translate4(struct nat64lsn_cfg *cfg, + const struct ipfw_flow_id *f_id, struct mbuf **mp) { struct pfloghdr loghdr, *logdata; struct in6_addr src6; - struct nat64lsn_portgroup *pg; - struct nat64lsn_host *nh; - struct nat64lsn_state *st; - struct ip *ip; - uint32_t addr; - uint16_t state_flags, state_ts; - uint16_t port, lport; - uint8_t nat_proto; + struct nat64lsn_state *state; + struct nat64lsn_alias *alias; + uint32_t addr, flags; + uint16_t port, ts; int ret; + uint8_t proto; addr = f_id->dst_ip; port = f_id->dst_port; + proto = f_id->proto; if (addr < cfg->prefix4 || addr > cfg->pmask4) { NAT64STAT_INC(&cfg->base.stats, nomatch4); return (cfg->nomatch_verdict); } - /* Check if protocol is supported and get its short id */ - nat_proto = nat64lsn_proto_map[f_id->proto]; - if (nat_proto == 0) { - NAT64STAT_INC(&cfg->base.stats, noproto); - return (cfg->nomatch_verdict); - } - 
- /* We might need to handle icmp differently */ - if (nat_proto == NAT_PROTO_ICMP) { - ret = inspect_icmp_mbuf(pm, &nat_proto, &addr, &port); + /* Check if protocol is supported */ + switch (proto) { + case IPPROTO_ICMP: + ret = inspect_icmp_mbuf(mp, &proto, &addr, &port); if (ret != 0) { if (ret == ENOMEM) { NAT64STAT_INC(&cfg->base.stats, nomem); @@ -358,804 +583,640 @@ nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, NAT64STAT_INC(&cfg->base.stats, noproto); return (cfg->nomatch_verdict); } - /* XXX: Check addr for validity */ if (addr < cfg->prefix4 || addr > cfg->pmask4) { NAT64STAT_INC(&cfg->base.stats, nomatch4); return (cfg->nomatch_verdict); } + /* FALLTHROUGH */ + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + NAT64STAT_INC(&cfg->base.stats, noproto); + return (cfg->nomatch_verdict); } - /* Calc portgroup offset w.r.t protocol */ - pg = GET_PORTGROUP(cfg, addr, nat_proto, port); + alias = &ALIAS_BYHASH(cfg, addr); + MPASS(addr == alias->addr); - /* Check if this port is occupied by any portgroup */ - if (pg == NULL) { + /* Check that we have state for this port */ + state = nat64lsn_get_state4to6(cfg, alias, f_id->src_ip, + port, proto); + if (state == NULL) { NAT64STAT_INC(&cfg->base.stats, nomatch4); -#if 0 - DPRINTF(DP_STATE, "NOMATCH %u %d %d (%d)", addr, nat_proto, port, - _GET_PORTGROUP_IDX(cfg, addr, nat_proto, port)); -#endif return (cfg->nomatch_verdict); } /* TODO: Check flags to see if we need to do some static mapping */ - nh = pg->host; - - /* Prepare some fields we might need to update */ - SET_AGE(state_ts); - ip = mtod(*pm, struct ip *); - if (ip->ip_p == IPPROTO_TCP) - state_flags = convert_tcp_flags( - L3HDR(ip, struct tcphdr *)->th_flags); - else - state_flags = 0; - - /* Lock host and get port mapping */ - NAT64_LOCK(nh); - st = &pg->states[port & (NAT64_CHUNK_SIZE - 1)]; - if (st->timestamp != state_ts) - st->timestamp = state_ts; - if ((st->flags & state_flags) != state_flags) - st->flags 
|= state_flags; - lport = htons(st->u.s.lport); + /* Update some state fields if need */ + SET_AGE(ts); + if (f_id->proto == IPPROTO_TCP) + flags = convert_tcp_flags(f_id->_flags); + else + flags = 0; + if (state->timestamp != ts) + state->timestamp = ts; + if ((state->flags & flags) != flags) + state->flags |= flags; - NAT64_UNLOCK(nh); + port = htons(state->sport); + src6 = state->ip6_dst; if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; - nat64lsn_log(logdata, *pm, AF_INET, pg->idx, st->cur.off); + nat64lsn_log(logdata, *mp, AF_INET, state); } else logdata = NULL; + /* + * We already have src6 with embedded address, but it is possible, + * that src_ip is different than state->ip_dst, this is why we + * do embedding again. + */ nat64_embed_ip4(&src6, cfg->base.plat_plen, htonl(f_id->src_ip)); - ret = nat64_do_handle_ip4(*pm, &src6, &nh->addr, lport, + ret = nat64_do_handle_ip4(*mp, &src6, &state->host->addr, port, &cfg->base, logdata); - if (ret == NAT64SKIP) return (cfg->nomatch_verdict); - if (ret == NAT64MFREE) - m_freem(*pm); - *pm = NULL; - + if (ret == NAT64RETURN) + *mp = NULL; return (IP_FW_DENY); } -void -nat64lsn_dump_state(const struct nat64lsn_cfg *cfg, - const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st, - const char *px, int off) -{ - char s[INET6_ADDRSTRLEN], a[INET_ADDRSTRLEN], d[INET_ADDRSTRLEN]; - - if ((V_nat64_debug & DP_STATE) == 0) - return; - inet_ntop(AF_INET6, &pg->host->addr, s, sizeof(s)); - inet_ntop(AF_INET, &pg->aaddr, a, sizeof(a)); - inet_ntop(AF_INET, &st->u.s.faddr, d, sizeof(d)); - - DPRINTF(DP_STATE, "%s: PG %d ST [%p|%d]: %s:%d/%d <%s:%d> " - "%s:%d AGE %d", px, pg->idx, st, off, - s, st->u.s.lport, pg->nat_proto, a, pg->aport + off, - d, st->u.s.fport, GET_AGE(st->timestamp)); -} - /* - * Check if particular TCP state is stale and should be deleted. + * Check if particular state is stale and should be deleted. * Return 1 if true, 0 otherwise. 
*/ static int -nat64lsn_periodic_check_tcp(const struct nat64lsn_cfg *cfg, - const struct nat64lsn_state *st, int age) +nat64lsn_check_state(struct nat64lsn_cfg *cfg, struct nat64lsn_state *state) { - int ttl; - - if (st->flags & NAT64_FLAG_FIN) - ttl = cfg->st_close_ttl; - else if (st->flags & NAT64_FLAG_ESTAB) - ttl = cfg->st_estab_ttl; - else if (st->flags & NAT64_FLAG_SYN) - ttl = cfg->st_syn_ttl; - else - ttl = cfg->st_syn_ttl; + int age, ttl; - if (age > ttl) + /* State was marked as stale in previous pass. */ + if (ISSET32(state->flags, NAT64_BIT_STALE)) return (1); - return (0); -} - -/* - * Check if nat state @st is stale and should be deleted. - * Return 1 if true, 0 otherwise. - */ -static NAT64NOINLINE int -nat64lsn_periodic_chkstate(const struct nat64lsn_cfg *cfg, - const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st) -{ - int age, delete; - - age = GET_AGE(st->timestamp); - delete = 0; - /* Skip immutable records */ - if (st->flags & NAT64_FLAG_RDR) + /* State is not yet initialized, it is going to be READY */ + if (!ISSET32(state->flags, NAT64_BIT_READY_IPV4)) return (0); - switch (pg->nat_proto) { - case NAT_PROTO_TCP: - delete = nat64lsn_periodic_check_tcp(cfg, st, age); - break; - case NAT_PROTO_UDP: - if (age > cfg->st_udp_ttl) - delete = 1; - break; - case NAT_PROTO_ICMP: - if (age > cfg->st_icmp_ttl) - delete = 1; - break; + age = GET_AGE(state->timestamp); + switch (state->proto) { + case IPPROTO_TCP: + if (ISSET32(state->flags, NAT64_BIT_TCP_FIN)) + ttl = cfg->st_close_ttl; + else if (ISSET32(state->flags, NAT64_BIT_TCP_ESTAB)) + ttl = cfg->st_estab_ttl; + else if (ISSET32(state->flags, NAT64_BIT_TCP_SYN)) + ttl = cfg->st_syn_ttl; + else + ttl = cfg->st_syn_ttl; + if (age > ttl) + return (1); + break; + case IPPROTO_UDP: + if (age > cfg->st_udp_ttl) + return (1); + break; + case IPPROTO_ICMP: + if (age > cfg->st_icmp_ttl) + return (1); + break; } - - return (delete); + return (0); } - -/* - * The following structures and 
functions - * are used to perform SLIST_FOREACH_SAFE() - * analog for states identified by struct st_ptr. - */ - -struct st_idx { - struct nat64lsn_portgroup *pg; - struct nat64lsn_state *st; - struct st_ptr sidx_next; -}; - -static struct st_idx * -st_first(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh, - struct st_ptr *sidx, struct st_idx *si) +static int +nat64lsn_maintain_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_pg *pg) { - struct nat64lsn_portgroup *pg; - struct nat64lsn_state *st; - - if (sidx->idx == 0) { - memset(si, 0, sizeof(*si)); - return (si); + struct nat64lsn_state *state; + struct nat64lsn_host *host; + uint64_t freemask; + int c, i, update_age; + + update_age = 0; + for (c = 0; c < pg->chunks_count; c++) { + FREEMASK_COPY(pg, c, freemask); + for (i = 0; i < 64; i++) { + if (ISSET64(freemask, i)) + continue; + state = &STATES_CHUNK(pg, c)->state[i]; + if (nat64lsn_check_state(cfg, state) == 0) { + update_age = 1; + continue; + } + /* + * Expire state: + * 1. Mark as STALE and unlink from host's hash. + * 2. Set bit in freemask. + */ + if (ISSET32(state->flags, NAT64_BIT_STALE)) { + /* + * State was marked as STALE in previous + * pass. Now it is safe to release it. + */ + state->flags = 0; + ck_pr_fence_store(); + FREEMASK_BTS(pg, c, i); + NAT64STAT_INC(&cfg->base.stats, sdeleted); + continue; + } + MPASS(state->flags & NAT64_FLAG_READY); + + host = state->host; + HOST_LOCK(host); + CK_SLIST_REMOVE(&STATE_HASH(host, state->hval), + state, nat64lsn_state, entries); + host->states_count--; + HOST_UNLOCK(host); + + /* Reset READY flag */ + ck_pr_btr_32(&state->flags, NAT64_BIT_READY_IPV4); + /* And set STALE flag */ + ck_pr_bts_32(&state->flags, NAT64_BIT_STALE); + ck_pr_fence_store(); + /* + * Now translate6 will not use this state, wait + * until it become safe for translate4, then mark + * state as free. 
+ */ + } } - pg = PORTGROUP_BYSIDX(cfg, nh, sidx->idx); - st = &pg->states[sidx->off]; + /* + * We have some alive states, update timestamp. + */ + if (update_age) + SET_AGE(pg->timestamp); - si->pg = pg; - si->st = st; - si->sidx_next = st->next; + if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay) + return (0); - return (si); + return (1); } -static struct st_idx * -st_next(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh, - struct st_idx *si) +static void +nat64lsn_expire_portgroups(struct nat64lsn_cfg *cfg, + struct nat64lsn_pg_slist *portgroups) { - struct st_ptr sidx; - struct nat64lsn_portgroup *pg; - struct nat64lsn_state *st; - - sidx = si->sidx_next; - if (sidx.idx == 0) { - memset(si, 0, sizeof(*si)); - si->st = NULL; - si->pg = NULL; - return (si); + struct nat64lsn_alias *alias; + struct nat64lsn_pg *pg, *tpg, *firstpg, **pgptr; + uint32_t *pgmask, *pgidx; + int i, idx; + + for (i = 0; i < 1 << (32 - cfg->plen4); i++) { + alias = &cfg->aliases[i]; + CK_SLIST_FOREACH_SAFE(pg, &alias->portgroups, entries, tpg) { + if (nat64lsn_maintain_pg(cfg, pg) == 0) + continue; + /* Always keep first PG */ + if (pg->base_port == NAT64_MIN_PORT) + continue; + /* + * PG is expired, unlink it and schedule for + * deferred destroying. + */ + idx = (pg->base_port - NAT64_MIN_PORT) / 64; + switch (pg->proto) { + case IPPROTO_TCP: + pgmask = alias->tcp_pgmask; + pgptr = &alias->tcp_pg; + pgidx = &alias->tcp_pgidx; + firstpg = alias->tcp[0]->pgptr[0]; + break; + case IPPROTO_UDP: + pgmask = alias->udp_pgmask; + pgptr = &alias->udp_pg; + pgidx = &alias->udp_pgidx; + firstpg = alias->udp[0]->pgptr[0]; + break; + case IPPROTO_ICMP: + pgmask = alias->icmp_pgmask; + pgptr = &alias->icmp_pg; + pgidx = &alias->icmp_pgidx; + firstpg = alias->icmp[0]->pgptr[0]; + break; + } + /* Reset the corresponding bit in pgmask array. */ + ck_pr_btr_32(&pgmask[idx / 32], idx % 32); + ck_pr_fence_store(); + /* If last used PG points to this PG, reset it. 
*/ + ck_pr_cas_ptr(pgptr, pg, firstpg); + ck_pr_cas_32(pgidx, idx, 0); + /* Unlink PG from alias's chain */ + ALIAS_LOCK(alias); + CK_SLIST_REMOVE(&alias->portgroups, pg, + nat64lsn_pg, entries); + alias->portgroups_count--; + ALIAS_UNLOCK(alias); + /* And link to job's chain for deferred destroying */ + NAT64STAT_INC(&cfg->base.stats, spgdeleted); + CK_SLIST_INSERT_HEAD(portgroups, pg, entries); + } } - - pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx); - st = &pg->states[sidx.off]; - - si->pg = pg; - si->st = st; - si->sidx_next = st->next; - - return (si); -} - -static struct st_idx * -st_save_cond(struct st_idx *si_dst, struct st_idx *si) -{ - if (si->st != NULL) - *si_dst = *si; - - return (si_dst); } -unsigned int -nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh) +static void +nat64lsn_expire_hosts(struct nat64lsn_cfg *cfg, + struct nat64lsn_hosts_slist *hosts) { - struct st_idx si, si_prev; + struct nat64lsn_host *host, *tmp; int i; - unsigned int delcount; - - delcount = 0; - for (i = 0; i < nh->hsize; i++) { - memset(&si_prev, 0, sizeof(si_prev)); - for (st_first(cfg, nh, &nh->phash[i], &si); - si.st != NULL; - st_save_cond(&si_prev, &si), st_next(cfg, nh, &si)) { - if (nat64lsn_periodic_chkstate(cfg, si.pg, si.st) == 0) + + for (i = 0; i < cfg->hosts_hashsize; i++) { + CK_SLIST_FOREACH_SAFE(host, &cfg->hosts_hash[i], + entries, tmp) { + /* Is host was marked in previous call? */ + if (host->flags & NAT64LSN_DEADHOST) { + if (host->states_count > 0) { + host->flags &= ~NAT64LSN_DEADHOST; + continue; + } + /* + * Unlink host from hash table and schedule + * it for deferred destroying. 
+ */ + CFG_LOCK(cfg); + CK_SLIST_REMOVE(&cfg->hosts_hash[i], host, + nat64lsn_host, entries); + cfg->hosts_count--; + CFG_UNLOCK(cfg); + CK_SLIST_INSERT_HEAD(hosts, host, entries); + continue; + } + if (GET_AGE(host->timestamp) < cfg->host_delete_delay) continue; - nat64lsn_dump_state(cfg, si.pg, si.st, "DELETE STATE", - si.st->cur.off); - /* Unlink from hash */ - if (si_prev.st != NULL) - si_prev.st->next = si.st->next; - else - nh->phash[i] = si.st->next; - /* Delete state and free its data */ - PG_MARK_FREE_IDX(si.pg, si.st->cur.off); - memset(si.st, 0, sizeof(struct nat64lsn_state)); - si.st = NULL; - delcount++; - - /* Update portgroup timestamp */ - SET_AGE(si.pg->timestamp); + if (host->states_count > 0) + continue; + /* Mark host as going to be expired in next pass */ + host->flags |= NAT64LSN_DEADHOST; + ck_pr_fence_store(); } } - NAT64STAT_ADD(&cfg->base.stats, sdeleted, delcount); - return (delcount); -} - -/* - * Checks if portgroup is not used and can be deleted, - * Returns 1 if stale, 0 otherwise - */ -static int -stale_pg(const struct nat64lsn_cfg *cfg, const struct nat64lsn_portgroup *pg) -{ - - if (!PG_IS_EMPTY(pg)) - return (0); - if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay) - return (0); - return (1); } -/* - * Checks if host record is not used and can be deleted, - * Returns 1 if stale, 0 otherwise - */ -static int -stale_nh(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh) +static struct nat64lsn_pgchunk* +nat64lsn_expire_pgchunk(struct nat64lsn_cfg *cfg) { - - if (nh->pg_used != 0) - return (0); - if (GET_AGE(nh->timestamp) < cfg->nh_delete_delay) - return (0); - return (1); -} - -struct nat64lsn_periodic_data { - struct nat64lsn_cfg *cfg; - struct nat64lsn_job_head jhead; - int jlen; -}; - -static NAT64NOINLINE int -nat64lsn_periodic_chkhost(struct nat64lsn_host *nh, - struct nat64lsn_periodic_data *d) -{ - struct nat64lsn_portgroup *pg; - struct nat64lsn_job_item *ji; - uint64_t delmask[NAT64LSN_PGPTRNMASK]; - int 
delcount, i; - - delcount = 0; - memset(delmask, 0, sizeof(delmask)); - - if (V_nat64_debug & DP_JQUEUE) { - char a[INET6_ADDRSTRLEN]; - - inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); - DPRINTF(DP_JQUEUE, "Checking %s host %s on cpu %d", - stale_nh(d->cfg, nh) ? "stale" : "non-stale", a, curcpu); - } - if (!stale_nh(d->cfg, nh)) { - /* Non-stale host. Inspect internals */ - NAT64_LOCK(nh); - - /* Stage 1: Check&expire states */ - if (nat64lsn_periodic_chkstates(d->cfg, nh) != 0) - SET_AGE(nh->timestamp); - - /* Stage 2: Check if we need to expire */ - for (i = 0; i < nh->pg_used; i++) { - pg = PORTGROUP_BYSIDX(d->cfg, nh, i + 1); - if (pg == NULL) +#if 0 + struct nat64lsn_alias *alias; + struct nat64lsn_pgchunk *chunk; + uint32_t pgmask; + int i, c; + + for (i = 0; i < 1 << (32 - cfg->plen4); i++) { + alias = &cfg->aliases[i]; + if (GET_AGE(alias->timestamp) < cfg->pgchunk_delete_delay) + continue; + /* Always keep single chunk allocated */ + for (c = 1; c < 32; c++) { + if ((alias->tcp_chunkmask & (1 << c)) == 0) + break; + chunk = ck_pr_load_ptr(&alias->tcp[c]); + if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0) continue; - - /* Check if we can delete portgroup */ - if (stale_pg(d->cfg, pg) == 0) + ck_pr_btr_32(&alias->tcp_chunkmask, c); + ck_pr_fence_load(); + if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0) continue; - - DPRINTF(DP_JQUEUE, "Check PG %d", i); - delmask[i / 64] |= ((uint64_t)1 << (i % 64)); - delcount++; } - - NAT64_UNLOCK(nh); - if (delcount == 0) - return (0); } +#endif + return (NULL); +} - DPRINTF(DP_JQUEUE, "Queueing %d portgroups for deleting", delcount); - /* We have something to delete - add it to queue */ - ji = nat64lsn_create_job(d->cfg, NULL, JTYPE_DELPORTGROUP); - if (ji == NULL) - return (0); - - ji->haddr = nh->addr; - ji->delcount = delcount; - memcpy(ji->delmask, delmask, sizeof(ji->delmask)); - - TAILQ_INSERT_TAIL(&d->jhead, ji, next); - d->jlen++; - return (0); +#if 0 +static void +nat64lsn_maintain_hosts(struct nat64lsn_cfg *cfg) 
+{ + struct nat64lsn_host *h; + struct nat64lsn_states_slist *hash; + int i, j, hsize; + + for (i = 0; i < cfg->hosts_hashsize; i++) { + CK_SLIST_FOREACH(h, &cfg->hosts_hash[i], entries) { + if (h->states_count / 2 < h->states_hashsize || + h->states_hashsize >= NAT64LSN_MAX_HSIZE) + continue; + hsize = h->states_hashsize * 2; + hash = malloc(sizeof(*hash)* hsize, M_NOWAIT); + if (hash == NULL) + continue; + for (j = 0; j < hsize; j++) + CK_SLIST_INIT(&hash[i]); + + ck_pr_bts_32(&h->flags, NAT64LSN_GROWHASH); + } + } } +#endif /* * This procedure is used to perform various maintance - * on dynamic hash list. Currently it is called every second. + * on dynamic hash list. Currently it is called every 4 seconds. */ static void nat64lsn_periodic(void *data) { - struct ip_fw_chain *ch; - IPFW_RLOCK_TRACKER; + struct nat64lsn_job_item *ji; struct nat64lsn_cfg *cfg; - struct nat64lsn_periodic_data d; - struct nat64lsn_host *nh, *tmp; cfg = (struct nat64lsn_cfg *) data; - ch = cfg->ch; CURVNET_SET(cfg->vp); - - memset(&d, 0, sizeof(d)); - d.cfg = cfg; - TAILQ_INIT(&d.jhead); - - IPFW_RLOCK(ch); - - /* Stage 1: foreach host, check all its portgroups */ - I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_periodic_chkhost, &d); - - /* Enqueue everything we have requested */ - nat64lsn_enqueue_jobs(cfg, &d.jhead, d.jlen); - + if (cfg->hosts_count > 0) { + ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT); + if (ji != NULL) { + ji->jtype = JTYPE_DESTROY; + CK_SLIST_INIT(&ji->hosts); + CK_SLIST_INIT(&ji->portgroups); + nat64lsn_expire_hosts(cfg, &ji->hosts); + nat64lsn_expire_portgroups(cfg, &ji->portgroups); + ji->pgchunk = nat64lsn_expire_pgchunk(cfg); + NAT64LSN_EPOCH_CALL(&ji->epoch_ctx, + nat64lsn_job_destroy); + } else + NAT64STAT_INC(&cfg->base.stats, jnomem); + } callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY); - - IPFW_RUNLOCK(ch); - CURVNET_RESTORE(); } -static NAT64NOINLINE void -reinject_mbuf(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) -{ - - if (ji->m == NULL) 
- return; - - /* Request has failed or packet type is wrong */ - if (ji->f_id.addr_type != 6 || ji->done == 0) { - m_freem(ji->m); - ji->m = NULL; - NAT64STAT_INC(&cfg->base.stats, dropped); - DPRINTF(DP_DROPS, "mbuf dropped: type %d, done %d", - ji->jtype, ji->done); - return; - } - - /* - * XXX: Limit recursion level - */ - - NAT64STAT_INC(&cfg->base.stats, jreinjected); - DPRINTF(DP_JQUEUE, "Reinject mbuf"); - nat64lsn_translate6(cfg, &ji->f_id, &ji->m); -} - -static void -destroy_portgroup(struct nat64lsn_portgroup *pg) -{ - - DPRINTF(DP_OBJ, "DESTROY PORTGROUP %d %p", pg->idx, pg); - uma_zfree(nat64lsn_pg_zone, pg); -} - -static NAT64NOINLINE int -alloc_portgroup(struct nat64lsn_job_item *ji) -{ - struct nat64lsn_portgroup *pg; - - pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT); - if (pg == NULL) - return (1); - - if (ji->needs_idx != 0) { - ji->spare_idx = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT); - /* Failed alloc isn't always fatal, so don't check */ - } - memset(&pg->freemask, 0xFF, sizeof(pg->freemask)); - pg->nat_proto = ji->nat_proto; - ji->pg = pg; - return (0); - -} - -static void -destroy_host6(struct nat64lsn_host *nh) +#define ALLOC_ERROR(stage, type) ((stage) ? 
10 * (type) + (stage): 0) +#define HOST_ERROR(stage) ALLOC_ERROR(stage, 1) +#define PG_ERROR(stage) ALLOC_ERROR(stage, 2) +static int +nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { char a[INET6_ADDRSTRLEN]; + struct nat64lsn_aliaslink *link; + struct nat64lsn_host *host; + struct nat64lsn_state *state; + uint32_t hval, data[2]; int i; - inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); - DPRINTF(DP_OBJ, "DESTROY HOST %s %p (pg used %d)", a, nh, - nh->pg_used); - NAT64_LOCK_DESTROY(nh); - for (i = 0; i < nh->pg_allocated / NAT64LSN_PGIDX_CHUNK; i++) - uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, i)); - uma_zfree(nat64lsn_host_zone, nh); -} - -static NAT64NOINLINE int -alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) -{ - struct nat64lsn_host *nh; - char a[INET6_ADDRSTRLEN]; - - nh = uma_zalloc(nat64lsn_host_zone, M_NOWAIT); - if (nh == NULL) - return (1); - PORTGROUP_CHUNK(nh, 0) = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT); - if (PORTGROUP_CHUNK(nh, 0) == NULL) { - uma_zfree(nat64lsn_host_zone, nh); - return (2); - } - if (alloc_portgroup(ji) != 0) { - NAT64STAT_INC(&cfg->base.stats, jportfails); - uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, 0)); - uma_zfree(nat64lsn_host_zone, nh); - return (3); + /* Check that host was not yet added. */ + NAT64LSN_EPOCH_ASSERT(); + CK_SLIST_FOREACH(host, &HOSTS(cfg, ji->src6_hval), entries) { + if (IN6_ARE_ADDR_EQUAL(&ji->f_id.src_ip6, &host->addr)) { + /* The host was allocated in previous call. 
*/ + ji->host = host; + goto get_state; + } } - NAT64_LOCK_INIT(nh); - nh->addr = ji->haddr; - nh->hsize = NAT64LSN_HSIZE; /* XXX: hardcoded size */ - nh->pg_allocated = NAT64LSN_PGIDX_CHUNK; - nh->pg_used = 0; - ji->nh = nh; - - inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); - DPRINTF(DP_OBJ, "ALLOC HOST %s %p", a, ji->nh); - return (0); -} - -/* - * Finds free @pg index inside @nh - */ -static NAT64NOINLINE int -find_nh_pg_idx(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh, int *idx) -{ - int i; + host = ji->host = uma_zalloc(nat64lsn_host_zone, M_NOWAIT); + if (ji->host == NULL) + return (HOST_ERROR(1)); - for (i = 0; i < nh->pg_allocated; i++) { - if (PORTGROUP_BYSIDX(cfg, nh, i + 1) == NULL) { - *idx = i; - return (0); - } + host->states_hashsize = NAT64LSN_HSIZE; + host->states_hash = malloc(sizeof(struct nat64lsn_states_slist) * + host->states_hashsize, M_NAT64LSN, M_NOWAIT); + if (host->states_hash == NULL) { + uma_zfree(nat64lsn_host_zone, host); + return (HOST_ERROR(2)); } - return (1); -} -static NAT64NOINLINE int -attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) -{ - char a[INET6_ADDRSTRLEN]; - struct nat64lsn_host *nh; - - I6HASH_FIND(cfg, nh, &ji->haddr); - if (nh == NULL) { - /* Add new host to list */ - nh = ji->nh; - I6HASH_INSERT(cfg, nh); - cfg->ihcount++; - ji->nh = NULL; - - inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); - DPRINTF(DP_OBJ, "ATTACH HOST %s %p", a, nh); - /* - * Try to add portgroup. - * Note it will automatically set - * 'done' on ji if successful. 
- */ - if (attach_portgroup(cfg, ji) != 0) { - DPRINTF(DP_DROPS, "%s %p failed to attach PG", - a, nh); - NAT64STAT_INC(&cfg->base.stats, jportfails); - return (1); - } - return (0); + link = uma_zalloc(nat64lsn_aliaslink_zone, M_NOWAIT); + if (link == NULL) { + free(host->states_hash, M_NAT64LSN); + uma_zfree(nat64lsn_host_zone, host); + return (HOST_ERROR(3)); } + /* Initialize */ + HOST_LOCK_INIT(host); + SET_AGE(host->timestamp); + host->addr = ji->f_id.src_ip6; + host->hval = ji->src6_hval; + host->flags = 0; + host->states_count = 0; + host->states_hashsize = NAT64LSN_HSIZE; + CK_SLIST_INIT(&host->aliases); + for (i = 0; i < host->states_hashsize; i++) + CK_SLIST_INIT(&host->states_hash[i]); + + /* Determine alias from flow hash. */ + hval = ALIASLINK_HVAL(cfg, &ji->f_id); + link->alias = &ALIAS_BYHASH(cfg, hval); + CK_SLIST_INSERT_HEAD(&host->aliases, link, host_entries); + + ALIAS_LOCK(link->alias); + CK_SLIST_INSERT_HEAD(&link->alias->hosts, link, alias_entries); + link->alias->hosts_count++; + ALIAS_UNLOCK(link->alias); + + CFG_LOCK(cfg); + CK_SLIST_INSERT_HEAD(&HOSTS(cfg, ji->src6_hval), host, entries); + cfg->hosts_count++; + CFG_UNLOCK(cfg); + +get_state: + data[0] = ji->faddr; + data[1] = (ji->f_id.dst_port << 16) | ji->port; + ji->state_hval = hval = STATE_HVAL(cfg, data); + state = nat64lsn_get_state6to4(cfg, host, &ji->f_id, hval, + ji->faddr, ji->port, ji->proto); /* - * nh isn't NULL. This probably means we had several simultaneous - * host requests. The previous one request has already attached - * this host. Requeue attached mbuf and mark job as done, but - * leave nh and pg pointers not changed, so nat64lsn_do_request() - * will release all allocated resources. + * We failed to obtain new state, used alias needs new PG. + * XXX: or another alias should be used. 
*/ - inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); - DPRINTF(DP_OBJ, "%s %p is already attached as %p", - a, ji->nh, nh); + if (state == NULL) { + /* Try to allocate new PG */ + if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0)) + return (HOST_ERROR(4)); + /* We assume that nat64lsn_alloc_pg() got state */ + } else + ji->state = state; + ji->done = 1; - return (0); + DPRINTF(DP_OBJ, "ALLOC HOST %s %p", + inet_ntop(AF_INET6, &host->addr, a, sizeof(a)), host); + return (HOST_ERROR(0)); } -static NAT64NOINLINE int -find_pg_place_addr(const struct nat64lsn_cfg *cfg, int addr_off, - int nat_proto, uint16_t *aport, int *ppg_idx) +static int +nat64lsn_find_pg_place(uint32_t *data) { - int j, pg_idx; - - pg_idx = addr_off * _ADDR_PG_COUNT + - (nat_proto - 1) * _ADDR_PG_PROTO_COUNT; + int i; - for (j = NAT64_MIN_CHUNK; j < _ADDR_PG_PROTO_COUNT; j++) { - if (cfg->pg[pg_idx + j] != NULL) + for (i = 0; i < 32; i++) { + if (~data[i] == 0) continue; - - *aport = j * NAT64_CHUNK_SIZE; - *ppg_idx = pg_idx + j; - return (1); + return (i * 32 + ffs(~data[i]) - 1); } - - return (0); + return (-1); } -/* - * XXX: This function needs to be rewritten to - * use free bitmask for faster pg finding, - * additionally, it should take into consideration - * a) randomization and - * b) previous addresses allocated to given nat instance - * - */ -static NAT64NOINLINE int -find_portgroup_place(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji, - uint32_t *aaddr, uint16_t *aport, int *ppg_idx) +static int +nat64lsn_alloc_proto_pg(struct nat64lsn_cfg *cfg, + struct nat64lsn_alias *alias, uint32_t *chunkmask, + uint32_t *pgmask, struct nat64lsn_pgchunk **chunks, + struct nat64lsn_pg **pgptr, uint8_t proto) { - int i, nat_proto; - - /* - * XXX: Use bitmask index to be able to find/check if IP address - * has some spare pg's - */ - nat_proto = ji->nat_proto; - - /* First, try to use same address */ - if (ji->aaddr != 0) { - i = ntohl(ji->aaddr) - cfg->prefix4; - if (find_pg_place_addr(cfg, i, 
nat_proto, aport, - ppg_idx) != 0){ - /* Found! */ - *aaddr = htonl(cfg->prefix4 + i); - return (0); - } - } - - /* Next, try to use random address based on flow hash */ - i = ji->fhash % (1 << (32 - cfg->plen4)); - if (find_pg_place_addr(cfg, i, nat_proto, aport, ppg_idx) != 0) { - /* Found! */ - *aaddr = htonl(cfg->prefix4 + i); - return (0); + struct nat64lsn_pg *pg; + int i, pg_idx, chunk_idx; + + /* Find place in pgchunk where PG can be added */ + pg_idx = nat64lsn_find_pg_place(pgmask); + if (pg_idx < 0) /* no more PGs */ + return (PG_ERROR(1)); + /* Check that we have allocated pgchunk for given PG index */ + chunk_idx = pg_idx / 32; + if (!ISSET32(*chunkmask, chunk_idx)) { + chunks[chunk_idx] = uma_zalloc(nat64lsn_pgchunk_zone, + M_NOWAIT); + if (chunks[chunk_idx] == NULL) + return (PG_ERROR(2)); + ck_pr_bts_32(chunkmask, chunk_idx); + ck_pr_fence_store(); } - - - /* Last one: simply find ANY available */ - for (i = 0; i < (1 << (32 - cfg->plen4)); i++) { - if (find_pg_place_addr(cfg, i, nat_proto, aport, - ppg_idx) != 0){ - /* Found! 
*/ - *aaddr = htonl(cfg->prefix4 + i); - return (0); + /* Allocate PG and states chunks */ + pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT); + if (pg == NULL) + return (PG_ERROR(3)); + pg->chunks_count = cfg->states_chunks; + if (pg->chunks_count > 1) { + pg->freemask_chunk = malloc(pg->chunks_count * + sizeof(uint64_t), M_NAT64LSN, M_NOWAIT); + if (pg->freemask_chunk == NULL) { + uma_zfree(nat64lsn_pg_zone, pg); + return (PG_ERROR(4)); + } + pg->states_chunk = malloc(pg->chunks_count * + sizeof(struct nat64lsn_states_chunk *), M_NAT64LSN, + M_NOWAIT | M_ZERO); + if (pg->states_chunk == NULL) { + free(pg->freemask_chunk, M_NAT64LSN); + uma_zfree(nat64lsn_pg_zone, pg); + return (PG_ERROR(5)); } + for (i = 0; i < pg->chunks_count; i++) { + pg->states_chunk[i] = uma_zalloc( + nat64lsn_state_zone, M_NOWAIT); + if (pg->states_chunk[i] == NULL) + goto states_failed; + } + memset(pg->freemask_chunk, 0xff, + sizeof(uint64_t) * pg->chunks_count); + } else { + pg->states = uma_zalloc(nat64lsn_state_zone, M_NOWAIT); + if (pg->states == NULL) { + uma_zfree(nat64lsn_pg_zone, pg); + return (PG_ERROR(6)); + } + memset(&pg->freemask64, 0xff, sizeof(uint64_t)); } - return (1); + /* Initialize PG and hook it to pgchunk */ + SET_AGE(pg->timestamp); + pg->proto = proto; + pg->base_port = NAT64_MIN_PORT + 64 * pg_idx; + ck_pr_store_ptr(&chunks[chunk_idx]->pgptr[pg_idx % 32], pg); + ck_pr_fence_store(); + ck_pr_bts_32(&pgmask[pg_idx / 32], pg_idx % 32); + ck_pr_store_ptr(pgptr, pg); + + ALIAS_LOCK(alias); + CK_SLIST_INSERT_HEAD(&alias->portgroups, pg, entries); + SET_AGE(alias->timestamp); + alias->portgroups_count++; + ALIAS_UNLOCK(alias); + NAT64STAT_INC(&cfg->base.stats, spgcreated); + return (PG_ERROR(0)); + +states_failed: + for (i = 0; i < pg->chunks_count; i++) + uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]); + free(pg->freemask_chunk, M_NAT64LSN); + free(pg->states_chunk, M_NAT64LSN); + uma_zfree(nat64lsn_pg_zone, pg); + return (PG_ERROR(7)); } -static NAT64NOINLINE int 
-attach_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) +static int +nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { - char a[INET6_ADDRSTRLEN]; - struct nat64lsn_portgroup *pg; - struct nat64lsn_host *nh; - uint32_t aaddr; - uint16_t aport; - int nh_pg_idx, pg_idx; + struct nat64lsn_aliaslink *link; + struct nat64lsn_alias *alias; + int ret; - pg = ji->pg; + link = nat64lsn_get_aliaslink(cfg, ji->host, &ji->f_id); + if (link == NULL) + return (PG_ERROR(1)); /* - * Find source host and bind: we can't rely on - * pg->host + * TODO: check that we did not already allocated PG in + * previous call. */ - I6HASH_FIND(cfg, nh, &ji->haddr); - if (nh == NULL) - return (1); - /* Find spare port chunk */ - if (find_portgroup_place(cfg, ji, &aaddr, &aport, &pg_idx) != 0) { - inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); - DPRINTF(DP_OBJ | DP_DROPS, "empty PG not found for %s", a); - return (2); + ret = 0; + alias = link->alias; + /* Find place in pgchunk where PG can be added */ + switch (ji->proto) { + case IPPROTO_TCP: + ret = nat64lsn_alloc_proto_pg(cfg, alias, + &alias->tcp_chunkmask, alias->tcp_pgmask, + alias->tcp, &alias->tcp_pg, ji->proto); + break; + case IPPROTO_UDP: + ret = nat64lsn_alloc_proto_pg(cfg, alias, + &alias->udp_chunkmask, alias->udp_pgmask, + alias->udp, &alias->udp_pg, ji->proto); + break; + case IPPROTO_ICMP: + ret = nat64lsn_alloc_proto_pg(cfg, alias, + &alias->icmp_chunkmask, alias->icmp_pgmask, + alias->icmp, &alias->icmp_pg, ji->proto); + break; + default: + panic("%s: wrong proto %d", __func__, ji->proto); } - - /* Expand PG indexes if needed */ - if (nh->pg_allocated < cfg->max_chunks && ji->spare_idx != NULL) { - PORTGROUP_CHUNK(nh, nh->pg_allocated / NAT64LSN_PGIDX_CHUNK) = - ji->spare_idx; - nh->pg_allocated += NAT64LSN_PGIDX_CHUNK; - ji->spare_idx = NULL; + if (ret == PG_ERROR(1)) { + /* + * PG_ERROR(1) means that alias lacks free PGs + * XXX: try next alias. 
+ */ + printf("NAT64LSN: %s: failed to obtain PG\n", + __func__); + return (ret); } - - /* Find empty index to store PG in the @nh */ - if (find_nh_pg_idx(cfg, nh, &nh_pg_idx) != 0) { - inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); - DPRINTF(DP_OBJ | DP_DROPS, "free PG index not found for %s", - a); - return (3); + if (ret == PG_ERROR(0)) { + ji->state = nat64lsn_get_state6to4(cfg, ji->host, &ji->f_id, + ji->state_hval, ji->faddr, ji->port, ji->proto); + if (ji->state == NULL) + ret = PG_ERROR(8); + else + ji->done = 1; } - - cfg->pg[pg_idx] = pg; - cfg->protochunks[pg->nat_proto]++; - NAT64STAT_INC(&cfg->base.stats, spgcreated); - - pg->aaddr = aaddr; - pg->aport = aport; - pg->host = nh; - pg->idx = pg_idx; - SET_AGE(pg->timestamp); - - PORTGROUP_BYSIDX(cfg, nh, nh_pg_idx + 1) = pg; - if (nh->pg_used == nh_pg_idx) - nh->pg_used++; - SET_AGE(nh->timestamp); - - ji->pg = NULL; - ji->done = 1; - - return (0); + return (ret); } -static NAT64NOINLINE void -consider_del_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) +static void +nat64lsn_do_request(void *data) { - struct nat64lsn_host *nh, *nh_tmp; - struct nat64lsn_portgroup *pg, *pg_list[256]; - int i, pg_lidx, idx; + struct epoch_tracker et; + struct nat64lsn_job_head jhead; + struct nat64lsn_job_item *ji, *ji2; + struct nat64lsn_cfg *cfg; + int jcount; + uint8_t flags; - /* Find source host */ - I6HASH_FIND(cfg, nh, &ji->haddr); - if (nh == NULL || nh->pg_used == 0) + cfg = (struct nat64lsn_cfg *)data; + if (cfg->jlen == 0) return; - memset(pg_list, 0, sizeof(pg_list)); - pg_lidx = 0; - - NAT64_LOCK(nh); - - for (i = nh->pg_used - 1; i >= 0; i--) { - if ((ji->delmask[i / 64] & ((uint64_t)1 << (i % 64))) == 0) - continue; - pg = PORTGROUP_BYSIDX(cfg, nh, i + 1); - - /* Check that PG isn't busy. 
*/ - if (stale_pg(cfg, pg) == 0) - continue; - - /* DO delete */ - pg_list[pg_lidx++] = pg; - PORTGROUP_BYSIDX(cfg, nh, i + 1) = NULL; - - idx = _GET_PORTGROUP_IDX(cfg, ntohl(pg->aaddr), pg->nat_proto, - pg->aport); - KASSERT(cfg->pg[idx] == pg, ("Non matched pg")); - cfg->pg[idx] = NULL; - cfg->protochunks[pg->nat_proto]--; - NAT64STAT_INC(&cfg->base.stats, spgdeleted); - - /* Decrease pg_used */ - while (nh->pg_used > 0 && - PORTGROUP_BYSIDX(cfg, nh, nh->pg_used) == NULL) - nh->pg_used--; - - /* Check if on-stack buffer has ended */ - if (pg_lidx == nitems(pg_list)) - break; - } - - NAT64_UNLOCK(nh); - - if (stale_nh(cfg, nh)) { - I6HASH_REMOVE(cfg, nh, nh_tmp, &ji->haddr); - KASSERT(nh != NULL, ("Unable to find address")); - cfg->ihcount--; - ji->nh = nh; - I6HASH_FIND(cfg, nh, &ji->haddr); - KASSERT(nh == NULL, ("Failed to delete address")); - } - - /* TODO: Delay freeing portgroups */ - while (pg_lidx > 0) { - pg_lidx--; - NAT64STAT_INC(&cfg->base.stats, spgdeleted); - destroy_portgroup(pg_list[pg_lidx]); - } -} - -/* - * Main request handler. - * Responsible for handling jqueue, e.g. - * creating new hosts, addind/deleting portgroups. 
- */ -static NAT64NOINLINE void -nat64lsn_do_request(void *data) -{ - IPFW_RLOCK_TRACKER; - struct nat64lsn_job_head jhead; - struct nat64lsn_job_item *ji; - int jcount, nhsize; - struct nat64lsn_cfg *cfg = (struct nat64lsn_cfg *) data; - struct ip_fw_chain *ch; - int delcount; - CURVNET_SET(cfg->vp); - - TAILQ_INIT(&jhead); - - /* XXX: We're running unlocked here */ - - ch = cfg->ch; - delcount = 0; - IPFW_RLOCK(ch); + STAILQ_INIT(&jhead); /* Grab queue */ JQUEUE_LOCK(); - TAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item, next); + STAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item); jcount = cfg->jlen; cfg->jlen = 0; JQUEUE_UNLOCK(); - /* check if we need to resize hash */ - nhsize = 0; - if (cfg->ihcount > cfg->ihsize && cfg->ihsize < 65536) { - nhsize = cfg->ihsize; - for ( ; cfg->ihcount > nhsize && nhsize < 65536; nhsize *= 2) - ; - } else if (cfg->ihcount < cfg->ihsize * 4) { - nhsize = cfg->ihsize; - for ( ; cfg->ihcount < nhsize * 4 && nhsize > 32; nhsize /= 2) - ; - } - - IPFW_RUNLOCK(ch); - - if (TAILQ_EMPTY(&jhead)) { - CURVNET_RESTORE(); - return; - } + /* TODO: check if we need to resize hash */ NAT64STAT_INC(&cfg->base.stats, jcalls); DPRINTF(DP_JQUEUE, "count=%d", jcount); @@ -1169,442 +1230,283 @@ nat64lsn_do_request(void *data) * TODO: Limit per-call number of items */ - /* Pre-allocate everything for entire chain */ - TAILQ_FOREACH(ji, &jhead, next) { + NAT64LSN_EPOCH_ENTER(et); + STAILQ_FOREACH(ji, &jhead, entries) { switch (ji->jtype) { - case JTYPE_NEWHOST: - if (alloc_host6(cfg, ji) != 0) - NAT64STAT_INC(&cfg->base.stats, - jhostfails); - break; - case JTYPE_NEWPORTGROUP: - if (alloc_portgroup(ji) != 0) - NAT64STAT_INC(&cfg->base.stats, - jportfails); - break; - case JTYPE_DELPORTGROUP: - delcount += ji->delcount; - break; - default: - break; + case JTYPE_NEWHOST: + if (nat64lsn_alloc_host(cfg, ji) != HOST_ERROR(0)) + NAT64STAT_INC(&cfg->base.stats, jhostfails); + break; + case JTYPE_NEWPORTGROUP: + if (nat64lsn_alloc_pg(cfg, ji) != 
PG_ERROR(0)) + NAT64STAT_INC(&cfg->base.stats, jportfails); + break; + default: + continue; } - } - - /* - * TODO: Alloc hew hash - */ - nhsize = 0; - if (nhsize > 0) { - /* XXX: */ - } - - /* Apply all changes in batch */ - IPFW_UH_WLOCK(ch); - IPFW_WLOCK(ch); - - TAILQ_FOREACH(ji, &jhead, next) { - switch (ji->jtype) { - case JTYPE_NEWHOST: - if (ji->nh != NULL) - attach_host6(cfg, ji); - break; - case JTYPE_NEWPORTGROUP: - if (ji->pg != NULL && - attach_portgroup(cfg, ji) != 0) - NAT64STAT_INC(&cfg->base.stats, - jportfails); - break; - case JTYPE_DELPORTGROUP: - consider_del_portgroup(cfg, ji); - break; + if (ji->done != 0) { + flags = ji->proto != IPPROTO_TCP ? 0 : + convert_tcp_flags(ji->f_id._flags); + nat64lsn_translate6_internal(cfg, &ji->m, + ji->state, flags); + NAT64STAT_INC(&cfg->base.stats, jreinjected); } } + NAT64LSN_EPOCH_EXIT(et); - if (nhsize > 0) { - /* XXX: Move everything to new hash */ - } - - IPFW_WUNLOCK(ch); - IPFW_UH_WUNLOCK(ch); - - /* Flush unused entries */ - while (!TAILQ_EMPTY(&jhead)) { - ji = TAILQ_FIRST(&jhead); - TAILQ_REMOVE(&jhead, ji, next); - if (ji->nh != NULL) - destroy_host6(ji->nh); - if (ji->pg != NULL) - destroy_portgroup(ji->pg); - if (ji->m != NULL) - reinject_mbuf(cfg, ji); - if (ji->spare_idx != NULL) - uma_zfree(nat64lsn_pgidx_zone, ji->spare_idx); - free(ji, M_IPFW); + ji = STAILQ_FIRST(&jhead); + while (ji != NULL) { + ji2 = STAILQ_NEXT(ji, entries); + /* + * In any case we must free mbuf if + * translator did not consumed it. + */ + m_freem(ji->m); + uma_zfree(nat64lsn_job_zone, ji); + ji = ji2; } CURVNET_RESTORE(); } -static NAT64NOINLINE struct nat64lsn_job_item * -nat64lsn_create_job(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, - int jtype) +static struct nat64lsn_job_item * +nat64lsn_create_job(struct nat64lsn_cfg *cfg, int jtype) { struct nat64lsn_job_item *ji; - struct in6_addr haddr; - uint8_t nat_proto; /* - * Do not try to lock possibly contested mutex if we're near the limit. 
- * Drop packet instead. + * Do not try to lock possibly contested mutex if we're near the + * limit. Drop packet instead. */ - if (cfg->jlen >= cfg->jmaxlen) { + ji = NULL; + if (cfg->jlen >= cfg->jmaxlen) NAT64STAT_INC(&cfg->base.stats, jmaxlen); - return (NULL); - } - - memset(&haddr, 0, sizeof(haddr)); - nat_proto = 0; - if (f_id != NULL) { - haddr = f_id->src_ip6; - nat_proto = nat64lsn_proto_map[f_id->proto]; - - DPRINTF(DP_JQUEUE, "REQUEST pg nat_proto %d on proto %d", - nat_proto, f_id->proto); - - if (nat_proto == 0) - return (NULL); + else { + ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT); + if (ji == NULL) + NAT64STAT_INC(&cfg->base.stats, jnomem); } - - ji = malloc(sizeof(struct nat64lsn_job_item), M_IPFW, - M_NOWAIT | M_ZERO); - if (ji == NULL) { - NAT64STAT_INC(&cfg->base.stats, jnomem); - return (NULL); - } - - ji->jtype = jtype; - - if (f_id != NULL) { - ji->f_id = *f_id; - ji->haddr = haddr; - ji->nat_proto = nat_proto; + NAT64STAT_INC(&cfg->base.stats, dropped); + DPRINTF(DP_DROPS, "failed to create job"); + } else { + ji->jtype = jtype; + ji->done = 0; } - return (ji); } -static NAT64NOINLINE void +static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { - if (ji == NULL) - return; - JQUEUE_LOCK(); - TAILQ_INSERT_TAIL(&cfg->jhead, ji, next); - cfg->jlen++; + STAILQ_INSERT_TAIL(&cfg->jhead, ji, entries); NAT64STAT_INC(&cfg->base.stats, jrequests); + cfg->jlen++; if (callout_pending(&cfg->jcallout) == 0) callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg); JQUEUE_UNLOCK(); } -static NAT64NOINLINE void -nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg, - struct nat64lsn_job_head *jhead, int jlen) -{ - - if (TAILQ_EMPTY(jhead)) - return; - - /* Attach current queue to execution one */ - JQUEUE_LOCK(); - TAILQ_CONCAT(&cfg->jhead, jhead, next); - cfg->jlen += jlen; - NAT64STAT_ADD(&cfg->base.stats, jrequests, jlen); - - if (callout_pending(&cfg->jcallout) == 0) - callout_reset(&cfg->jcallout, 1, 
nat64lsn_do_request, cfg); - JQUEUE_UNLOCK(); -} - -static unsigned int -flow6_hash(const struct ipfw_flow_id *f_id) +static void +nat64lsn_job_destroy(epoch_context_t ctx) { - unsigned char hbuf[36]; - - memcpy(hbuf, &f_id->dst_ip6, 16); - memcpy(&hbuf[16], &f_id->src_ip6, 16); - memcpy(&hbuf[32], &f_id->dst_port, 2); - memcpy(&hbuf[32], &f_id->src_port, 2); + struct nat64lsn_job_item *ji; + struct nat64lsn_host *host; + struct nat64lsn_pg *pg; + int i; - return (djb_hash(hbuf, sizeof(hbuf))); + ji = __containerof(ctx, struct nat64lsn_job_item, epoch_ctx); + MPASS(ji->jtype == JTYPE_DESTROY); + while (!CK_SLIST_EMPTY(&ji->hosts)) { + host = CK_SLIST_FIRST(&ji->hosts); + CK_SLIST_REMOVE_HEAD(&ji->hosts, entries); + if (host->states_count > 0) { + /* + * XXX: The state has been created + * during host deletion. + */ + printf("NAT64LSN: %s: destroying host with %d " + "states\n", __func__, host->states_count); + } + nat64lsn_destroy_host(host); + } + while (!CK_SLIST_EMPTY(&ji->portgroups)) { + pg = CK_SLIST_FIRST(&ji->portgroups); + CK_SLIST_REMOVE_HEAD(&ji->portgroups, entries); + for (i = 0; i < pg->chunks_count; i++) { + if (FREEMASK_BITCOUNT(pg, i) != 64) { + /* + * XXX: The state has been created during + * PG deletion. 
+ */ + printf("NAT64LSN: %s: destroying PG %p " + "with non-empty chunk %d\n", __func__, + pg, i); + } + } + nat64lsn_destroy_pg(pg); + } + uma_zfree(nat64lsn_pgchunk_zone, ji->pgchunk); + uma_zfree(nat64lsn_job_zone, ji); } -static NAT64NOINLINE int +static int nat64lsn_request_host(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, struct mbuf **pm) + const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval, + in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_job_item *ji; - struct mbuf *m; - m = *pm; - *pm = NULL; + ji = nat64lsn_create_job(cfg, JTYPE_NEWHOST); + if (ji != NULL) { + ji->m = *mp; + ji->f_id = *f_id; + ji->faddr = faddr; + ji->port = port; + ji->proto = proto; + ji->src6_hval = hval; - ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWHOST); - if (ji == NULL) { - m_freem(m); - NAT64STAT_INC(&cfg->base.stats, dropped); - DPRINTF(DP_DROPS, "failed to create job"); - } else { - ji->m = m; - /* Provide pseudo-random value based on flow */ - ji->fhash = flow6_hash(f_id); nat64lsn_enqueue_job(cfg, ji); NAT64STAT_INC(&cfg->base.stats, jhostsreq); + *mp = NULL; } - return (IP_FW_DENY); } -static NAT64NOINLINE int -nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr, - int needs_idx) +static int +nat64lsn_request_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host, + const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval, + in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_job_item *ji; - struct mbuf *m; - m = *pm; - *pm = NULL; + ji = nat64lsn_create_job(cfg, JTYPE_NEWPORTGROUP); + if (ji != NULL) { + ji->m = *mp; + ji->f_id = *f_id; + ji->faddr = faddr; + ji->port = port; + ji->proto = proto; + ji->state_hval = hval; + ji->host = host; - ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWPORTGROUP); - if (ji == NULL) { - m_freem(m); - NAT64STAT_INC(&cfg->base.stats, dropped); - DPRINTF(DP_DROPS, "failed to create job"); - } else { - 
ji->m = m; - /* Provide pseudo-random value based on flow */ - ji->fhash = flow6_hash(f_id); - ji->aaddr = aaddr; - ji->needs_idx = needs_idx; nat64lsn_enqueue_job(cfg, ji); NAT64STAT_INC(&cfg->base.stats, jportreq); + *mp = NULL; } - return (IP_FW_DENY); } -static NAT64NOINLINE struct nat64lsn_state * -nat64lsn_create_state(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh, - int nat_proto, struct nat64lsn_state *kst, uint32_t *aaddr) +static int +nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, struct mbuf **mp, + struct nat64lsn_state *state, uint8_t flags) { - struct nat64lsn_portgroup *pg; - struct nat64lsn_state *st; - int i, hval, off; - - /* XXX: create additional bitmask for selecting proper portgroup */ - for (i = 0; i < nh->pg_used; i++) { - pg = PORTGROUP_BYSIDX(cfg, nh, i + 1); - if (pg == NULL) - continue; - if (*aaddr == 0) - *aaddr = pg->aaddr; - if (pg->nat_proto != nat_proto) - continue; - - off = PG_GET_FREE_IDX(pg); - if (off != 0) { - /* We have found spare state. Use it */ - off--; - PG_MARK_BUSY_IDX(pg, off); - st = &pg->states[off]; - - /* - * Fill in new info. Assume state was zeroed. - * Timestamp and flags will be filled by caller. 
- */ - st->u.s = kst->u.s; - st->cur.idx = i + 1; - st->cur.off = off; - - /* Insert into host hash table */ - hval = HASH_IN4(&st->u.hkey) & (nh->hsize - 1); - st->next = nh->phash[hval]; - nh->phash[hval] = st->cur; - - nat64lsn_dump_state(cfg, pg, st, "ALLOC STATE", off); + struct pfloghdr loghdr, *logdata; + int ret; + uint16_t ts; - NAT64STAT_INC(&cfg->base.stats, screated); + /* Update timestamp and flags if needed */ + SET_AGE(ts); + if (state->timestamp != ts) + state->timestamp = ts; + if ((state->flags & flags) != 0) + state->flags |= flags; - return (st); - } - /* Saev last used alias affress */ - *aaddr = pg->aaddr; - } + if (cfg->base.flags & NAT64_LOG) { + logdata = &loghdr; + nat64lsn_log(logdata, *mp, AF_INET6, state); + } else + logdata = NULL; - return (NULL); + ret = nat64_do_handle_ip6(*mp, htonl(state->ip_src), + htons(state->aport), &cfg->base, logdata); + if (ret == NAT64SKIP) + return (cfg->nomatch_verdict); + if (ret == NAT64RETURN) + *mp = NULL; + return (IP_FW_DENY); } -static NAT64NOINLINE int +static int nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id, - struct mbuf **pm) + struct mbuf **mp) { - struct pfloghdr loghdr, *logdata; - char a[INET6_ADDRSTRLEN]; - struct nat64lsn_host *nh; - struct st_ptr sidx; - struct nat64lsn_state *st, kst; - struct nat64lsn_portgroup *pg; + struct nat64lsn_state *state; + struct nat64lsn_host *host; struct icmp6_hdr *icmp6; - uint32_t aaddr; - int action, hval, nat_proto, proto; - uint16_t aport, state_ts, state_flags; - - /* Check if af/protocol is supported and get it short id */ - nat_proto = nat64lsn_proto_map[f_id->proto]; - if (nat_proto == 0) { + uint32_t addr, hval, data[2]; + int offset, proto; + uint16_t port; + uint8_t flags; + + /* Check if protocol is supported */ + port = f_id->src_port; + proto = f_id->proto; + switch (f_id->proto) { + case IPPROTO_ICMPV6: /* - * Since we can be called from jobs handler, we need - * to free mbuf by self, do not leave this task to - 
* ipfw_check_packet(). + * For ICMPv6 echo reply/request we use icmp6_id as + * local port. */ + offset = 0; + proto = nat64_getlasthdr(*mp, &offset); + if (proto < 0) { + NAT64STAT_INC(&cfg->base.stats, dropped); + DPRINTF(DP_DROPS, "mbuf isn't contigious"); + return (IP_FW_DENY); + } + if (proto == IPPROTO_ICMPV6) { + icmp6 = mtodo(*mp, offset); + if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST || + icmp6->icmp6_type == ICMP6_ECHO_REPLY) + port = ntohs(icmp6->icmp6_id); + } + proto = IPPROTO_ICMP; + /* FALLTHROUGH */ + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: NAT64STAT_INC(&cfg->base.stats, noproto); - goto drop; + return (cfg->nomatch_verdict); } - /* Try to find host first */ - I6HASH_FIND(cfg, nh, &f_id->src_ip6); - - if (nh == NULL) - return (nat64lsn_request_host(cfg, f_id, pm)); - - /* Fill-in on-stack state structure */ - kst.u.s.faddr = nat64_extract_ip4(&f_id->dst_ip6, - cfg->base.plat_plen); - if (kst.u.s.faddr == 0 || - nat64_check_private_ip4(&cfg->base, kst.u.s.faddr) != 0) { - NAT64STAT_INC(&cfg->base.stats, dropped); - goto drop; - } - kst.u.s.fport = f_id->dst_port; - kst.u.s.lport = f_id->src_port; + /* Extract IPv4 from destination IPv6 address */ + addr = nat64_extract_ip4(&f_id->dst_ip6, cfg->base.plat_plen); + if (addr == 0 || nat64_check_private_ip4(&cfg->base, addr) != 0) { + char a[INET_ADDRSTRLEN]; - /* Prepare some fields we might need to update */ - hval = 0; - proto = nat64_getlasthdr(*pm, &hval); - if (proto < 0) { NAT64STAT_INC(&cfg->base.stats, dropped); - DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious"); - goto drop; + DPRINTF(DP_DROPS, "dropped due to embedded IPv4 address %s", + inet_ntop(AF_INET, &addr, a, sizeof(a))); + return (IP_FW_DENY); /* XXX: add extra stats? 
*/ } - SET_AGE(state_ts); - if (proto == IPPROTO_TCP) - state_flags = convert_tcp_flags( - TCP(mtodo(*pm, hval))->th_flags); - else - state_flags = 0; - if (proto == IPPROTO_ICMPV6) { - /* Alter local port data */ - icmp6 = mtodo(*pm, hval); - if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST || - icmp6->icmp6_type == ICMP6_ECHO_REPLY) - kst.u.s.lport = ntohs(icmp6->icmp6_id); - } - - hval = HASH_IN4(&kst.u.hkey) & (nh->hsize - 1); - pg = NULL; - st = NULL; - - /* OK, let's find state in host hash */ - NAT64_LOCK(nh); - sidx = nh->phash[hval]; - int k = 0; - while (sidx.idx != 0) { - pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx); - st = &pg->states[sidx.off]; - //DPRINTF("SISX: %d/%d next: %d/%d", sidx.idx, sidx.off, - //st->next.idx, st->next.off); - if (st->u.hkey == kst.u.hkey && pg->nat_proto == nat_proto) + /* Try to find host */ + hval = HOST_HVAL(cfg, &f_id->src_ip6); + CK_SLIST_FOREACH(host, &HOSTS(cfg, hval), entries) { + if (IN6_ARE_ADDR_EQUAL(&f_id->src_ip6, &host->addr)) break; - if (k++ > 1000) { - DPRINTF(DP_ALL, "XXX: too long %d/%d %d/%d\n", - sidx.idx, sidx.off, st->next.idx, st->next.off); - DPRINTF(DP_GENERIC, "TR host %s %p on cpu %d", - inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)), - nh, curcpu); - k = 0; - } - sidx = st->next; } - - if (sidx.idx == 0) { - aaddr = 0; - st = nat64lsn_create_state(cfg, nh, nat_proto, &kst, &aaddr); - if (st == NULL) { - /* No free states. 
Request more if we can */ - if (nh->pg_used >= cfg->max_chunks) { - /* Limit reached */ - DPRINTF(DP_DROPS, "PG limit reached " - " for host %s (used %u, allocated %u, " - "limit %u)", inet_ntop(AF_INET6, - &nh->addr, a, sizeof(a)), - nh->pg_used * NAT64_CHUNK_SIZE, - nh->pg_allocated * NAT64_CHUNK_SIZE, - cfg->max_chunks * NAT64_CHUNK_SIZE); - NAT64_UNLOCK(nh); - NAT64STAT_INC(&cfg->base.stats, dropped); - goto drop; - } - if ((nh->pg_allocated <= - nh->pg_used + NAT64LSN_REMAININGPG) && - nh->pg_allocated < cfg->max_chunks) - action = 1; /* Request new indexes */ - else - action = 0; - NAT64_UNLOCK(nh); - //DPRINTF("No state, unlock for %p", nh); - return (nat64lsn_request_portgroup(cfg, f_id, - pm, aaddr, action)); - } - - /* We've got new state. */ - sidx = st->cur; - pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx); - } - - /* Okay, state found */ - - /* Update necessary fileds */ - if (st->timestamp != state_ts) - st->timestamp = state_ts; - if ((st->flags & state_flags) != 0) - st->flags |= state_flags; - - /* Copy needed state data */ - aaddr = pg->aaddr; - aport = htons(pg->aport + sidx.off); - - NAT64_UNLOCK(nh); - - if (cfg->base.flags & NAT64_LOG) { - logdata = &loghdr; - nat64lsn_log(logdata, *pm, AF_INET6, pg->idx, st->cur.off); - } else - logdata = NULL; - - action = nat64_do_handle_ip6(*pm, aaddr, aport, &cfg->base, logdata); - if (action == NAT64SKIP) - return (cfg->nomatch_verdict); - if (action == NAT64MFREE) { -drop: - m_freem(*pm); - } - *pm = NULL; /* mark mbuf as consumed */ - return (IP_FW_DENY); + /* We use IPv4 address in host byte order */ + addr = ntohl(addr); + if (host == NULL) + return (nat64lsn_request_host(cfg, f_id, mp, + hval, addr, port, proto)); + + flags = proto != IPPROTO_TCP ? 
0 : convert_tcp_flags(f_id->_flags); + + data[0] = addr; + data[1] = (f_id->dst_port << 16) | port; + hval = STATE_HVAL(cfg, data); + state = nat64lsn_get_state6to4(cfg, host, f_id, hval, addr, + port, proto); + if (state == NULL) + return (nat64lsn_request_pg(cfg, host, f_id, mp, hval, addr, + port, proto)); + return (nat64lsn_translate6_internal(cfg, mp, state, flags)); } /* @@ -1614,49 +1516,61 @@ int ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { - ipfw_insn *icmd; + struct epoch_tracker et; struct nat64lsn_cfg *cfg; + ipfw_insn *icmd; int ret; IPFW_RLOCK_ASSERT(ch); - *done = 1; /* terminate the search */ + *done = 0; /* continue the search in case of failure */ icmd = cmd + 1; if (cmd->opcode != O_EXTERNAL_ACTION || cmd->arg1 != V_nat64lsn_eid || icmd->opcode != O_EXTERNAL_INSTANCE || (cfg = NAT64_LOOKUP(ch, icmd)) == NULL) - return (0); + return (IP_FW_DENY); + + *done = 1; /* terminate the search */ + NAT64LSN_EPOCH_ENTER(et); switch (args->f_id.addr_type) { case 4: ret = nat64lsn_translate4(cfg, &args->f_id, &args->m); break; case 6: + /* + * Check that destination IPv6 address matches our prefix6. 
+ */ + if ((cfg->base.flags & NAT64LSN_ANYPREFIX) == 0 && + memcmp(&args->f_id.dst_ip6, &cfg->base.plat_prefix, + cfg->base.plat_plen / 8) != 0) { + ret = cfg->nomatch_verdict; + break; + } ret = nat64lsn_translate6(cfg, &args->f_id, &args->m); break; default: - return (cfg->nomatch_verdict); + ret = cfg->nomatch_verdict; } - return (ret); -} - -static int -nat64lsn_ctor_host(void *mem, int size, void *arg, int flags) -{ - struct nat64lsn_host *nh; + NAT64LSN_EPOCH_EXIT(et); - nh = (struct nat64lsn_host *)mem; - memset(nh->pg_ptr, 0, sizeof(nh->pg_ptr)); - memset(nh->phash, 0, sizeof(nh->phash)); - return (0); + if (ret != IP_FW_PASS && args->m != NULL) { + m_freem(args->m); + args->m = NULL; + } + return (ret); } static int -nat64lsn_ctor_pgidx(void *mem, int size, void *arg, int flags) +nat64lsn_state_ctor(void *mem, int size, void *arg, int flags) { + struct nat64lsn_states_chunk *chunk; + int i; - memset(mem, 0, size); + chunk = (struct nat64lsn_states_chunk *)mem; + for (i = 0; i < 64; i++) + chunk->state[i].flags = 0; return (0); } @@ -1664,109 +1578,185 @@ void nat64lsn_init_internal(void) { - memset(nat64lsn_proto_map, 0, sizeof(nat64lsn_proto_map)); - /* Set up supported protocol map */ - nat64lsn_proto_map[IPPROTO_TCP] = NAT_PROTO_TCP; - nat64lsn_proto_map[IPPROTO_UDP] = NAT_PROTO_UDP; - nat64lsn_proto_map[IPPROTO_ICMP] = NAT_PROTO_ICMP; - nat64lsn_proto_map[IPPROTO_ICMPV6] = NAT_PROTO_ICMP; - /* Fill in reverse proto map */ - memset(nat64lsn_rproto_map, 0, sizeof(nat64lsn_rproto_map)); - nat64lsn_rproto_map[NAT_PROTO_TCP] = IPPROTO_TCP; - nat64lsn_rproto_map[NAT_PROTO_UDP] = IPPROTO_UDP; - nat64lsn_rproto_map[NAT_PROTO_ICMP] = IPPROTO_ICMPV6; + nat64lsn_epoch = epoch_alloc(EPOCH_PREEMPT); - JQUEUE_LOCK_INIT(); - nat64lsn_host_zone = uma_zcreate("NAT64 hosts zone", - sizeof(struct nat64lsn_host), nat64lsn_ctor_host, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); - nat64lsn_pg_zone = uma_zcreate("NAT64 portgroups zone", - sizeof(struct nat64lsn_portgroup), NULL, 
NULL, NULL, NULL, + nat64lsn_host_zone = uma_zcreate("NAT64LSN hosts", + sizeof(struct nat64lsn_host), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + nat64lsn_pgchunk_zone = uma_zcreate("NAT64LSN portgroup chunks", + sizeof(struct nat64lsn_pgchunk), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + nat64lsn_pg_zone = uma_zcreate("NAT64LSN portgroups", + sizeof(struct nat64lsn_pg), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + nat64lsn_aliaslink_zone = uma_zcreate("NAT64LSN links", + sizeof(struct nat64lsn_aliaslink), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - nat64lsn_pgidx_zone = uma_zcreate("NAT64 portgroup indexes zone", - sizeof(struct nat64lsn_portgroup *) * NAT64LSN_PGIDX_CHUNK, - nat64lsn_ctor_pgidx, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + nat64lsn_state_zone = uma_zcreate("NAT64LSN states", + sizeof(struct nat64lsn_states_chunk), nat64lsn_state_ctor, + NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + nat64lsn_job_zone = uma_zcreate("NAT64LSN jobs", + sizeof(struct nat64lsn_job_item), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + JQUEUE_LOCK_INIT(); } void nat64lsn_uninit_internal(void) { + /* XXX: epoch_task drain */ + epoch_free(nat64lsn_epoch); + JQUEUE_LOCK_DESTROY(); uma_zdestroy(nat64lsn_host_zone); + uma_zdestroy(nat64lsn_pgchunk_zone); uma_zdestroy(nat64lsn_pg_zone); - uma_zdestroy(nat64lsn_pgidx_zone); + uma_zdestroy(nat64lsn_aliaslink_zone); + uma_zdestroy(nat64lsn_state_zone); + uma_zdestroy(nat64lsn_job_zone); } void nat64lsn_start_instance(struct nat64lsn_cfg *cfg) { + CALLOUT_LOCK(cfg); callout_reset(&cfg->periodic, hz * PERIODIC_DELAY, nat64lsn_periodic, cfg); + CALLOUT_UNLOCK(cfg); } struct nat64lsn_cfg * -nat64lsn_init_instance(struct ip_fw_chain *ch, size_t numaddr) +nat64lsn_init_instance(struct ip_fw_chain *ch, in_addr_t prefix, int plen) { struct nat64lsn_cfg *cfg; + struct nat64lsn_alias *alias; + int i, naddr; + + cfg = malloc(sizeof(struct nat64lsn_cfg), M_NAT64LSN, + M_WAITOK | M_ZERO); - cfg = malloc(sizeof(struct nat64lsn_cfg), M_IPFW, 
M_WAITOK | M_ZERO); - TAILQ_INIT(&cfg->jhead); + CFG_LOCK_INIT(cfg); + CALLOUT_LOCK_INIT(cfg); + STAILQ_INIT(&cfg->jhead); cfg->vp = curvnet; - cfg->ch = ch; COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK); - cfg->ihsize = NAT64LSN_HSIZE; - cfg->ih = malloc(sizeof(void *) * cfg->ihsize, M_IPFW, - M_WAITOK | M_ZERO); - - cfg->pg = malloc(sizeof(void *) * numaddr * _ADDR_PG_COUNT, M_IPFW, - M_WAITOK | M_ZERO); + cfg->hash_seed = arc4random(); + cfg->hosts_hashsize = NAT64LSN_HOSTS_HSIZE; + cfg->hosts_hash = malloc(sizeof(struct nat64lsn_hosts_slist) * + cfg->hosts_hashsize, M_NAT64LSN, M_WAITOK | M_ZERO); + for (i = 0; i < cfg->hosts_hashsize; i++) + CK_SLIST_INIT(&cfg->hosts_hash[i]); + + naddr = 1 << (32 - plen); + cfg->prefix4 = prefix; + cfg->pmask4 = prefix | (naddr - 1); + cfg->plen4 = plen; + cfg->aliases = malloc(sizeof(struct nat64lsn_alias) * naddr, + M_NAT64LSN, M_WAITOK | M_ZERO); + for (i = 0; i < naddr; i++) { + alias = &cfg->aliases[i]; + alias->addr = prefix + i; /* host byte order */ + CK_SLIST_INIT(&alias->hosts); + ALIAS_LOCK_INIT(alias); + } - callout_init(&cfg->periodic, CALLOUT_MPSAFE); + callout_init_mtx(&cfg->periodic, &cfg->periodic_lock, 0); callout_init(&cfg->jcallout, CALLOUT_MPSAFE); return (cfg); } -/* - * Destroy all hosts callback. - * Called on module unload when all activity already finished, so - * can work without any locks. 
- */ -static NAT64NOINLINE int -nat64lsn_destroy_host(struct nat64lsn_host *nh, struct nat64lsn_cfg *cfg) +static void +nat64lsn_destroy_pg(struct nat64lsn_pg *pg) { - struct nat64lsn_portgroup *pg; int i; - for (i = nh->pg_used; i > 0; i--) { - pg = PORTGROUP_BYSIDX(cfg, nh, i); - if (pg == NULL) - continue; - cfg->pg[pg->idx] = NULL; - destroy_portgroup(pg); - nh->pg_used--; + if (pg->chunks_count == 1) { + uma_zfree(nat64lsn_state_zone, pg->states); + } else { + for (i = 0; i < pg->chunks_count; i++) + uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]); + free(pg->states_chunk, M_NAT64LSN); + free(pg->freemask_chunk, M_NAT64LSN); } - destroy_host6(nh); - cfg->ihcount--; - return (0); + uma_zfree(nat64lsn_pg_zone, pg); +} + +static void +nat64lsn_destroy_alias(struct nat64lsn_cfg *cfg, + struct nat64lsn_alias *alias) +{ + struct nat64lsn_pg *pg; + int i; + + while (!CK_SLIST_EMPTY(&alias->portgroups)) { + pg = CK_SLIST_FIRST(&alias->portgroups); + CK_SLIST_REMOVE_HEAD(&alias->portgroups, entries); + nat64lsn_destroy_pg(pg); + } + for (i = 0; i < 32; i++) { + if (ISSET32(alias->tcp_chunkmask, i)) + uma_zfree(nat64lsn_pgchunk_zone, alias->tcp[i]); + if (ISSET32(alias->udp_chunkmask, i)) + uma_zfree(nat64lsn_pgchunk_zone, alias->udp[i]); + if (ISSET32(alias->icmp_chunkmask, i)) + uma_zfree(nat64lsn_pgchunk_zone, alias->icmp[i]); + } + ALIAS_LOCK_DESTROY(alias); +} + +static void +nat64lsn_destroy_host(struct nat64lsn_host *host) +{ + struct nat64lsn_aliaslink *link; + + while (!CK_SLIST_EMPTY(&host->aliases)) { + link = CK_SLIST_FIRST(&host->aliases); + CK_SLIST_REMOVE_HEAD(&host->aliases, host_entries); + + ALIAS_LOCK(link->alias); + CK_SLIST_REMOVE(&link->alias->hosts, link, + nat64lsn_aliaslink, alias_entries); + link->alias->hosts_count--; + ALIAS_UNLOCK(link->alias); + + uma_zfree(nat64lsn_aliaslink_zone, link); + } + HOST_LOCK_DESTROY(host); + free(host->states_hash, M_NAT64LSN); + uma_zfree(nat64lsn_host_zone, host); } void nat64lsn_destroy_instance(struct 
nat64lsn_cfg *cfg) { - struct nat64lsn_host *nh, *tmp; + struct nat64lsn_host *host; + int i; - callout_drain(&cfg->jcallout); + CALLOUT_LOCK(cfg); callout_drain(&cfg->periodic); - I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_destroy_host, cfg); - DPRINTF(DP_OBJ, "instance %s: hosts %d", cfg->name, cfg->ihcount); + CALLOUT_UNLOCK(cfg); + callout_drain(&cfg->jcallout); + + for (i = 0; i < cfg->hosts_hashsize; i++) { + while (!CK_SLIST_EMPTY(&cfg->hosts_hash[i])) { + host = CK_SLIST_FIRST(&cfg->hosts_hash[i]); + CK_SLIST_REMOVE_HEAD(&cfg->hosts_hash[i], entries); + nat64lsn_destroy_host(host); + } + } + + for (i = 0; i < (1 << (32 - cfg->plen4)); i++) + nat64lsn_destroy_alias(cfg, &cfg->aliases[i]); + CALLOUT_LOCK_DESTROY(cfg); + CFG_LOCK_DESTROY(cfg); COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS); - free(cfg->ih, M_IPFW); - free(cfg->pg, M_IPFW); - free(cfg, M_IPFW); + free(cfg->hosts_hash, M_NAT64LSN); + free(cfg->aliases, M_NAT64LSN); + free(cfg, M_NAT64LSN); } diff --git a/sys/netpfil/ipfw/nat64/nat64lsn.h b/sys/netpfil/ipfw/nat64/nat64lsn.h index 44036cb3efcb..797876b229c2 100644 --- a/sys/netpfil/ipfw/nat64/nat64lsn.h +++ b/sys/netpfil/ipfw/nat64/nat64lsn.h @@ -35,75 +35,149 @@ #include "ip_fw_nat64.h" #include "nat64_translate.h" -#define NAT64_CHUNK_SIZE_BITS 6 /* 64 ports */ -#define NAT64_CHUNK_SIZE (1 << NAT64_CHUNK_SIZE_BITS) - #define NAT64_MIN_PORT 1024 -#define NAT64_MIN_CHUNK (NAT64_MIN_PORT >> NAT64_CHUNK_SIZE_BITS) +struct nat64lsn_host; +struct nat64lsn_alias; -struct st_ptr { - uint8_t idx; /* index in nh->pg_ptr array. - * NOTE: it starts from 1. 
- */ - uint8_t off; +struct nat64lsn_state { + /* IPv6 host entry keeps hash table to speedup state lookup */ + CK_SLIST_ENTRY(nat64lsn_state) entries; + struct nat64lsn_host *host; + + struct in6_addr ip6_dst; /* Destination IPv6 address */ + + in_addr_t ip_src; /* Alias IPv4 address */ + in_addr_t ip_dst; /* Destination IPv4 address */ + uint16_t dport; /* Destination port */ + uint16_t sport; /* Source port */ + + uint32_t hval; + uint32_t flags; /* Internal flags */ + uint16_t aport; + uint16_t timestamp; /* last used */ + uint8_t proto; + uint8_t _spare[7]; }; -#define NAT64LSN_MAXPGPTR ((1 << (sizeof(uint8_t) * NBBY)) - 1) -#define NAT64LSN_PGPTRMASKBITS (sizeof(uint64_t) * NBBY) -#define NAT64LSN_PGPTRNMASK (roundup(NAT64LSN_MAXPGPTR, \ - NAT64LSN_PGPTRMASKBITS) / NAT64LSN_PGPTRMASKBITS) -struct nat64lsn_portgroup; -/* sizeof(struct nat64lsn_host) = 64 + 64x2 + 8x8 = 256 bytes */ -struct nat64lsn_host { - struct rwlock h_lock; /* Host states lock */ - - struct in6_addr addr; - struct nat64lsn_host *next; - uint16_t timestamp; /* Last altered */ - uint16_t hsize; /* ports hash size */ - uint16_t pg_used; /* Number of portgroups used */ -#define NAT64LSN_REMAININGPG 8 /* Number of remaining PG before - * requesting of new chunk of indexes. - */ - uint16_t pg_allocated; /* Number of portgroups indexes - * allocated. - */ -#define NAT64LSN_HSIZE 64 - struct st_ptr phash[NAT64LSN_HSIZE]; /* XXX: hardcoded size */ - /* - * PG indexes are stored in chunks with 32 elements. - * The maximum count is limited to 255 due to st_ptr->idx is uint8_t. 
- */ -#define NAT64LSN_PGIDX_CHUNK 32 -#define NAT64LSN_PGNIDX (roundup(NAT64LSN_MAXPGPTR, \ - NAT64LSN_PGIDX_CHUNK) / NAT64LSN_PGIDX_CHUNK) - struct nat64lsn_portgroup **pg_ptr[NAT64LSN_PGNIDX]; /* PG indexes */ +struct nat64lsn_states_chunk { + struct nat64lsn_state state[64]; +}; + +#define ISSET64(mask, bit) ((mask) & ((uint64_t)1 << (bit))) +#define ISSET32(mask, bit) ((mask) & ((uint32_t)1 << (bit))) +struct nat64lsn_pg { + CK_SLIST_ENTRY(nat64lsn_pg) entries; + + uint16_t base_port; + uint16_t timestamp; + uint8_t proto; + uint8_t chunks_count; + uint8_t spare[2]; + + union { + uint64_t freemask64; + uint32_t freemask32[2]; + uint64_t *freemask64_chunk; + uint32_t *freemask32_chunk; + void *freemask_chunk; + }; + union { + struct nat64lsn_states_chunk *states; + struct nat64lsn_states_chunk **states_chunk; + }; +}; + +#define CHUNK_BY_FADDR(p, a) ((a) & ((p)->chunks_count - 1)) + +#ifdef __LP64__ +#define FREEMASK_CHUNK(p, v) \ + ((p)->chunks_count == 1 ? &(p)->freemask64 : \ + &(p)->freemask64_chunk[CHUNK_BY_FADDR(p, v)]) +#define FREEMASK_BITCOUNT(pg, faddr) \ + bitcount64(*FREEMASK_CHUNK((pg), (faddr))) +#else +#define FREEMASK_CHUNK(p, v) \ + ((p)->chunks_count == 1 ? 
&(p)->freemask32[0] : \ + &(p)->freemask32_chunk[CHUNK_BY_FADDR(p, v) * 2]) +#define FREEMASK_BITCOUNT(pg, faddr) \ + bitcount64(*(uint64_t *)FREEMASK_CHUNK((pg), (faddr))) +#endif /* !__LP64__ */ + +struct nat64lsn_pgchunk { + struct nat64lsn_pg *pgptr[32]; }; -#define NAT64_RLOCK_ASSERT(h) rw_assert(&(h)->h_lock, RA_RLOCKED) -#define NAT64_WLOCK_ASSERT(h) rw_assert(&(h)->h_lock, RA_WLOCKED) +struct nat64lsn_aliaslink { + CK_SLIST_ENTRY(nat64lsn_aliaslink) alias_entries; + CK_SLIST_ENTRY(nat64lsn_aliaslink) host_entries; + struct nat64lsn_alias *alias; +}; -#define NAT64_RLOCK(h) rw_rlock(&(h)->h_lock) -#define NAT64_RUNLOCK(h) rw_runlock(&(h)->h_lock) -#define NAT64_WLOCK(h) rw_wlock(&(h)->h_lock) -#define NAT64_WUNLOCK(h) rw_wunlock(&(h)->h_lock) -#define NAT64_LOCK(h) NAT64_WLOCK(h) -#define NAT64_UNLOCK(h) NAT64_WUNLOCK(h) -#define NAT64_LOCK_INIT(h) do { \ - rw_init(&(h)->h_lock, "NAT64 host lock"); \ - } while (0) +CK_SLIST_HEAD(nat64lsn_aliaslink_slist, nat64lsn_aliaslink); +CK_SLIST_HEAD(nat64lsn_states_slist, nat64lsn_state); +CK_SLIST_HEAD(nat64lsn_hosts_slist, nat64lsn_host); +CK_SLIST_HEAD(nat64lsn_pg_slist, nat64lsn_pg); + +struct nat64lsn_alias { + struct nat64lsn_aliaslink_slist hosts; + struct nat64lsn_pg_slist portgroups; + + struct mtx lock; + in_addr_t addr; /* host byte order */ + uint32_t hosts_count; + uint32_t portgroups_count; + uint32_t tcp_chunkmask; + uint32_t udp_chunkmask; + uint32_t icmp_chunkmask; + + uint32_t tcp_pgidx; + uint32_t udp_pgidx; + uint32_t icmp_pgidx; + uint16_t timestamp; + uint16_t spare; + + uint32_t tcp_pgmask[32]; + uint32_t udp_pgmask[32]; + uint32_t icmp_pgmask[32]; + struct nat64lsn_pgchunk *tcp[32]; + struct nat64lsn_pgchunk *udp[32]; + struct nat64lsn_pgchunk *icmp[32]; + + /* pointer to PG that can be used for faster state allocation */ + struct nat64lsn_pg *tcp_pg; + struct nat64lsn_pg *udp_pg; + struct nat64lsn_pg *icmp_pg; +}; +#define ALIAS_LOCK_INIT(p) \ + mtx_init(&(p)->lock, "alias_lock", NULL, 
MTX_DEF) +#define ALIAS_LOCK_DESTROY(p) mtx_destroy(&(p)->lock) +#define ALIAS_LOCK(p) mtx_lock(&(p)->lock) +#define ALIAS_UNLOCK(p) mtx_unlock(&(p)->lock) -#define NAT64_LOCK_DESTROY(h) do { \ - rw_destroy(&(h)->h_lock); \ - } while (0) +#define NAT64LSN_HSIZE 256 +#define NAT64LSN_MAX_HSIZE 4096 +#define NAT64LSN_HOSTS_HSIZE 1024 -/* Internal proto index */ -#define NAT_PROTO_TCP 1 -#define NAT_PROTO_UDP 2 -#define NAT_PROTO_ICMP 3 +struct nat64lsn_host { + struct in6_addr addr; + struct nat64lsn_aliaslink_slist aliases; + struct nat64lsn_states_slist *states_hash; + CK_SLIST_ENTRY(nat64lsn_host) entries; + uint32_t states_count; + uint32_t hval; + uint32_t flags; +#define NAT64LSN_DEADHOST 1 +#define NAT64LSN_GROWHASH 2 + uint16_t states_hashsize; + uint16_t timestamp; + struct mtx lock; +}; -#define NAT_MAX_PROTO 4 -extern uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO]; +#define HOST_LOCK_INIT(p) \ + mtx_init(&(p)->lock, "host_lock", NULL, MTX_DEF|MTX_NEW) +#define HOST_LOCK_DESTROY(p) mtx_destroy(&(p)->lock) +#define HOST_LOCK(p) mtx_lock(&(p)->lock) +#define HOST_UNLOCK(p) mtx_unlock(&(p)->lock) VNET_DECLARE(uint16_t, nat64lsn_eid); #define V_nat64lsn_eid VNET(nat64lsn_eid) @@ -112,124 +186,65 @@ VNET_DECLARE(uint16_t, nat64lsn_eid); /* Timestamp macro */ #define _CT ((int)time_uptime % 65536) #define SET_AGE(x) (x) = _CT -#define GET_AGE(x) ((_CT >= (x)) ? _CT - (x) : \ - (int)65536 + _CT - (x)) +#define GET_AGE(x) ((_CT >= (x)) ? 
_CT - (x): (int)65536 + _CT - (x)) -#ifdef __LP64__ -/* ffsl() is capable of checking 64-bit ints */ -#define _FFS64 -#endif - -/* 16 bytes */ -struct nat64lsn_state { - union { - struct { - in_addr_t faddr; /* Remote IPv4 address */ - uint16_t fport; /* Remote IPv4 port */ - uint16_t lport; /* Local IPv6 port */ - }s; - uint64_t hkey; - } u; - uint8_t nat_proto; - uint8_t flags; - uint16_t timestamp; - struct st_ptr cur; /* Index of portgroup in nat64lsn_host */ - struct st_ptr next; /* Next entry index */ -}; - -/* - * 1024+32 bytes per 64 states, used to store state - * AND for outside-in state lookup - */ -struct nat64lsn_portgroup { - struct nat64lsn_host *host; /* IPv6 source host info */ - in_addr_t aaddr; /* Alias addr, network format */ - uint16_t aport; /* Base port */ - uint16_t timestamp; - uint8_t nat_proto; - uint8_t spare[3]; - uint32_t idx; -#ifdef _FFS64 - uint64_t freemask; /* Mask of free entries */ -#else - uint32_t freemask[2]; /* Mask of free entries */ -#endif - struct nat64lsn_state states[NAT64_CHUNK_SIZE]; /* State storage */ -}; -#ifdef _FFS64 -#define PG_MARK_BUSY_IDX(_pg, _idx) (_pg)->freemask &= ~((uint64_t)1<<(_idx)) -#define PG_MARK_FREE_IDX(_pg, _idx) (_pg)->freemask |= ((uint64_t)1<<(_idx)) -#define PG_IS_FREE_IDX(_pg, _idx) ((_pg)->freemask & ((uint64_t)1<<(_idx))) -#define PG_IS_BUSY_IDX(_pg, _idx) (PG_IS_FREE_IDX(_pg, _idx) == 0) -#define PG_GET_FREE_IDX(_pg) (ffsll((_pg)->freemask)) -#define PG_IS_EMPTY(_pg) (((_pg)->freemask + 1) == 0) -#else -#define PG_MARK_BUSY_IDX(_pg, _idx) \ - (_pg)->freemask[(_idx) / 32] &= ~((u_long)1<<((_idx) % 32)) -#define PG_MARK_FREE_IDX(_pg, _idx) \ - (_pg)->freemask[(_idx) / 32] |= ((u_long)1<<((_idx) % 32)) -#define PG_IS_FREE_IDX(_pg, _idx) \ - ((_pg)->freemask[(_idx) / 32] & ((u_long)1<<((_idx) % 32))) -#define PG_IS_BUSY_IDX(_pg, _idx) (PG_IS_FREE_IDX(_pg, _idx) == 0) -#define PG_GET_FREE_IDX(_pg) _pg_get_free_idx(_pg) -#define PG_IS_EMPTY(_pg) \ - ((((_pg)->freemask[0] + 1) == 0 && 
((_pg)->freemask[1] + 1) == 0)) - -static inline int -_pg_get_free_idx(const struct nat64lsn_portgroup *pg) -{ - int i; - - if ((i = ffsl(pg->freemask[0])) != 0) - return (i); - if ((i = ffsl(pg->freemask[1])) != 0) - return (i + 32); - return (0); -} - -#endif - -TAILQ_HEAD(nat64lsn_job_head, nat64lsn_job_item); +STAILQ_HEAD(nat64lsn_job_head, nat64lsn_job_item); struct nat64lsn_cfg { struct named_object no; - struct nat64lsn_portgroup **pg; /* XXX: array of pointers */ - struct nat64lsn_host **ih; /* Host hash */ + + struct nat64lsn_hosts_slist *hosts_hash; + struct nat64lsn_alias *aliases; /* array of aliases */ + + struct mtx lock; + uint32_t hosts_hashsize; + uint32_t hash_seed; + uint32_t prefix4; /* IPv4 prefix */ uint32_t pmask4; /* IPv4 prefix mask */ - uint32_t ihsize; /* IPv6 host hash size */ uint8_t plen4; - uint8_t nomatch_verdict;/* What to return to ipfw on no-match */ + uint8_t nomatch_verdict;/* Return value on no-match */ - uint32_t ihcount; /* Number of items in host hash */ - int max_chunks; /* Max chunks per client */ - int agg_prefix_len; /* Prefix length to count */ - int agg_prefix_max; /* Max hosts per agg prefix */ + uint32_t hosts_count; /* Number of items in host hash */ + uint32_t states_chunks; /* Number of states chunks per PG */ uint32_t jmaxlen; /* Max jobqueue length */ - uint16_t min_chunk; /* Min port group # to use */ - uint16_t max_chunk; /* Max port group # to use */ - uint16_t nh_delete_delay; /* Stale host delete delay */ + uint16_t host_delete_delay; /* Stale host delete delay */ + uint16_t pgchunk_delete_delay; uint16_t pg_delete_delay; /* Stale portgroup del delay */ uint16_t st_syn_ttl; /* TCP syn expire */ uint16_t st_close_ttl; /* TCP fin expire */ uint16_t st_estab_ttl; /* TCP established expire */ uint16_t st_udp_ttl; /* UDP expire */ uint16_t st_icmp_ttl; /* ICMP expire */ - uint32_t protochunks[NAT_MAX_PROTO];/* Number of chunks used */ + struct nat64_config base; #define NAT64LSN_FLAGSMASK (NAT64_LOG | 
NAT64_ALLOW_PRIVATE) +#define NAT64LSN_ANYPREFIX 0x00000100 + struct mtx periodic_lock; struct callout periodic; struct callout jcallout; - struct ip_fw_chain *ch; struct vnet *vp; struct nat64lsn_job_head jhead; int jlen; char name[64]; /* Nat instance name */ }; +/* CFG_LOCK protects cfg->hosts_hash from modification */ +#define CFG_LOCK_INIT(p) \ + mtx_init(&(p)->lock, "cfg_lock", NULL, MTX_DEF) +#define CFG_LOCK_DESTROY(p) mtx_destroy(&(p)->lock) +#define CFG_LOCK(p) mtx_lock(&(p)->lock) +#define CFG_UNLOCK(p) mtx_unlock(&(p)->lock) + +#define CALLOUT_LOCK_INIT(p) \ + mtx_init(&(p)->periodic_lock, "periodic_lock", NULL, MTX_DEF) +#define CALLOUT_LOCK_DESTROY(p) mtx_destroy(&(p)->periodic_lock) +#define CALLOUT_LOCK(p) mtx_lock(&(p)->periodic_lock) +#define CALLOUT_UNLOCK(p) mtx_unlock(&(p)->periodic_lock) + struct nat64lsn_cfg *nat64lsn_init_instance(struct ip_fw_chain *ch, - size_t numaddr); + in_addr_t prefix, int plen); void nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg); void nat64lsn_start_instance(struct nat64lsn_cfg *cfg); void nat64lsn_init_internal(void); @@ -237,114 +252,4 @@ void nat64lsn_uninit_internal(void); int ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done); -void -nat64lsn_dump_state(const struct nat64lsn_cfg *cfg, - const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st, - const char *px, int off); -/* - * Portgroup layout - * addr x nat_proto x port_off - * - */ - -#define _ADDR_PG_PROTO_COUNT (65536 >> NAT64_CHUNK_SIZE_BITS) -#define _ADDR_PG_COUNT (_ADDR_PG_PROTO_COUNT * NAT_MAX_PROTO) - -#define GET_ADDR_IDX(_cfg, _addr) ((_addr) - ((_cfg)->prefix4)) -#define __GET_PORTGROUP_IDX(_proto, _port) \ - ((_proto - 1) * _ADDR_PG_PROTO_COUNT + \ - ((_port) >> NAT64_CHUNK_SIZE_BITS)) - -#define _GET_PORTGROUP_IDX(_cfg, _addr, _proto, _port) \ - GET_ADDR_IDX(_cfg, _addr) * _ADDR_PG_COUNT + \ - __GET_PORTGROUP_IDX(_proto, _port) -#define GET_PORTGROUP(_cfg, _addr, _proto, _port) \ - 
((_cfg)->pg[_GET_PORTGROUP_IDX(_cfg, _addr, _proto, _port)]) - -#define PORTGROUP_CHUNK(_nh, _idx) \ - ((_nh)->pg_ptr[(_idx)]) -#define PORTGROUP_BYSIDX(_cfg, _nh, _idx) \ - (PORTGROUP_CHUNK(_nh, (_idx - 1) / NAT64LSN_PGIDX_CHUNK) \ - [((_idx) - 1) % NAT64LSN_PGIDX_CHUNK]) - - -/* Chained hash table */ -#define CHT_FIND(_ph, _hsize, _PX, _x, _key) do { \ - unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \ - _PX##lock(_ph, _buck); \ - _x = _PX##first(_ph, _buck); \ - for ( ; _x != NULL; _x = _PX##next(_x)) { \ - if (_PX##cmp(_key, _PX##val(_x))) \ - break; \ - } \ - if (_x == NULL) \ - _PX##unlock(_ph, _buck); \ -} while(0) - -#define CHT_UNLOCK_BUCK(_ph, _PX, _buck) \ - _PX##unlock(_ph, _buck); - -#define CHT_UNLOCK_KEY(_ph, _hsize, _PX, _key) do { \ - unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \ - _PX##unlock(_ph, _buck); \ -} while(0) - -#define CHT_INSERT_HEAD(_ph, _hsize, _PX, _i) do { \ - unsigned int _buck = _PX##hash(_PX##val(_i)) & (_hsize - 1); \ - _PX##lock(_ph, _buck); \ - _PX##next(_i) = _PX##first(_ph, _buck); \ - _PX##first(_ph, _buck) = _i; \ - _PX##unlock(_ph, _buck); \ -} while(0) - -#define CHT_REMOVE(_ph, _hsize, _PX, _x, _tmp, _key) do { \ - unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \ - _PX##lock(_ph, _buck); \ - _x = _PX##first(_ph, _buck); \ - _tmp = NULL; \ - for ( ; _x != NULL; _tmp = _x, _x = _PX##next(_x)) { \ - if (_PX##cmp(_key, _PX##val(_x))) \ - break; \ - } \ - if (_x != NULL) { \ - if (_tmp == NULL) \ - _PX##first(_ph, _buck) = _PX##next(_x); \ - else \ - _PX##next(_tmp) = _PX##next(_x); \ - } \ - _PX##unlock(_ph, _buck); \ -} while(0) - -#define CHT_FOREACH_SAFE(_ph, _hsize, _PX, _x, _tmp, _cb, _arg) do { \ - for (unsigned int _i = 0; _i < _hsize; _i++) { \ - _PX##lock(_ph, _i); \ - _x = _PX##first(_ph, _i); \ - _tmp = NULL; \ - for (; _x != NULL; _tmp = _x, _x = _PX##next(_x)) { \ - if (_cb(_x, _arg) == 0) \ - continue; \ - if (_tmp == NULL) \ - _PX##first(_ph, _i) = _PX##next(_x); \ - else \ - _tmp = 
_PX##next(_x); \ - } \ - _PX##unlock(_ph, _i); \ - } \ -} while(0) - -#define CHT_RESIZE(_ph, _hsize, _nph, _nhsize, _PX, _x, _y) do { \ - unsigned int _buck; \ - for (unsigned int _i = 0; _i < _hsize; _i++) { \ - _x = _PX##first(_ph, _i); \ - _y = _x; \ - while (_y != NULL) { \ - _buck = _PX##hash(_PX##val(_x)) & (_nhsize - 1);\ - _y = _PX##next(_x); \ - _PX##next(_x) = _PX##first(_nph, _buck); \ - _PX##first(_nph, _buck) = _x; \ - } \ - } \ -} while(0) - #endif /* _IP_FW_NAT64LSN_H_ */ - diff --git a/sys/netpfil/ipfw/nat64/nat64lsn_control.c b/sys/netpfil/ipfw/nat64/nat64lsn_control.c index 6bb48d29e382..65481a88d64e 100644 --- a/sys/netpfil/ipfw/nat64/nat64lsn_control.c +++ b/sys/netpfil/ipfw/nat64/nat64lsn_control.c @@ -33,6 +33,8 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/counter.h> +#include <sys/ck.h> +#include <sys/epoch.h> #include <sys/errno.h> #include <sys/kernel.h> #include <sys/lock.h> @@ -43,10 +45,8 @@ __FBSDID("$FreeBSD$"); #include <sys/rwlock.h> #include <sys/socket.h> #include <sys/sockopt.h> -#include <sys/queue.h> #include <net/if.h> -#include <net/pfil.h> #include <netinet/in.h> #include <netinet/ip.h> @@ -75,12 +75,6 @@ static void nat64lsn_default_config(ipfw_nat64lsn_cfg *uc) { - if (uc->max_ports == 0) - uc->max_ports = NAT64LSN_MAX_PORTS; - else - uc->max_ports = roundup(uc->max_ports, NAT64_CHUNK_SIZE); - if (uc->max_ports > NAT64_CHUNK_SIZE * NAT64LSN_MAXPGPTR) - uc->max_ports = NAT64_CHUNK_SIZE * NAT64LSN_MAXPGPTR; if (uc->jmaxlen == 0) uc->jmaxlen = NAT64LSN_JMAXLEN; if (uc->jmaxlen > 65536) @@ -99,6 +93,13 @@ nat64lsn_default_config(ipfw_nat64lsn_cfg *uc) uc->st_udp_ttl = NAT64LSN_UDP_AGE; if (uc->st_icmp_ttl == 0) uc->st_icmp_ttl = NAT64LSN_ICMP_AGE; + + if (uc->states_chunks == 0) + uc->states_chunks = 1; + else if (uc->states_chunks >= 128) + uc->states_chunks = 128; + else if (!powerof2(uc->states_chunks)) + uc->states_chunks = 1 << fls(uc->states_chunks); } /* @@ -127,12 +128,20 @@ 
nat64lsn_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, if (ipfw_check_object_name_generic(uc->name) != 0) return (EINVAL); - if (uc->agg_prefix_len > 127 || uc->set >= IPFW_MAX_SETS) + if (uc->set >= IPFW_MAX_SETS) return (EINVAL); if (uc->plen4 > 32) return (EINVAL); - if (nat64_check_prefix6(&uc->prefix6, uc->plen6) != 0) + + /* + * Unspecified address has special meaning. But it must + * have valid prefix length. This length will be used to + * correctly extract and embed IPv4 address into IPv6. + */ + if (nat64_check_prefix6(&uc->prefix6, uc->plen6) != 0 && + IN6_IS_ADDR_UNSPECIFIED(&uc->prefix6) && + nat64_check_prefixlen(uc->plen6) != 0) return (EINVAL); /* XXX: Check prefix4 to be global */ @@ -140,14 +149,6 @@ nat64lsn_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, mask4 = ~((1 << (32 - uc->plen4)) - 1); if ((addr4 & mask4) != addr4) return (EINVAL); - if (uc->min_port == 0) - uc->min_port = NAT64_MIN_PORT; - if (uc->max_port == 0) - uc->max_port = 65535; - if (uc->min_port > uc->max_port) - return (EINVAL); - uc->min_port = roundup(uc->min_port, NAT64_CHUNK_SIZE); - uc->max_port = roundup(uc->max_port, NAT64_CHUNK_SIZE); nat64lsn_default_config(uc); @@ -159,7 +160,7 @@ nat64lsn_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, } IPFW_UH_RUNLOCK(ch); - cfg = nat64lsn_init_instance(ch, 1 << (32 - uc->plen4)); + cfg = nat64lsn_init_instance(ch, addr4, uc->plen4); strlcpy(cfg->name, uc->name, sizeof(cfg->name)); cfg->no.name = cfg->name; cfg->no.etlv = IPFW_TLV_NAT64LSN_NAME; @@ -170,20 +171,12 @@ nat64lsn_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, cfg->base.flags = (uc->flags & NAT64LSN_FLAGSMASK) | NAT64_PLATPFX; if (IN6_IS_ADDR_WKPFX(&cfg->base.plat_prefix)) cfg->base.flags |= NAT64_WKPFX; + else if (IN6_IS_ADDR_UNSPECIFIED(&cfg->base.plat_prefix)) + cfg->base.flags |= NAT64LSN_ANYPREFIX; - cfg->prefix4 = addr4; - cfg->pmask4 = addr4 | ~mask4; - cfg->plen4 = uc->plen4; - - cfg->max_chunks = uc->max_ports / NAT64_CHUNK_SIZE; - 
cfg->agg_prefix_len = uc->agg_prefix_len; - cfg->agg_prefix_max = uc->agg_prefix_max; - - cfg->min_chunk = uc->min_port / NAT64_CHUNK_SIZE; - cfg->max_chunk = uc->max_port / NAT64_CHUNK_SIZE; - + cfg->states_chunks = uc->states_chunks; cfg->jmaxlen = uc->jmaxlen; - cfg->nh_delete_delay = uc->nh_delete_delay; + cfg->host_delete_delay = uc->nh_delete_delay; cfg->pg_delete_delay = uc->pg_delete_delay; cfg->st_syn_ttl = uc->st_syn_ttl; cfg->st_close_ttl = uc->st_close_ttl; @@ -249,7 +242,7 @@ nat64lsn_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3, cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); - return (ESRCH); + return (ENOENT); } if (cfg->no.refcnt > 0) { @@ -272,6 +265,8 @@ static void export_stats(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg, struct ipfw_nat64lsn_stats *stats) { + struct nat64lsn_alias *alias; + int i, j; __COPY_STAT_FIELD(cfg, stats, opcnt64); __COPY_STAT_FIELD(cfg, stats, opcnt46); @@ -299,10 +294,16 @@ export_stats(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg, __COPY_STAT_FIELD(cfg, stats, spgcreated); __COPY_STAT_FIELD(cfg, stats, spgdeleted); - stats->hostcount = cfg->ihcount; - stats->tcpchunks = cfg->protochunks[NAT_PROTO_TCP]; - stats->udpchunks = cfg->protochunks[NAT_PROTO_UDP]; - stats->icmpchunks = cfg->protochunks[NAT_PROTO_ICMP]; + stats->hostcount = cfg->hosts_count; + for (i = 0; i < (1 << (32 - cfg->plen4)); i++) { + alias = &cfg->aliases[i]; + for (j = 0; j < 32 && ISSET32(alias->tcp_chunkmask, j); j++) + stats->tcpchunks += bitcount32(alias->tcp_pgmask[j]); + for (j = 0; j < 32 && ISSET32(alias->udp_chunkmask, j); j++) + stats->udpchunks += bitcount32(alias->udp_pgmask[j]); + for (j = 0; j < 32 && ISSET32(alias->icmp_chunkmask, j); j++) + stats->icmpchunks += bitcount32(alias->icmp_pgmask[j]); + } } #undef __COPY_STAT_FIELD @@ -312,12 +313,9 @@ nat64lsn_export_config(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg, { uc->flags = cfg->base.flags & 
NAT64LSN_FLAGSMASK; - uc->max_ports = cfg->max_chunks * NAT64_CHUNK_SIZE; - uc->agg_prefix_len = cfg->agg_prefix_len; - uc->agg_prefix_max = cfg->agg_prefix_max; - + uc->states_chunks = cfg->states_chunks; uc->jmaxlen = cfg->jmaxlen; - uc->nh_delete_delay = cfg->nh_delete_delay; + uc->nh_delete_delay = cfg->host_delete_delay; uc->pg_delete_delay = cfg->pg_delete_delay; uc->st_syn_ttl = cfg->st_syn_ttl; uc->st_close_ttl = cfg->st_close_ttl; @@ -425,7 +423,7 @@ nat64lsn_config(struct ip_fw_chain *ch, ip_fw3_opheader *op, cfg = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_RUNLOCK(ch); - return (EEXIST); + return (ENOENT); } nat64lsn_export_config(ch, cfg, uc); IPFW_UH_RUNLOCK(ch); @@ -438,18 +436,18 @@ nat64lsn_config(struct ip_fw_chain *ch, ip_fw3_opheader *op, cfg = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); - return (EEXIST); + return (ENOENT); } /* * For now allow to change only following values: * jmaxlen, nh_del_age, pg_del_age, tcp_syn_age, tcp_close_age, - * tcp_est_age, udp_age, icmp_age, flags, max_ports. + * tcp_est_age, udp_age, icmp_age, flags, states_chunks. 
*/ - cfg->max_chunks = uc->max_ports / NAT64_CHUNK_SIZE; + cfg->states_chunks = uc->states_chunks; cfg->jmaxlen = uc->jmaxlen; - cfg->nh_delete_delay = uc->nh_delete_delay; + cfg->host_delete_delay = uc->nh_delete_delay; cfg->pg_delete_delay = uc->pg_delete_delay; cfg->st_syn_ttl = uc->st_syn_ttl; cfg->st_close_ttl = uc->st_close_ttl; @@ -496,7 +494,7 @@ nat64lsn_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_RUNLOCK(ch); - return (ESRCH); + return (ENOENT); } export_stats(ch, cfg, &stats); @@ -538,163 +536,176 @@ nat64lsn_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); - return (ESRCH); + return (ENOENT); } COUNTER_ARRAY_ZERO(cfg->base.stats.cnt, NAT64STATS); IPFW_UH_WUNLOCK(ch); return (0); } +#ifdef __LP64__ +#define FREEMASK_COPY(pg, n, out) (out) = *FREEMASK_CHUNK((pg), (n)) +#else +#define FREEMASK_COPY(pg, n, out) (out) = *FREEMASK_CHUNK((pg), (n)) | \ + ((uint64_t)*(FREEMASK_CHUNK((pg), (n)) + 1) << 32) +#endif /* * Reply: [ ipfw_obj_header ipfw_obj_data [ ipfw_nat64lsn_stg * ipfw_nat64lsn_state x count, ... 
] ] */ static int -export_pg_states(struct nat64lsn_cfg *cfg, struct nat64lsn_portgroup *pg, - ipfw_nat64lsn_stg *stg, struct sockopt_data *sd) +nat64lsn_export_states_v1(struct nat64lsn_cfg *cfg, union nat64lsn_pgidx *idx, + struct nat64lsn_pg *pg, struct sockopt_data *sd, uint32_t *ret_count) { - ipfw_nat64lsn_state *ste; - struct nat64lsn_state *st; - int i, count; + ipfw_nat64lsn_state_v1 *s; + struct nat64lsn_state *state; + uint64_t freemask; + uint32_t i, count; - NAT64_LOCK(pg->host); - count = 0; - for (i = 0; i < 64; i++) { - if (PG_IS_BUSY_IDX(pg, i)) - count++; - } - DPRINTF(DP_STATE, "EXPORT PG %d, count %d", pg->idx, count); + /* validate user input */ + if (idx->chunk > pg->chunks_count - 1) + return (EINVAL); - if (count == 0) { - stg->count = 0; - NAT64_UNLOCK(pg->host); - return (0); - } - ste = (ipfw_nat64lsn_state *)ipfw_get_sopt_space(sd, - count * sizeof(ipfw_nat64lsn_state)); - if (ste == NULL) { - NAT64_UNLOCK(pg->host); - return (1); - } + FREEMASK_COPY(pg, idx->chunk, freemask); + count = 64 - bitcount64(freemask); + if (count == 0) + return (0); /* Try next PG/chunk */ + + DPRINTF(DP_STATE, "EXPORT PG 0x%16jx, count %d", + (uintmax_t)idx->index, count); + + s = (ipfw_nat64lsn_state_v1 *)ipfw_get_sopt_space(sd, + count * sizeof(ipfw_nat64lsn_state_v1)); + if (s == NULL) + return (ENOMEM); - stg->alias4.s_addr = pg->aaddr; - stg->proto = nat64lsn_rproto_map[pg->nat_proto]; - stg->flags = 0; - stg->host6 = pg->host->addr; - stg->count = count; for (i = 0; i < 64; i++) { - if (PG_IS_FREE_IDX(pg, i)) + if (ISSET64(freemask, i)) continue; - st = &pg->states[i]; - ste->daddr.s_addr = st->u.s.faddr; - ste->dport = st->u.s.fport; - ste->aport = pg->aport + i; - ste->sport = st->u.s.lport; - ste->flags = st->flags; /* XXX filter flags */ - ste->idle = GET_AGE(st->timestamp); - ste++; + state = pg->chunks_count == 1 ? 
&pg->states->state[i] : + &pg->states_chunk[idx->chunk]->state[i]; + + s->host6 = state->host->addr; + s->daddr.s_addr = htonl(state->ip_dst); + s->dport = state->dport; + s->sport = state->sport; + s->aport = state->aport; + s->flags = (uint8_t)(state->flags & 7); + s->proto = state->proto; + s->idle = GET_AGE(state->timestamp); + s++; } - NAT64_UNLOCK(pg->host); - + *ret_count = count; return (0); } +#define LAST_IDX 0xFF static int -get_next_idx(struct nat64lsn_cfg *cfg, uint32_t *addr, uint8_t *nat_proto, - uint16_t *port) +nat64lsn_next_pgidx(struct nat64lsn_cfg *cfg, struct nat64lsn_pg *pg, + union nat64lsn_pgidx *idx) { - if (*port < 65536 - NAT64_CHUNK_SIZE) { - *port += NAT64_CHUNK_SIZE; - return (0); + /* First iterate over chunks */ + if (pg != NULL) { + if (idx->chunk < pg->chunks_count - 1) { + idx->chunk++; + return (0); + } } - *port = 0; - - if (*nat_proto < NAT_MAX_PROTO - 1) { - *nat_proto += 1; + idx->chunk = 0; + /* Then over PGs */ + if (idx->port < UINT16_MAX - 64) { + idx->port += 64; return (0); } - *nat_proto = 1; - - if (*addr < cfg->pmask4) { - *addr += 1; + idx->port = NAT64_MIN_PORT; + /* Then over supported protocols */ + switch (idx->proto) { + case IPPROTO_ICMP: + idx->proto = IPPROTO_TCP; return (0); + case IPPROTO_TCP: + idx->proto = IPPROTO_UDP; + return (0); + default: + idx->proto = IPPROTO_ICMP; } - - /* End of space. 
*/ - return (1); + /* And then over IPv4 alias addresses */ + if (idx->addr < cfg->pmask4) { + idx->addr++; + return (1); /* New states group is needed */ + } + idx->index = LAST_IDX; + return (-1); /* No more states */ } -#define PACK_IDX(addr, proto, port) \ - ((uint64_t)addr << 32) | ((uint32_t)port << 16) | (proto << 8) -#define UNPACK_IDX(idx, addr, proto, port) \ - (addr) = (uint32_t)((idx) >> 32); \ - (port) = (uint16_t)(((idx) >> 16) & 0xFFFF); \ - (proto) = (uint8_t)(((idx) >> 8) & 0xFF) - -static struct nat64lsn_portgroup * -get_next_pg(struct nat64lsn_cfg *cfg, uint32_t *addr, uint8_t *nat_proto, - uint16_t *port) +static struct nat64lsn_pg* +nat64lsn_get_pg_byidx(struct nat64lsn_cfg *cfg, union nat64lsn_pgidx *idx) { - struct nat64lsn_portgroup *pg; - uint64_t pre_pack, post_pack; - - pg = NULL; - pre_pack = PACK_IDX(*addr, *nat_proto, *port); - for (;;) { - if (get_next_idx(cfg, addr, nat_proto, port) != 0) { - /* End of states */ - return (pg); - } - - pg = GET_PORTGROUP(cfg, *addr, *nat_proto, *port); - if (pg != NULL) - break; + struct nat64lsn_alias *alias; + int pg_idx; + + alias = &cfg->aliases[idx->addr & ((1 << (32 - cfg->plen4)) - 1)]; + MPASS(alias->addr == idx->addr); + + pg_idx = (idx->port - NAT64_MIN_PORT) / 64; + switch (idx->proto) { + case IPPROTO_ICMP: + if (ISSET32(alias->icmp_pgmask[pg_idx / 32], pg_idx % 32)) + return (alias->icmp[pg_idx / 32]->pgptr[pg_idx % 32]); + break; + case IPPROTO_TCP: + if (ISSET32(alias->tcp_pgmask[pg_idx / 32], pg_idx % 32)) + return (alias->tcp[pg_idx / 32]->pgptr[pg_idx % 32]); + break; + case IPPROTO_UDP: + if (ISSET32(alias->udp_pgmask[pg_idx / 32], pg_idx % 32)) + return (alias->udp[pg_idx / 32]->pgptr[pg_idx % 32]); + break; } - - post_pack = PACK_IDX(*addr, *nat_proto, *port); - if (pre_pack == post_pack) - DPRINTF(DP_STATE, "XXX: PACK_IDX %u %d %d", - *addr, *nat_proto, *port); - return (pg); + return (NULL); } -static NAT64NOINLINE struct nat64lsn_portgroup * -get_first_pg(struct nat64lsn_cfg 
*cfg, uint32_t *addr, uint8_t *nat_proto, - uint16_t *port) +/* + * Lists nat64lsn states. + * Data layout (v0): + * Request: [ ipfw_obj_header ipfw_obj_data [ uint64_t ]] + * Reply: [ ipfw_obj_header ipfw_obj_data [ + * ipfw_nat64lsn_stg ipfw_nat64lsn_state x N] ] + * + * Returns 0 on success + */ +static int +nat64lsn_states_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) { - struct nat64lsn_portgroup *pg; - pg = GET_PORTGROUP(cfg, *addr, *nat_proto, *port); - if (pg == NULL) - pg = get_next_pg(cfg, addr, nat_proto, port); - - return (pg); + /* TODO: implement states listing for old ipfw(8) binaries */ + return (EOPNOTSUPP); } /* * Lists nat64lsn states. - * Data layout (v0)(current): + * Data layout (v1)(current): * Request: [ ipfw_obj_header ipfw_obj_data [ uint64_t ]] * Reply: [ ipfw_obj_header ipfw_obj_data [ - * ipfw_nat64lsn_stg ipfw_nat64lsn_state x N] ] + * ipfw_nat64lsn_stg_v1 ipfw_nat64lsn_state_v1 x N] ] * * Returns 0 on success */ static int -nat64lsn_states(struct ip_fw_chain *ch, ip_fw3_opheader *op3, +nat64lsn_states_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; ipfw_obj_data *od; - ipfw_nat64lsn_stg *stg; + ipfw_nat64lsn_stg_v1 *stg; struct nat64lsn_cfg *cfg; - struct nat64lsn_portgroup *pg, *pg_next; - uint64_t next_idx; + struct nat64lsn_pg *pg; + union nat64lsn_pgidx idx; size_t sz; - uint32_t addr, states; - uint16_t port; - uint8_t nat_proto; + uint32_t count, total; + int ret; sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_data) + sizeof(uint64_t); @@ -708,78 +719,96 @@ nat64lsn_states(struct ip_fw_chain *ch, ip_fw3_opheader *op3, od->head.length != sz - sizeof(ipfw_obj_header)) return (EINVAL); - next_idx = *(uint64_t *)(od + 1); - /* Translate index to the request position to start from */ - UNPACK_IDX(next_idx, addr, nat_proto, port); - if (nat_proto >= NAT_MAX_PROTO) + idx.index = *(uint64_t *)(od + 1); + if (idx.index != 0 && idx.proto != IPPROTO_ICMP && + 
idx.proto != IPPROTO_TCP && idx.proto != IPPROTO_UDP) return (EINVAL); - if (nat_proto == 0 && addr != 0) + if (idx.index == LAST_IDX) return (EINVAL); IPFW_UH_RLOCK(ch); cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_RUNLOCK(ch); - return (ESRCH); + return (ENOENT); } - /* Fill in starting point */ - if (addr == 0) { - addr = cfg->prefix4; - nat_proto = 1; - port = 0; + if (idx.index == 0) { /* Fill in starting point */ + idx.addr = cfg->prefix4; + idx.proto = IPPROTO_ICMP; + idx.port = NAT64_MIN_PORT; } - if (addr < cfg->prefix4 || addr > cfg->pmask4) { + if (idx.addr < cfg->prefix4 || idx.addr > cfg->pmask4 || + idx.port < NAT64_MIN_PORT) { IPFW_UH_RUNLOCK(ch); - DPRINTF(DP_GENERIC | DP_STATE, "XXX: %ju %u %u", - (uintmax_t)next_idx, addr, cfg->pmask4); return (EINVAL); } - sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_data) + - sizeof(ipfw_nat64lsn_stg); - if (sd->valsize < sz) + sizeof(ipfw_nat64lsn_stg_v1); + if (sd->valsize < sz) { + IPFW_UH_RUNLOCK(ch); return (ENOMEM); + } oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, sz); od = (ipfw_obj_data *)(oh + 1); od->head.type = IPFW_TLV_OBJDATA; od->head.length = sz - sizeof(ipfw_obj_header); - stg = (ipfw_nat64lsn_stg *)(od + 1); - - pg = get_first_pg(cfg, &addr, &nat_proto, &port); - if (pg == NULL) { - /* No states */ - stg->next_idx = 0xFF; - stg->count = 0; - IPFW_UH_RUNLOCK(ch); - return (0); - } - states = 0; - pg_next = NULL; - while (pg != NULL) { - pg_next = get_next_pg(cfg, &addr, &nat_proto, &port); - if (pg_next == NULL) - stg->next_idx = 0xFF; - else - stg->next_idx = PACK_IDX(addr, nat_proto, port); - - if (export_pg_states(cfg, pg, stg, sd) != 0) { - IPFW_UH_RUNLOCK(ch); - return (states == 0 ? ENOMEM: 0); + stg = (ipfw_nat64lsn_stg_v1 *)(od + 1); + stg->count = total = 0; + stg->next.index = idx.index; + /* + * Acquire CALLOUT_LOCK to avoid races with expiration code. + * Thus states, hosts and PGs will not expire while we hold it. 
+ */ + CALLOUT_LOCK(cfg); + ret = 0; + do { + pg = nat64lsn_get_pg_byidx(cfg, &idx); + if (pg != NULL) { + count = 0; + ret = nat64lsn_export_states_v1(cfg, &idx, pg, + sd, &count); + if (ret != 0) + break; + if (count > 0) { + stg->count += count; + total += count; + /* Update total size of reply */ + od->head.length += + count * sizeof(ipfw_nat64lsn_state_v1); + sz += count * sizeof(ipfw_nat64lsn_state_v1); + } + stg->alias4.s_addr = htonl(idx.addr); } - states += stg->count; - od->head.length += stg->count * sizeof(ipfw_nat64lsn_state); - sz += stg->count * sizeof(ipfw_nat64lsn_state); - if (pg_next != NULL) { - sz += sizeof(ipfw_nat64lsn_stg); - if (sd->valsize < sz) + /* Determine new index */ + switch (nat64lsn_next_pgidx(cfg, pg, &idx)) { + case -1: + ret = ENOENT; /* End of search */ + break; + case 1: /* + * Next alias address, new group may be needed. + * If states count is zero, use this group. + */ + if (stg->count == 0) + continue; + /* Otherwise try to create new group */ + sz += sizeof(ipfw_nat64lsn_stg_v1); + if (sd->valsize < sz) { + ret = ENOMEM; break; - stg = (ipfw_nat64lsn_stg *)ipfw_get_sopt_space(sd, - sizeof(ipfw_nat64lsn_stg)); + } + /* Save next index in current group */ + stg->next.index = idx.index; + stg = (ipfw_nat64lsn_stg_v1 *)ipfw_get_sopt_space(sd, + sizeof(ipfw_nat64lsn_stg_v1)); + od->head.length += sizeof(ipfw_nat64lsn_stg_v1); + stg->count = 0; + break; } - pg = pg_next; - } + stg->next.index = idx.index; + } while (ret == 0); + CALLOUT_UNLOCK(cfg); IPFW_UH_RUNLOCK(ch); - return (0); + return ((total > 0 || idx.index == LAST_IDX) ? 
0: ret); } static struct ipfw_sopt_handler scodes[] = { @@ -789,7 +818,8 @@ static struct ipfw_sopt_handler scodes[] = { { IP_FW_NAT64LSN_LIST, 0, HDIR_GET, nat64lsn_list }, { IP_FW_NAT64LSN_STATS, 0, HDIR_GET, nat64lsn_stats }, { IP_FW_NAT64LSN_RESET_STATS,0, HDIR_SET, nat64lsn_reset_stats }, - { IP_FW_NAT64LSN_LIST_STATES,0, HDIR_GET, nat64lsn_states }, + { IP_FW_NAT64LSN_LIST_STATES,0, HDIR_GET, nat64lsn_states_v0 }, + { IP_FW_NAT64LSN_LIST_STATES,1, HDIR_GET, nat64lsn_states_v1 }, }; static int |