diff options
Diffstat (limited to 'services')
| -rw-r--r-- | services/authzone.c | 39 | ||||
| -rw-r--r-- | services/cache/dns.c | 59 | ||||
| -rw-r--r-- | services/cache/dns.h | 3 | ||||
| -rw-r--r-- | services/cache/rrset.c | 10 | ||||
| -rw-r--r-- | services/listen_dnsport.c | 2534 | ||||
| -rw-r--r-- | services/listen_dnsport.h | 405 | ||||
| -rw-r--r-- | services/mesh.c | 13 | ||||
| -rw-r--r-- | services/mesh.h | 3 | ||||
| -rw-r--r-- | services/modstack.c | 2 | ||||
| -rw-r--r-- | services/rpz.c | 19 |
10 files changed, 3009 insertions, 78 deletions
diff --git a/services/authzone.c b/services/authzone.c index 580a681f57ce..6f6c55d4397d 100644 --- a/services/authzone.c +++ b/services/authzone.c @@ -3684,6 +3684,29 @@ auth_zone_parse_notify_serial(sldns_buffer* pkt, uint32_t *serial) return 1; } +/** print addr to str, and if not 53, append "@port_number", for logs. */ +static void addr_port_to_str(struct sockaddr_storage* addr, socklen_t addrlen, + char* buf, size_t len) +{ + uint16_t port = 0; + if(addr_is_ip6(addr, addrlen)) { + struct sockaddr_in6* sa = (struct sockaddr_in6*)addr; + port = ntohs((uint16_t)sa->sin6_port); + } else { + struct sockaddr_in* sa = (struct sockaddr_in*)addr; + port = ntohs((uint16_t)sa->sin_port); + } + if(port == UNBOUND_DNS_PORT) { + /* If it is port 53, print it plainly. */ + addr_to_str(addr, addrlen, buf, len); + } else { + char a[256]; + a[0]=0; + addr_to_str(addr, addrlen, a, sizeof(a)); + snprintf(buf, len, "%s@%d", a, (int)port); + } +} + /** see if addr appears in the list */ static int addr_in_list(struct auth_addr* list, struct sockaddr_storage* addr, @@ -5516,7 +5539,7 @@ xfr_transfer_init_fetch(struct auth_xfer* xfr, struct module_env* env) if(!xfr->task_transfer->cp) { char zname[255+1], as[256]; dname_str(xfr->name, zname); - addr_to_str(&addr, addrlen, as, sizeof(as)); + addr_port_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "cannot create http cp " "connection for %s to %s", zname, as); return 0; @@ -5525,7 +5548,7 @@ xfr_transfer_init_fetch(struct auth_xfer* xfr, struct module_env* env) if(verbosity >= VERB_ALGO) { char zname[255+1], as[256]; dname_str(xfr->name, zname); - addr_to_str(&addr, addrlen, as, sizeof(as)); + addr_port_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "auth zone %s transfer next HTTP fetch from %s started", zname, as); } /* Create or refresh the list of allow_notify addrs */ @@ -5548,7 +5571,7 @@ xfr_transfer_init_fetch(struct auth_xfer* xfr, struct module_env* env) if(!xfr->task_transfer->cp) { char zname[255+1], 
as[256]; dname_str(xfr->name, zname); - addr_to_str(&addr, addrlen, as, sizeof(as)); + addr_port_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "cannot create tcp cp connection for " "xfr %s to %s", zname, as); return 0; @@ -5557,7 +5580,7 @@ xfr_transfer_init_fetch(struct auth_xfer* xfr, struct module_env* env) if(verbosity >= VERB_ALGO) { char zname[255+1], as[256]; dname_str(xfr->name, zname); - addr_to_str(&addr, addrlen, as, sizeof(as)); + addr_port_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "auth zone %s transfer next %s fetch from %s started", zname, (xfr->task_transfer->on_ixfr?"IXFR":"AXFR"), as); } @@ -5660,7 +5683,7 @@ xfr_master_add_addrs(struct auth_master* m, struct ub_packed_rrset_key* rrset, } if(verbosity >= VERB_ALGO) { char s[64]; - addr_to_str(&a->addr, a->addrlen, s, sizeof(s)); + addr_port_to_str(&a->addr, a->addrlen, s, sizeof(s)); verbose(VERB_ALGO, "auth host %s lookup %s", m->host, s); } @@ -6406,7 +6429,7 @@ xfr_probe_send_probe(struct auth_xfer* xfr, struct module_env* env, if(!xfr->task_probe->cp) { char zname[255+1], as[256]; dname_str(xfr->name, zname); - addr_to_str(&addr, addrlen, as, sizeof(as)); + addr_port_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "cannot create udp cp for " "probe %s to %s", zname, as); return 0; @@ -6426,7 +6449,7 @@ xfr_probe_send_probe(struct auth_xfer* xfr, struct module_env* env, (struct sockaddr*)&addr, addrlen, 0)) { char zname[255+1], as[256]; dname_str(xfr->name, zname); - addr_to_str(&addr, addrlen, as, sizeof(as)); + addr_port_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "failed to send soa probe for %s to %s", zname, as); return 0; @@ -6434,7 +6457,7 @@ xfr_probe_send_probe(struct auth_xfer* xfr, struct module_env* env, if(verbosity >= VERB_ALGO) { char zname[255+1], as[256]; dname_str(xfr->name, zname); - addr_to_str(&addr, addrlen, as, sizeof(as)); + addr_port_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "auth zone %s soa 
probe sent to %s", zname, as); } diff --git a/services/cache/dns.c b/services/cache/dns.c index 5e74c31693b3..7ab63bacf492 100644 --- a/services/cache/dns.c +++ b/services/cache/dns.c @@ -88,7 +88,7 @@ store_rrsets(struct module_env* env, struct reply_info* rep, time_t now, /* update ref if it was in the cache */ switch(rrset_cache_update(env->rrset_cache, &rep->ref[i], env->alloc, ((ntohs(rep->ref[i].key->rk.type)== - LDNS_RR_TYPE_NS && !pside)?qstarttime:now + leeway))) { + LDNS_RR_TYPE_NS && !pside)?qstarttime:now) + leeway)) { case 0: /* ref unchanged, item inserted */ break; case 2: /* ref updated, cache is superior */ @@ -162,7 +162,7 @@ dns_cache_store_msg(struct module_env* env, struct query_info* qinfo, size_t i; /* store RRsets */ - for(i=0; i<rep->rrset_count; i++) { + for(i=0; i<rep->rrset_count; i++) { rep->ref[i].key = rep->rrsets[i]; rep->ref[i].id = rep->rrsets[i]->id; } @@ -197,6 +197,7 @@ dns_cache_store_msg(struct module_env* env, struct query_info* qinfo, reply_info_sortref(rep); if(!(e = query_info_entrysetup(qinfo, rep, hash))) { log_err("store_msg: malloc failed"); + reply_info_delete(rep, NULL); return; } slabhash_insert(env->msg_cache, hash, &e->entry, rep, env->alloc); @@ -365,7 +366,7 @@ find_add_addrs(struct module_env* env, uint16_t qclass, /** find and add A and AAAA records for missing nameservers in delegpt */ int cache_fill_missing(struct module_env* env, uint16_t qclass, - struct regional* region, struct delegpt* dp) + struct regional* region, struct delegpt* dp, uint32_t flags) { struct delegpt_ns* ns; struct msgreply_entry* neg; @@ -376,7 +377,7 @@ cache_fill_missing(struct module_env* env, uint16_t qclass, continue; ns->cache_lookup_count++; akey = rrset_cache_lookup(env->rrset_cache, ns->name, - ns->namelen, LDNS_RR_TYPE_A, qclass, 0, now, 0); + ns->namelen, LDNS_RR_TYPE_A, qclass, flags, now, 0); if(akey) { if(!delegpt_add_rrset_A(dp, region, akey, ns->lame, NULL)) { @@ -397,7 +398,7 @@ cache_fill_missing(struct module_env* 
env, uint16_t qclass, } } akey = rrset_cache_lookup(env->rrset_cache, ns->name, - ns->namelen, LDNS_RR_TYPE_AAAA, qclass, 0, now, 0); + ns->namelen, LDNS_RR_TYPE_AAAA, qclass, flags, now, 0); if(akey) { if(!delegpt_add_rrset_AAAA(dp, region, akey, ns->lame, NULL)) { @@ -607,22 +608,8 @@ tomsg(struct module_env* env, struct query_info* q, struct reply_info* r, time_t now_control = now; if(now > r->ttl) { /* Check if we are allowed to serve expired */ - if(allow_expired) { - if(env->cfg->serve_expired_ttl && - r->serve_expired_ttl < now) { - return NULL; - } - /* Ignore expired failure answers */ - if(FLAGS_GET_RCODE(r->flags) != - LDNS_RCODE_NOERROR && - FLAGS_GET_RCODE(r->flags) != - LDNS_RCODE_NXDOMAIN && - FLAGS_GET_RCODE(r->flags) != - LDNS_RCODE_YXDOMAIN) - return 0; - } else { + if(!allow_expired || !reply_info_can_answer_expired(r, now)) return NULL; - } /* Change the current time so we can pass the below TTL checks when * serving expired data. */ now_control = r->ttl - env->cfg->serve_expired_reply_ttl; @@ -641,6 +628,7 @@ tomsg(struct module_env* env, struct query_info* q, struct reply_info* r, else msg->rep->prefetch_ttl = PREFETCH_TTL_CALC(msg->rep->ttl); msg->rep->serve_expired_ttl = msg->rep->ttl + SERVE_EXPIRED_TTL; + msg->rep->serve_expired_norec_ttl = 0; msg->rep->security = r->security; msg->rep->an_numrrsets = r->an_numrrsets; msg->rep->ns_numrrsets = r->ns_numrrsets; @@ -724,6 +712,7 @@ rrset_msg(struct ub_packed_rrset_key* rrset, struct regional* region, msg->rep->ttl = d->ttl - now; msg->rep->prefetch_ttl = PREFETCH_TTL_CALC(msg->rep->ttl); msg->rep->serve_expired_ttl = msg->rep->ttl + SERVE_EXPIRED_TTL; + msg->rep->serve_expired_norec_ttl = 0; msg->rep->security = sec_status_unchecked; msg->rep->an_numrrsets = 1; msg->rep->ns_numrrsets = 0; @@ -763,6 +752,7 @@ synth_dname_msg(struct ub_packed_rrset_key* rrset, struct regional* region, msg->rep->ttl = d->ttl - now; msg->rep->prefetch_ttl = PREFETCH_TTL_CALC(msg->rep->ttl); 
msg->rep->serve_expired_ttl = msg->rep->ttl + SERVE_EXPIRED_TTL; + msg->rep->serve_expired_norec_ttl = 0; msg->rep->security = sec_status_unchecked; msg->rep->an_numrrsets = 1; msg->rep->ns_numrrsets = 0; @@ -1070,6 +1060,35 @@ dns_cache_store(struct module_env* env, struct query_info* msgqinf, struct regional* region, uint32_t flags, time_t qstarttime) { struct reply_info* rep = NULL; + if(SERVE_EXPIRED) { + /* We are serving expired records. Before caching, check if a + * useful expired record exists. */ + struct msgreply_entry* e = msg_cache_lookup(env, + msgqinf->qname, msgqinf->qname_len, msgqinf->qtype, + msgqinf->qclass, flags, 0, 0); + if(e) { + struct reply_info* cached = e->entry.data; + if(cached->ttl < *env->now + && reply_info_could_use_expired(cached, *env->now) + /* If we are validating make sure only + * validating modules can update such messages. + * In that case don't cache it and let a + * subsequent module handle the caching. For + * example, the iterator should not replace an + * expired secure answer with a fresh unchecked + * one and let the validator manage caching. */ + && cached->security != sec_status_bogus + && (env->need_to_validate && + msgrep->security == sec_status_unchecked)) { + verbose(VERB_ALGO, "a validated expired entry " + "could be overwritten, skip caching " + "the new message at this stage"); + lock_rw_unlock(&e->entry.lock); + return 1; + } + lock_rw_unlock(&e->entry.lock); + } + } /* alloc, malloc properly (not in region, like msg is) */ rep = reply_info_copy(msgrep, env->alloc, NULL); if(!rep) diff --git a/services/cache/dns.h b/services/cache/dns.h index c2bf23c6de54..1dd537d2bd5d 100644 --- a/services/cache/dns.h +++ b/services/cache/dns.h @@ -202,10 +202,11 @@ struct dns_msg* dns_cache_lookup(struct module_env* env, * @param qclass: which class to look in. * @param region: where to store new dp info. * @param dp: delegation point to fill missing entries. + * @param flags: rrset flags, or 0. 
* @return false on alloc failure. */ int cache_fill_missing(struct module_env* env, uint16_t qclass, - struct regional* region, struct delegpt* dp); + struct regional* region, struct delegpt* dp, uint32_t flags); /** * Utility, create new, unpacked data structure for cache response. diff --git a/services/cache/rrset.c b/services/cache/rrset.c index 2c03214c8fe2..a05ae5a56b78 100644 --- a/services/cache/rrset.c +++ b/services/cache/rrset.c @@ -128,8 +128,8 @@ need_to_update_rrset(void* nd, void* cd, time_t timenow, int equal, int ns) { struct packed_rrset_data* newd = (struct packed_rrset_data*)nd; struct packed_rrset_data* cached = (struct packed_rrset_data*)cd; - /* o if new data is expired, current data is better */ - if( newd->ttl < timenow && cached->ttl >= timenow) + /* o if new data is expired, cached data is better */ + if( newd->ttl < timenow && timenow <= cached->ttl) return 0; /* o store if rrset has been validated * everything better than bogus data @@ -140,9 +140,9 @@ need_to_update_rrset(void* nd, void* cd, time_t timenow, int equal, int ns) if( cached->security == sec_status_bogus && newd->security != sec_status_bogus && !equal) return 1; - /* o if current RRset is more trustworthy - insert it */ + /* o if new RRset is more trustworthy - insert it */ if( newd->trust > cached->trust ) { - /* if the cached rrset is bogus, and this one equal, + /* if the cached rrset is bogus, and new is equal, * do not update the TTL - let it expire. */ if(equal && cached->ttl >= timenow && cached->security == sec_status_bogus) @@ -155,7 +155,7 @@ need_to_update_rrset(void* nd, void* cd, time_t timenow, int equal, int ns) /* o same trust, but different in data - insert it */ if( newd->trust == cached->trust && !equal ) { /* if this is type NS, do not 'stick' to owner that changes - * the NS RRset, but use the old TTL for the new data, and + * the NS RRset, but use the cached TTL for the new data, and * update to fetch the latest data. 
ttl is not expired, because * that check was before this one. */ if(ns) { diff --git a/services/listen_dnsport.c b/services/listen_dnsport.c index 6c0691f2a73c..5dbac3650aaf 100644 --- a/services/listen_dnsport.c +++ b/services/listen_dnsport.c @@ -56,9 +56,11 @@ #include "util/net_help.h" #include "sldns/sbuffer.h" #include "sldns/parseutil.h" +#include "sldns/wire2str.h" #include "services/mesh.h" #include "util/fptr_wlist.h" #include "util/locks.h" +#include "util/timeval_func.h" #ifdef HAVE_NETDB_H #include <netdb.h> @@ -79,9 +81,30 @@ #ifdef HAVE_NET_IF_H #include <net/if.h> #endif + +#ifdef HAVE_TIME_H +#include <time.h> +#endif +#include <sys/time.h> + +#ifdef HAVE_NGTCP2 +#include <ngtcp2/ngtcp2.h> +#include <ngtcp2/ngtcp2_crypto.h> +#ifdef HAVE_NGTCP2_NGTCP2_CRYPTO_QUICTLS_H +#include <ngtcp2/ngtcp2_crypto_quictls.h> +#else +#include <ngtcp2/ngtcp2_crypto_openssl.h> +#endif +#endif + +#ifdef HAVE_OPENSSL_SSL_H +#include <openssl/ssl.h> +#endif + #ifdef HAVE_LINUX_NET_TSTAMP_H #include <linux/net_tstamp.h> #endif + /** number of queued TCP connections for listen() */ #define TCP_BACKLOG 256 @@ -109,9 +132,11 @@ static int http2_response_buffer_lock_inited = 0; /** * Debug print of the getaddrinfo returned address. * @param addr: the address returned. + * @param additional: additional text that describes the type of socket, + * or NULL for no text. 
*/ static void -verbose_print_addr(struct addrinfo *addr) +verbose_print_addr(struct addrinfo *addr, const char* additional) { if(verbosity >= VERB_ALGO) { char buf[100]; @@ -126,13 +151,14 @@ verbose_print_addr(struct addrinfo *addr) (void)strlcpy(buf, "(null)", sizeof(buf)); } buf[sizeof(buf)-1] = 0; - verbose(VERB_ALGO, "creating %s%s socket %s %d", + verbose(VERB_ALGO, "creating %s%s socket %s %d%s%s", addr->ai_socktype==SOCK_DGRAM?"udp": addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto", addr->ai_family==AF_INET?"4": addr->ai_family==AF_INET6?"6": "_otherfam", buf, - ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port)); + ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port), + (additional?" ":""), (additional?additional:"")); } } @@ -673,7 +699,7 @@ create_udp_sock(int family, int socktype, struct sockaddr* addr, int create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto, int* reuseport, int transparent, int mss, int nodelay, int freebind, - int use_systemd, int dscp) + int use_systemd, int dscp, const char* additional) { int s = -1; char* err; @@ -692,7 +718,7 @@ create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto, #if !defined(IP_FREEBIND) (void)freebind; #endif - verbose_print_addr(addr); + verbose_print_addr(addr, additional); *noproto = 0; #ifdef HAVE_SYSTEMD if (!use_systemd || @@ -1008,7 +1034,8 @@ static int make_sock(int stype, const char* ifname, const char* port, struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd, int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind, - int use_systemd, int dscp, struct unbound_socket* ub_sock) + int use_systemd, int dscp, struct unbound_socket* ub_sock, + const char* additional) { struct addrinfo *res = NULL; int r, s, inuse, noproto; @@ -1032,7 +1059,7 @@ make_sock(int stype, const char* ifname, const char* port, return -1; } if(stype == SOCK_DGRAM) { - verbose_print_addr(res); + verbose_print_addr(res, additional); s = 
create_udp_sock(res->ai_family, res->ai_socktype, (struct sockaddr*)res->ai_addr, res->ai_addrlen, v6only, &inuse, &noproto, (int)rcv, (int)snd, 1, @@ -1045,7 +1072,7 @@ make_sock(int stype, const char* ifname, const char* port, } else { s = create_tcp_accept_sock(res, v6only, &noproto, reuseport, transparent, tcp_mss, nodelay, freebind, use_systemd, - dscp); + dscp, additional); if(s == -1 && noproto && hints->ai_family == AF_INET6){ *noip6 = 1; } @@ -1079,7 +1106,8 @@ static int make_sock_port(int stype, const char* ifname, const char* port, struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd, int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind, - int use_systemd, int dscp, struct unbound_socket* ub_sock) + int use_systemd, int dscp, struct unbound_socket* ub_sock, + const char* additional) { char* s = strchr(ifname, '@'); if(s) { @@ -1102,11 +1130,11 @@ make_sock_port(int stype, const char* ifname, const char* port, p[strlen(s+1)]=0; return make_sock(stype, newif, p, hints, v6only, noip6, rcv, snd, reuseport, transparent, tcp_mss, nodelay, freebind, - use_systemd, dscp, ub_sock); + use_systemd, dscp, ub_sock, additional); } return make_sock(stype, ifname, port, hints, v6only, noip6, rcv, snd, reuseport, transparent, tcp_mss, nodelay, freebind, use_systemd, - dscp, ub_sock); + dscp, ub_sock, additional); } /** @@ -1254,6 +1282,8 @@ if_is_ssl(const char* ifname, const char* port, int ssl_port, * @param use_systemd: if true, fetch sockets from systemd. * @param dnscrypt_port: dnscrypt service port number * @param dscp: DSCP to use. + * @param quic_port: dns over quic port number. + * @param http_notls_downstream: if no tls is used for https downstream. * @param sock_queue_timeout: the sock_queue_timeout from config. Seconds to * wait to discard if UDP packets have waited for long in the socket * buffer. 
@@ -1267,7 +1297,7 @@ ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, struct config_strlist* proxy_protocol_port, int* reuseport, int transparent, int tcp_mss, int freebind, int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp, - int sock_queue_timeout) + int quic_port, int http_notls_downstream, int sock_queue_timeout) { int s, noip6=0; int is_https = if_is_https(ifname, port, https_port); @@ -1275,6 +1305,8 @@ ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port); int nodelay = is_https && http2_nodelay; struct unbound_socket* ub_sock; + int is_doq = if_is_quic(ifname, port, quic_port); + const char* add = NULL; if(!do_udp && !do_tcp) return 0; @@ -1286,6 +1318,9 @@ ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, } else if(is_https) { fatal_exit("PROXYv2 and DoH combination not " "supported!"); + } else if(is_doq) { + fatal_exit("PROXYv2 and DoQ combination not " + "supported!"); } } @@ -1295,7 +1330,8 @@ ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, return 0; if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1, &noip6, rcv, snd, reuseport, transparent, - tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) { + tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock, + (is_dnscrypt?"udpancil_dnscrypt":"udpancil"))) == -1) { free(ub_sock->addr); free(ub_sock); if(noip6) { @@ -1323,13 +1359,36 @@ ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, return 0; } } else if(do_udp) { + enum listen_type udp_port_type; ub_sock = calloc(1, sizeof(struct unbound_socket)); if(!ub_sock) return 0; + if(is_dnscrypt) { + udp_port_type = listen_type_udp_dnscrypt; + add = "dnscrypt"; + } else if(is_doq) { + udp_port_type = listen_type_doq; + add = "doq"; + if(((strchr(ifname, '@') && + atoi(strchr(ifname, '@')+1) == 53) || + (!strchr(ifname, '@') && atoi(port) == 53))) { + 
log_err("DNS over QUIC is not allowed on " + "port 53. Port 53 is for DNS " + "datagrams. Error for " + "interface '%s'.", ifname); + free(ub_sock->addr); + free(ub_sock); + return 0; + } + } else { + udp_port_type = listen_type_udp; + add = NULL; + } /* regular udp socket */ if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1, &noip6, rcv, snd, reuseport, transparent, - tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) { + tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock, + add)) == -1) { free(ub_sock->addr); free(ub_sock); if(noip6) { @@ -1338,14 +1397,25 @@ ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, } return 0; } - if (sock_queue_timeout && !set_recvtimestamp(s)) { - log_warn("socket timestamping is not available"); + if(udp_port_type == listen_type_doq) { + if(!set_recvpktinfo(s, hints->ai_family)) { + sock_close(s); + free(ub_sock->addr); + free(ub_sock); + return 0; + } } - if(!port_insert(list, s, is_dnscrypt - ?listen_type_udp_dnscrypt : - (sock_queue_timeout ? 
- listen_type_udpancil:listen_type_udp), - is_pp2, ub_sock)) { + if(udp_port_type == listen_type_udp && sock_queue_timeout) + udp_port_type = listen_type_udpancil; + if (sock_queue_timeout) { + if(!set_recvtimestamp(s)) { + log_warn("socket timestamping is not available"); + } else { + if(udp_port_type == listen_type_udp) + udp_port_type = listen_type_udpancil; + } + } + if(!port_insert(list, s, udp_port_type, is_pp2, ub_sock)) { sock_close(s); free(ub_sock->addr); free(ub_sock); @@ -1359,17 +1429,24 @@ ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, ub_sock = calloc(1, sizeof(struct unbound_socket)); if(!ub_sock) return 0; - if(is_ssl) + if(is_ssl) { port_type = listen_type_ssl; - else if(is_https) + add = "tls"; + } else if(is_https) { port_type = listen_type_http; - else if(is_dnscrypt) + add = "https"; + if(http_notls_downstream) + add = "http"; + } else if(is_dnscrypt) { port_type = listen_type_tcp_dnscrypt; - else + add = "dnscrypt"; + } else { port_type = listen_type_tcp; + add = NULL; + } if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1, &noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay, - freebind, use_systemd, dscp, ub_sock)) == -1) { + freebind, use_systemd, dscp, ub_sock, add)) == -1) { free(ub_sock->addr); free(ub_sock); if(noip6) { @@ -1446,8 +1523,10 @@ listen_create(struct comm_base* base, struct listen_port* ports, size_t bufsize, int tcp_accept_count, int tcp_idle_timeout, int harden_large_queries, uint32_t http_max_streams, char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit, - void* sslctx, struct dt_env* dtenv, comm_point_callback_type* cb, - void *cb_arg) + void* sslctx, struct dt_env* dtenv, struct doq_table* doq_table, + struct ub_randstate* rnd, const char* ssl_service_key, + const char* ssl_service_pem, struct config_file* cfg, + comm_point_callback_type* cb, void *cb_arg) { struct listen_dnsport* front = (struct listen_dnsport*) malloc(sizeof(struct listen_dnsport)); @@ -1471,6 
+1550,16 @@ listen_create(struct comm_base* base, struct listen_port* ports, cp = comm_point_create_udp(base, ports->fd, front->udp_buff, ports->pp2_enabled, cb, cb_arg, ports->socket); + } else if(ports->ftype == listen_type_doq) { +#ifndef HAVE_NGTCP2 + log_warn("Unbound is not compiled with " + "ngtcp2. This is required to use DNS " + "over QUIC."); +#endif + cp = comm_point_create_doq(base, ports->fd, + front->udp_buff, cb, cb_arg, ports->socket, + doq_table, rnd, ssl_service_key, + ssl_service_pem, cfg); } else if(ports->ftype == listen_type_tcp || ports->ftype == listen_type_tcp_dnscrypt) { cp = comm_point_create_tcp(base, ports->fd, @@ -1858,7 +1947,9 @@ listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs, reuseport, cfg->ip_transparent, cfg->tcp_mss, cfg->ip_freebind, cfg->http_nodelay, cfg->use_systemd, - cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) { + cfg->dnscrypt_port, cfg->ip_dscp, + cfg->quic_port, cfg->http_notls_downstream, + cfg->sock_queue_timeout)) { listening_ports_free(list); return NULL; } @@ -1875,7 +1966,9 @@ listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs, reuseport, cfg->ip_transparent, cfg->tcp_mss, cfg->ip_freebind, cfg->http_nodelay, cfg->use_systemd, - cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) { + cfg->dnscrypt_port, cfg->ip_dscp, + cfg->quic_port, cfg->http_notls_downstream, + cfg->sock_queue_timeout)) { listening_ports_free(list); return NULL; } @@ -1894,7 +1987,9 @@ listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs, reuseport, cfg->ip_transparent, cfg->tcp_mss, cfg->ip_freebind, cfg->http_nodelay, cfg->use_systemd, - cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) { + cfg->dnscrypt_port, cfg->ip_dscp, + cfg->quic_port, cfg->http_notls_downstream, + cfg->sock_queue_timeout)) { listening_ports_free(list); return NULL; } @@ -1910,7 +2005,9 @@ listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs, reuseport, 
cfg->ip_transparent, cfg->tcp_mss, cfg->ip_freebind, cfg->http_nodelay, cfg->use_systemd, - cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) { + cfg->dnscrypt_port, cfg->ip_dscp, + cfg->quic_port, cfg->http_notls_downstream, + cfg->sock_queue_timeout)) { listening_ports_free(list); return NULL; } @@ -1928,7 +2025,9 @@ listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs, reuseport, cfg->ip_transparent, cfg->tcp_mss, cfg->ip_freebind, cfg->http_nodelay, cfg->use_systemd, - cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) { + cfg->dnscrypt_port, cfg->ip_dscp, + cfg->quic_port, cfg->http_notls_downstream, + cfg->sock_queue_timeout)) { listening_ports_free(list); return NULL; } @@ -1944,7 +2043,9 @@ listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs, reuseport, cfg->ip_transparent, cfg->tcp_mss, cfg->ip_freebind, cfg->http_nodelay, cfg->use_systemd, - cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) { + cfg->dnscrypt_port, cfg->ip_dscp, + cfg->quic_port, cfg->http_notls_downstream, + cfg->sock_queue_timeout)) { listening_ports_free(list); return NULL; } @@ -3154,3 +3255,2368 @@ nghttp2_session_callbacks* http2_req_callbacks_create(void) return callbacks; } #endif /* HAVE_NGHTTP2 */ + +#ifdef HAVE_NGTCP2 +struct doq_table* +doq_table_create(struct config_file* cfg, struct ub_randstate* rnd) +{ + struct doq_table* table = calloc(1, sizeof(*table)); + if(!table) + return NULL; + table->idle_timeout = ((uint64_t)cfg->tcp_idle_timeout)* + NGTCP2_MILLISECONDS; + table->sv_scidlen = 16; + table->static_secret_len = 16; + table->static_secret = malloc(table->static_secret_len); + if(!table->static_secret) { + free(table); + return NULL; + } + doq_fill_rand(rnd, table->static_secret, table->static_secret_len); + table->conn_tree = rbtree_create(doq_conn_cmp); + if(!table->conn_tree) { + free(table->static_secret); + free(table); + return NULL; + } + table->conid_tree = rbtree_create(doq_conid_cmp); + 
if(!table->conid_tree) { + free(table->static_secret); + free(table->conn_tree); + free(table); + return NULL; + } + table->timer_tree = rbtree_create(doq_timer_cmp); + if(!table->timer_tree) { + free(table->static_secret); + free(table->conn_tree); + free(table->conid_tree); + free(table); + return NULL; + } + lock_rw_init(&table->lock); + lock_rw_init(&table->conid_lock); + lock_basic_init(&table->size_lock); + lock_protect(&table->lock, &table->static_secret, + sizeof(table->static_secret)); + lock_protect(&table->lock, &table->static_secret_len, + sizeof(table->static_secret_len)); + lock_protect(&table->lock, table->static_secret, + table->static_secret_len); + lock_protect(&table->lock, &table->sv_scidlen, + sizeof(table->sv_scidlen)); + lock_protect(&table->lock, &table->idle_timeout, + sizeof(table->idle_timeout)); + lock_protect(&table->lock, &table->conn_tree, sizeof(table->conn_tree)); + lock_protect(&table->lock, table->conn_tree, sizeof(*table->conn_tree)); + lock_protect(&table->conid_lock, table->conid_tree, + sizeof(*table->conid_tree)); + lock_protect(&table->lock, table->timer_tree, + sizeof(*table->timer_tree)); + lock_protect(&table->size_lock, &table->current_size, + sizeof(table->current_size)); + return table; +} + +/** delete elements from the connection tree */ +static void +conn_tree_del(rbnode_type* node, void* arg) +{ + struct doq_table* table = (struct doq_table*)arg; + struct doq_conn* conn; + if(!node) + return; + conn = (struct doq_conn*)node->key; + if(conn->timer.timer_in_list) { + /* Remove timer from list first, because finding the rbnode + * element of the setlist of same timeouts needs tree lookup. + * Edit the tree structure after that lookup. 
*/ + doq_timer_list_remove(conn->table, &conn->timer); + } + if(conn->timer.timer_in_tree) + doq_timer_tree_remove(conn->table, &conn->timer); + doq_table_quic_size_subtract(table, sizeof(*conn)+conn->key.dcidlen); + doq_conn_delete(conn, table); +} + +/** delete elements from the connection id tree */ +static void +conid_tree_del(rbnode_type* node, void* ATTR_UNUSED(arg)) +{ + if(!node) + return; + doq_conid_delete((struct doq_conid*)node->key); +} + +void +doq_table_delete(struct doq_table* table) +{ + if(!table) + return; + lock_rw_destroy(&table->lock); + free(table->static_secret); + if(table->conn_tree) { + traverse_postorder(table->conn_tree, conn_tree_del, table); + free(table->conn_tree); + } + lock_rw_destroy(&table->conid_lock); + if(table->conid_tree) { + /* The tree should be empty, because the doq_conn_delete calls + * above should have also removed their conid elements. */ + traverse_postorder(table->conid_tree, conid_tree_del, NULL); + free(table->conid_tree); + } + lock_basic_destroy(&table->size_lock); + if(table->timer_tree) { + /* The tree should be empty, because the conn_tree_del calls + * above should also have removed them. Also the doq_timer + * is part of the doq_conn struct, so is already freed. */ + free(table->timer_tree); + } + table->write_list_first = NULL; + table->write_list_last = NULL; + free(table); +} + +struct doq_timer* +doq_timer_find_time(struct doq_table* table, struct timeval* tv) +{ + struct doq_timer key; + struct rbnode_type* node; + memset(&key, 0, sizeof(key)); + key.time.tv_sec = tv->tv_sec; + key.time.tv_usec = tv->tv_usec; + node = rbtree_search(table->timer_tree, &key); + if(node) + return (struct doq_timer*)node->key; + return NULL; +} + +void +doq_timer_tree_remove(struct doq_table* table, struct doq_timer* timer) +{ + if(!timer->timer_in_tree) + return; + rbtree_delete(table->timer_tree, timer); + timer->timer_in_tree = 0; + /* This item could have more timers in the same set. 
*/ + if(timer->setlist_first) { + struct doq_timer* rb_timer = timer->setlist_first; + /* del first element from setlist */ + if(rb_timer->setlist_next) + rb_timer->setlist_next->setlist_prev = NULL; + else + timer->setlist_last = NULL; + timer->setlist_first = rb_timer->setlist_next; + rb_timer->setlist_prev = NULL; + rb_timer->setlist_next = NULL; + rb_timer->timer_in_list = 0; + /* insert it into the tree as new rb element */ + memset(&rb_timer->node, 0, sizeof(rb_timer->node)); + rb_timer->node.key = rb_timer; + rbtree_insert(table->timer_tree, &rb_timer->node); + rb_timer->timer_in_tree = 1; + /* the setlist, if any remainder, moves to the rb element */ + rb_timer->setlist_first = timer->setlist_first; + rb_timer->setlist_last = timer->setlist_last; + timer->setlist_first = NULL; + timer->setlist_last = NULL; + rb_timer->worker_doq_socket = timer->worker_doq_socket; + } + timer->worker_doq_socket = NULL; +} + +void +doq_timer_list_remove(struct doq_table* table, struct doq_timer* timer) +{ + struct doq_timer* rb_timer; + if(!timer->timer_in_list) + return; + /* The item in the rbtree has the list start and end. 
*/ + rb_timer = doq_timer_find_time(table, &timer->time); + if(rb_timer) { + if(timer->setlist_prev) + timer->setlist_prev->setlist_next = timer->setlist_next; + else + rb_timer->setlist_first = timer->setlist_next; + if(timer->setlist_next) + timer->setlist_next->setlist_prev = timer->setlist_prev; + else + rb_timer->setlist_last = timer->setlist_prev; + timer->setlist_prev = NULL; + timer->setlist_next = NULL; + } + timer->timer_in_list = 0; +} + +/** doq append timer to setlist */ +static void +doq_timer_list_append(struct doq_timer* rb_timer, struct doq_timer* timer) +{ + log_assert(timer->timer_in_list == 0); + timer->timer_in_list = 1; + timer->setlist_next = NULL; + timer->setlist_prev = rb_timer->setlist_last; + if(rb_timer->setlist_last) + rb_timer->setlist_last->setlist_next = timer; + else + rb_timer->setlist_first = timer; + rb_timer->setlist_last = timer; +} + +void +doq_timer_unset(struct doq_table* table, struct doq_timer* timer) +{ + if(timer->timer_in_list) { + /* Remove timer from list first, because finding the rbnode + * element of the setlist of same timeouts needs tree lookup. + * Edit the tree structure after that lookup. 
*/ + doq_timer_list_remove(table, timer); + } + if(timer->timer_in_tree) + doq_timer_tree_remove(table, timer); + timer->worker_doq_socket = NULL; +} + +void doq_timer_set(struct doq_table* table, struct doq_timer* timer, + struct doq_server_socket* worker_doq_socket, struct timeval* tv) +{ + struct doq_timer* rb_timer; + if(verbosity >= VERB_ALGO && timer->conn) { + char a[256]; + struct timeval rel; + addr_to_str((void*)&timer->conn->key.paddr.addr, + timer->conn->key.paddr.addrlen, a, sizeof(a)); + timeval_subtract(&rel, tv, worker_doq_socket->now_tv); + verbose(VERB_ALGO, "doq %s timer set %d.%6.6d in %d.%6.6d", + a, (int)tv->tv_sec, (int)tv->tv_usec, + (int)rel.tv_sec, (int)rel.tv_usec); + } + if(timer->timer_in_tree || timer->timer_in_list) { + if(timer->time.tv_sec == tv->tv_sec && + timer->time.tv_usec == tv->tv_usec) + return; /* already set on that time */ + doq_timer_unset(table, timer); + } + timer->time.tv_sec = tv->tv_sec; + timer->time.tv_usec = tv->tv_usec; + rb_timer = doq_timer_find_time(table, tv); + if(rb_timer) { + /* There is a timeout already with this value. Timer is + * added to the setlist. */ + doq_timer_list_append(rb_timer, timer); + } else { + /* There is no timeout with this value. Make timer a new + * tree element. 
*/ + memset(&timer->node, 0, sizeof(timer->node)); + timer->node.key = timer; + rbtree_insert(table->timer_tree, &timer->node); + timer->timer_in_tree = 1; + timer->setlist_first = NULL; + timer->setlist_last = NULL; + timer->worker_doq_socket = worker_doq_socket; + } +} + +struct doq_conn* +doq_conn_create(struct comm_point* c, struct doq_pkt_addr* paddr, + const uint8_t* dcid, size_t dcidlen, uint32_t version) +{ + struct doq_conn* conn = calloc(1, sizeof(*conn)); + if(!conn) + return NULL; + conn->node.key = conn; + conn->doq_socket = c->doq_socket; + conn->table = c->doq_socket->table; + memmove(&conn->key.paddr.addr, &paddr->addr, paddr->addrlen); + conn->key.paddr.addrlen = paddr->addrlen; + memmove(&conn->key.paddr.localaddr, &paddr->localaddr, + paddr->localaddrlen); + conn->key.paddr.localaddrlen = paddr->localaddrlen; + conn->key.paddr.ifindex = paddr->ifindex; + conn->key.dcid = memdup((void*)dcid, dcidlen); + if(!conn->key.dcid) { + free(conn); + return NULL; + } + conn->key.dcidlen = dcidlen; + conn->version = version; +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + ngtcp2_ccerr_default(&conn->ccerr); +#else + ngtcp2_connection_close_error_default(&conn->last_error); +#endif + rbtree_init(&conn->stream_tree, &doq_stream_cmp); + conn->timer.conn = conn; + lock_basic_init(&conn->lock); + lock_protect(&conn->lock, &conn->key, sizeof(conn->key)); + lock_protect(&conn->lock, &conn->doq_socket, sizeof(conn->doq_socket)); + lock_protect(&conn->lock, &conn->table, sizeof(conn->table)); + lock_protect(&conn->lock, &conn->is_deleted, sizeof(conn->is_deleted)); + lock_protect(&conn->lock, &conn->version, sizeof(conn->version)); + lock_protect(&conn->lock, &conn->conn, sizeof(conn->conn)); + lock_protect(&conn->lock, &conn->conid_list, sizeof(conn->conid_list)); +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + lock_protect(&conn->lock, &conn->ccerr, sizeof(conn->ccerr)); +#else + lock_protect(&conn->lock, &conn->last_error, sizeof(conn->last_error)); +#endif + lock_protect(&conn->lock, 
&conn->tls_alert, sizeof(conn->tls_alert)); + lock_protect(&conn->lock, &conn->ssl, sizeof(conn->ssl)); + lock_protect(&conn->lock, &conn->close_pkt, sizeof(conn->close_pkt)); + lock_protect(&conn->lock, &conn->close_pkt_len, sizeof(conn->close_pkt_len)); + lock_protect(&conn->lock, &conn->close_ecn, sizeof(conn->close_ecn)); + lock_protect(&conn->lock, &conn->stream_tree, sizeof(conn->stream_tree)); + lock_protect(&conn->lock, &conn->stream_write_first, sizeof(conn->stream_write_first)); + lock_protect(&conn->lock, &conn->stream_write_last, sizeof(conn->stream_write_last)); + lock_protect(&conn->lock, &conn->write_interest, sizeof(conn->write_interest)); + lock_protect(&conn->lock, &conn->on_write_list, sizeof(conn->on_write_list)); + lock_protect(&conn->lock, &conn->write_prev, sizeof(conn->write_prev)); + lock_protect(&conn->lock, &conn->write_next, sizeof(conn->write_next)); + return conn; +} + +/** delete stream tree node */ +static void +stream_tree_del(rbnode_type* node, void* arg) +{ + struct doq_table* table = (struct doq_table*)arg; + struct doq_stream* stream; + if(!node) + return; + stream = (struct doq_stream*)node; + if(stream->in) + doq_table_quic_size_subtract(table, stream->inlen); + if(stream->out) + doq_table_quic_size_subtract(table, stream->outlen); + doq_table_quic_size_subtract(table, sizeof(*stream)); + doq_stream_delete(stream); +} + +void +doq_conn_delete(struct doq_conn* conn, struct doq_table* table) +{ + if(!conn) + return; + lock_basic_destroy(&conn->lock); + lock_rw_wrlock(&conn->table->conid_lock); + doq_conn_clear_conids(conn); + lock_rw_unlock(&conn->table->conid_lock); + ngtcp2_conn_del(conn->conn); + if(conn->stream_tree.count != 0) { + traverse_postorder(&conn->stream_tree, stream_tree_del, table); + } + free(conn->key.dcid); + SSL_free(conn->ssl); + free(conn->close_pkt); + free(conn); +} + +int +doq_conn_cmp(const void* key1, const void* key2) +{ + struct doq_conn* c = (struct doq_conn*)key1; + struct doq_conn* d = (struct 
doq_conn*)key2; + int r; + /* Compared in the order destination address, then + * local address, ifindex and then dcid. + * So that for a search for findlessorequal for the destination + * address will find connections to that address, with different + * dcids. + * Also a printout in sorted order prints the connections by IP + * address of destination, and then a number of them depending on the + * dcids. */ + if(c->key.paddr.addrlen != d->key.paddr.addrlen) { + if(c->key.paddr.addrlen < d->key.paddr.addrlen) + return -1; + return 1; + } + if((r=memcmp(&c->key.paddr.addr, &d->key.paddr.addr, + c->key.paddr.addrlen))!=0) + return r; + if(c->key.paddr.localaddrlen != d->key.paddr.localaddrlen) { + if(c->key.paddr.localaddrlen < d->key.paddr.localaddrlen) + return -1; + return 1; + } + if((r=memcmp(&c->key.paddr.localaddr, &d->key.paddr.localaddr, + c->key.paddr.localaddrlen))!=0) + return r; + if(c->key.paddr.ifindex != d->key.paddr.ifindex) { + if(c->key.paddr.ifindex < d->key.paddr.ifindex) + return -1; + return 1; + } + if(c->key.dcidlen != d->key.dcidlen) { + if(c->key.dcidlen < d->key.dcidlen) + return -1; + return 1; + } + if((r=memcmp(c->key.dcid, d->key.dcid, c->key.dcidlen))!=0) + return r; + return 0; +} + +int doq_conid_cmp(const void* key1, const void* key2) +{ + struct doq_conid* c = (struct doq_conid*)key1; + struct doq_conid* d = (struct doq_conid*)key2; + if(c->cidlen != d->cidlen) { + if(c->cidlen < d->cidlen) + return -1; + return 1; + } + return memcmp(c->cid, d->cid, c->cidlen); +} + +int doq_timer_cmp(const void* key1, const void* key2) +{ + struct doq_timer* e = (struct doq_timer*)key1; + struct doq_timer* f = (struct doq_timer*)key2; + if(e->time.tv_sec < f->time.tv_sec) + return -1; + if(e->time.tv_sec > f->time.tv_sec) + return 1; + if(e->time.tv_usec < f->time.tv_usec) + return -1; + if(e->time.tv_usec > f->time.tv_usec) + return 1; + return 0; +} + +int doq_stream_cmp(const void* key1, const void* key2) +{ + struct doq_stream* c = (struct 
doq_stream*)key1; + struct doq_stream* d = (struct doq_stream*)key2; + if(c->stream_id != d->stream_id) { + if(c->stream_id < d->stream_id) + return -1; + return 1; + } + return 0; +} + +/** doq store a local address in repinfo */ +static void +doq_repinfo_store_localaddr(struct comm_reply* repinfo, + struct doq_addr_storage* localaddr, socklen_t localaddrlen) +{ + /* use the pktinfo that we have for ancillary udp data otherwise, + * this saves space for a sockaddr */ + memset(&repinfo->pktinfo, 0, sizeof(repinfo->pktinfo)); + if(addr_is_ip6((void*)localaddr, localaddrlen)) { +#ifdef IPV6_PKTINFO + struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr; + memmove(&repinfo->pktinfo.v6info.ipi6_addr, + &sa6->sin6_addr, sizeof(struct in6_addr)); + repinfo->doq_srcport = sa6->sin6_port; +#endif + repinfo->srctype = 6; + } else { +#ifdef IP_PKTINFO + struct sockaddr_in* sa = (struct sockaddr_in*)localaddr; + memmove(&repinfo->pktinfo.v4info.ipi_addr, + &sa->sin_addr, sizeof(struct in_addr)); + repinfo->doq_srcport = sa->sin_port; +#elif defined(IP_RECVDSTADDR) + struct sockaddr_in* sa = (struct sockaddr_in*)localaddr; + memmove(&repinfo->pktinfo.v4addr, &sa->sin_addr, + sizeof(struct in_addr)); + repinfo->doq_srcport = sa->sin_port; +#endif + repinfo->srctype = 4; + } +} + +/** doq retrieve localaddr from repinfo */ +static void +doq_repinfo_retrieve_localaddr(struct comm_reply* repinfo, + struct doq_addr_storage* localaddr, socklen_t* localaddrlen) +{ + if(repinfo->srctype == 6) { +#ifdef IPV6_PKTINFO + struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr; + *localaddrlen = (socklen_t)sizeof(struct sockaddr_in6); + memset(sa6, 0, *localaddrlen); + sa6->sin6_family = AF_INET6; + memmove(&sa6->sin6_addr, &repinfo->pktinfo.v6info.ipi6_addr, + *localaddrlen); + sa6->sin6_port = repinfo->doq_srcport; +#endif + } else { +#ifdef IP_PKTINFO + struct sockaddr_in* sa = (struct sockaddr_in*)localaddr; + *localaddrlen = (socklen_t)sizeof(struct sockaddr_in); + memset(sa, 
0, *localaddrlen); + sa->sin_family = AF_INET; + memmove(&sa->sin_addr, &repinfo->pktinfo.v4info.ipi_addr, + *localaddrlen); + sa->sin_port = repinfo->doq_srcport; +#elif defined(IP_RECVDSTADDR) + struct sockaddr_in* sa = (struct sockaddr_in*)localaddr; + *localaddrlen = (socklen_t)sizeof(struct sockaddr_in); + memset(sa, 0, *localaddrlen); + sa->sin_family = AF_INET; + memmove(&sa->sin_addr, &repinfo->pktinfo.v4addr, + sizeof(struct in_addr)); + sa->sin_port = repinfo->doq_srcport; +#endif + } +} + +/** doq write a connection key into repinfo, false if it does not fit */ +static int +doq_conn_key_store_repinfo(struct doq_conn_key* key, + struct comm_reply* repinfo) +{ + repinfo->is_proxied = 0; + repinfo->doq_ifindex = key->paddr.ifindex; + repinfo->remote_addrlen = key->paddr.addrlen; + memmove(&repinfo->remote_addr, &key->paddr.addr, + repinfo->remote_addrlen); + repinfo->client_addrlen = key->paddr.addrlen; + memmove(&repinfo->client_addr, &key->paddr.addr, + repinfo->client_addrlen); + doq_repinfo_store_localaddr(repinfo, &key->paddr.localaddr, + key->paddr.localaddrlen); + if(key->dcidlen > sizeof(repinfo->doq_dcid)) + return 0; + repinfo->doq_dcidlen = key->dcidlen; + memmove(repinfo->doq_dcid, key->dcid, key->dcidlen); + return 1; +} + +void +doq_conn_key_from_repinfo(struct doq_conn_key* key, struct comm_reply* repinfo) +{ + key->paddr.ifindex = repinfo->doq_ifindex; + key->paddr.addrlen = repinfo->remote_addrlen; + memmove(&key->paddr.addr, &repinfo->remote_addr, + repinfo->remote_addrlen); + doq_repinfo_retrieve_localaddr(repinfo, &key->paddr.localaddr, + &key->paddr.localaddrlen); + key->dcidlen = repinfo->doq_dcidlen; + key->dcid = repinfo->doq_dcid; +} + +/** doq add a stream to the connection */ +static void +doq_conn_add_stream(struct doq_conn* conn, struct doq_stream* stream) +{ + (void)rbtree_insert(&conn->stream_tree, &stream->node); +} + +/** doq delete a stream from the connection */ +static void +doq_conn_del_stream(struct doq_conn* conn, 
struct doq_stream* stream) +{ + (void)rbtree_delete(&conn->stream_tree, &stream->node); +} + +/** doq create new stream */ +static struct doq_stream* +doq_stream_create(int64_t stream_id) +{ + struct doq_stream* stream = calloc(1, sizeof(*stream)); + if(!stream) + return NULL; + stream->node.key = stream; + stream->stream_id = stream_id; + return stream; +} + +void doq_stream_delete(struct doq_stream* stream) +{ + if(!stream) + return; + free(stream->in); + free(stream->out); + free(stream); +} + +struct doq_stream* +doq_stream_find(struct doq_conn* conn, int64_t stream_id) +{ + rbnode_type* node; + struct doq_stream key; + key.node.key = &key; + key.stream_id = stream_id; + node = rbtree_search(&conn->stream_tree, &key); + if(node) + return (struct doq_stream*)node->key; + return NULL; +} + +/** doq put stream on the conn write list */ +static void +doq_stream_on_write_list(struct doq_conn* conn, struct doq_stream* stream) +{ + if(stream->on_write_list) + return; + stream->write_prev = conn->stream_write_last; + if(conn->stream_write_last) + conn->stream_write_last->write_next = stream; + else + conn->stream_write_first = stream; + conn->stream_write_last = stream; + stream->write_next = NULL; + stream->on_write_list = 1; +} + +/** doq remove stream from the conn write list */ +static void +doq_stream_off_write_list(struct doq_conn* conn, struct doq_stream* stream) +{ + if(!stream->on_write_list) + return; + if(stream->write_next) + stream->write_next->write_prev = stream->write_prev; + else conn->stream_write_last = stream->write_prev; + if(stream->write_prev) + stream->write_prev->write_next = stream->write_next; + else conn->stream_write_first = stream->write_next; + stream->write_prev = NULL; + stream->write_next = NULL; + stream->on_write_list = 0; +} + +/** doq stream remove in buffer */ +static void +doq_stream_remove_in_buffer(struct doq_stream* stream, struct doq_table* table) +{ + if(stream->in) { + doq_table_quic_size_subtract(table, stream->inlen); + 
free(stream->in); + stream->in = NULL; + stream->inlen = 0; + } +} + +/** doq stream remove out buffer */ +static void +doq_stream_remove_out_buffer(struct doq_stream* stream, + struct doq_table* table) +{ + if(stream->out) { + doq_table_quic_size_subtract(table, stream->outlen); + free(stream->out); + stream->out = NULL; + stream->outlen = 0; + } +} + +int +doq_stream_close(struct doq_conn* conn, struct doq_stream* stream, + int send_shutdown) +{ + int ret; + if(stream->is_closed) + return 1; + stream->is_closed = 1; + doq_stream_off_write_list(conn, stream); + if(send_shutdown) { + verbose(VERB_ALGO, "doq: shutdown stream_id %d with app_error_code %d", + (int)stream->stream_id, (int)DOQ_APP_ERROR_CODE); + ret = ngtcp2_conn_shutdown_stream(conn->conn, +#ifdef HAVE_NGTCP2_CONN_SHUTDOWN_STREAM4 + 0, +#endif + stream->stream_id, DOQ_APP_ERROR_CODE); + if(ret != 0) { + log_err("doq ngtcp2_conn_shutdown_stream %d failed: %s", + (int)stream->stream_id, ngtcp2_strerror(ret)); + return 0; + } + doq_conn_write_enable(conn); + } + verbose(VERB_ALGO, "doq: conn extend max streams bidi by 1"); + ngtcp2_conn_extend_max_streams_bidi(conn->conn, 1); + doq_conn_write_enable(conn); + doq_stream_remove_in_buffer(stream, conn->doq_socket->table); + doq_stream_remove_out_buffer(stream, conn->doq_socket->table); + doq_table_quic_size_subtract(conn->doq_socket->table, sizeof(*stream)); + doq_conn_del_stream(conn, stream); + doq_stream_delete(stream); + return 1; +} + +/** doq stream pick up answer data from buffer */ +static int +doq_stream_pickup_answer(struct doq_stream* stream, struct sldns_buffer* buf) +{ + stream->is_answer_available = 1; + if(stream->out) { + free(stream->out); + stream->out = NULL; + stream->outlen = 0; + } + stream->nwrite = 0; + stream->outlen = sldns_buffer_limit(buf); + /* For quic the output bytes have to stay allocated and available, + * for potential resends, until the remote end has acknowledged them. 
+ * This includes the tcplen start uint16_t, in outlen_wire. */ + stream->outlen_wire = htons(stream->outlen); + stream->out = memdup(sldns_buffer_begin(buf), sldns_buffer_limit(buf)); + if(!stream->out) { + log_err("doq could not send answer: out of memory"); + return 0; + } + return 1; +} + +int +doq_stream_send_reply(struct doq_conn* conn, struct doq_stream* stream, + struct sldns_buffer* buf) +{ + if(verbosity >= VERB_ALGO) { + char* s = sldns_wire2str_pkt(sldns_buffer_begin(buf), + sldns_buffer_limit(buf)); + verbose(VERB_ALGO, "doq stream %d response\n%s", + (int)stream->stream_id, (s?s:"null")); + free(s); + } + if(stream->out) + doq_table_quic_size_subtract(conn->doq_socket->table, + stream->outlen); + if(!doq_stream_pickup_answer(stream, buf)) + return 0; + doq_table_quic_size_add(conn->doq_socket->table, stream->outlen); + doq_stream_on_write_list(conn, stream); + doq_conn_write_enable(conn); + return 1; +} + +/** doq stream data length has completed, allocations can be done. False on + * allocation failure. */ +static int +doq_stream_datalen_complete(struct doq_stream* stream, struct doq_table* table) +{ + if(stream->inlen > 1024*1024) { + log_err("doq stream in length too large %d", + (int)stream->inlen); + return 0; + } + stream->in = calloc(1, stream->inlen); + if(!stream->in) { + log_err("doq could not read stream, calloc failed: " + "out of memory"); + return 0; + } + doq_table_quic_size_add(table, stream->inlen); + return 1; +} + +/** doq stream data is complete, the input data has been received. 
*/ +static int +doq_stream_data_complete(struct doq_conn* conn, struct doq_stream* stream) +{ + struct comm_point* c; + if(verbosity >= VERB_ALGO) { + char* s = sldns_wire2str_pkt(stream->in, stream->inlen); + char a[128]; + addr_to_str((void*)&conn->key.paddr.addr, + conn->key.paddr.addrlen, a, sizeof(a)); + verbose(VERB_ALGO, "doq %s stream %d incoming query\n%s", + a, (int)stream->stream_id, (s?s:"null")); + free(s); + } + stream->is_query_complete = 1; + c = conn->doq_socket->cp; + if(!stream->in) { + verbose(VERB_ALGO, "doq_stream_data_complete: no in buffer"); + return 0; + } + if(stream->inlen > sldns_buffer_capacity(c->buffer)) { + verbose(VERB_ALGO, "doq_stream_data_complete: query too long"); + return 0; + } + sldns_buffer_clear(c->buffer); + sldns_buffer_write(c->buffer, stream->in, stream->inlen); + sldns_buffer_flip(c->buffer); + c->repinfo.c = c; + if(!doq_conn_key_store_repinfo(&conn->key, &c->repinfo)) { + verbose(VERB_ALGO, "doq_stream_data_complete: connection " + "DCID too long"); + return 0; + } + c->repinfo.doq_streamid = stream->stream_id; + conn->doq_socket->current_conn = conn; + fptr_ok(fptr_whitelist_comm_point(c->callback)); + if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo)) { + conn->doq_socket->current_conn = NULL; + if(!doq_stream_send_reply(conn, stream, c->buffer)) { + verbose(VERB_ALGO, "doq: failed to send_reply"); + return 0; + } + return 1; + } + conn->doq_socket->current_conn = NULL; + return 1; +} + +/** doq receive data for a stream, more bytes of the incoming data */ +static int +doq_stream_recv_data(struct doq_stream* stream, const uint8_t* data, + size_t datalen, int* recv_done, struct doq_table* table) +{ + int got_data = 0; + /* read the tcplength uint16_t at the start */ + if(stream->nread < 2) { + uint16_t tcplen = 0; + size_t todolen = 2 - stream->nread; + + if(stream->nread > 0) { + /* put in the already read byte if there is one */ + tcplen = stream->inlen; + } + if(datalen < todolen) + todolen = 
datalen; + memmove(((uint8_t*)&tcplen)+stream->nread, data, todolen); + stream->nread += todolen; + data += todolen; + datalen -= todolen; + if(stream->nread == 2) { + /* the initial length value is completed */ + stream->inlen = ntohs(tcplen); + if(!doq_stream_datalen_complete(stream, table)) + return 0; + } else { + /* store for later */ + stream->inlen = tcplen; + return 1; + } + } + /* if there are more data bytes */ + if(datalen > 0) { + size_t to_write = datalen; + if(stream->nread-2 > stream->inlen) { + verbose(VERB_ALGO, "doq stream buffer too small"); + return 0; + } + if(datalen > stream->inlen - (stream->nread-2)) + to_write = stream->inlen - (stream->nread-2); + if(to_write > 0) { + if(!stream->in) { + verbose(VERB_ALGO, "doq: stream has " + "no buffer"); + return 0; + } + memmove(stream->in+(stream->nread-2), data, to_write); + stream->nread += to_write; + data += to_write; + datalen -= to_write; + got_data = 1; + } + } + /* Are there extra bytes received after the end? If so, log them. */ + if(datalen > 0) { + if(verbosity >= VERB_ALGO) + log_hex("doq stream has extra bytes received after end", + (void*)data, datalen); + } + /* Is the input data complete? */ + if(got_data && stream->nread >= stream->inlen+2) { + if(!stream->in) { + verbose(VERB_ALGO, "doq: completed stream has " + "no buffer"); + return 0; + } + *recv_done = 1; + } + return 1; +} + +/** doq receive FIN for a stream. No more bytes are going to arrive. 
*/ +static int +doq_stream_recv_fin(struct doq_conn* conn, struct doq_stream* stream, int + recv_done) +{ + if(!stream->is_query_complete && !recv_done) { + verbose(VERB_ALGO, "doq: stream recv FIN, but is " + "not complete, have %d of %d bytes", + ((int)stream->nread)-2, (int)stream->inlen); + if(!doq_stream_close(conn, stream, 1)) + return 0; + } + return 1; +} + +void doq_fill_rand(struct ub_randstate* rnd, uint8_t* buf, size_t len) +{ + size_t i; + for(i=0; i<len; i++) + buf[i] = ub_random(rnd)&0xff; +} + +/** generate new connection id, checks for duplicates. + * caller must hold lock on conid tree. */ +static int +doq_conn_generate_new_conid(struct doq_conn* conn, uint8_t* data, + size_t datalen) +{ + int max_try = 100; + int i; + for(i=0; i<max_try; i++) { + doq_fill_rand(conn->doq_socket->rnd, data, datalen); + if(!doq_conid_find(conn->table, data, datalen)) { + /* Found an unused connection id. */ + return 1; + } + } + verbose(VERB_ALGO, "doq_conn_generate_new_conid failed: could not " + "generate random unused connection id value in %d attempts.", + max_try); + return 0; +} + +/** ngtcp2 rand callback function */ +static void +doq_rand_cb(uint8_t* dest, size_t destlen, const ngtcp2_rand_ctx* rand_ctx) +{ + struct ub_randstate* rnd = (struct ub_randstate*) + rand_ctx->native_handle; + doq_fill_rand(rnd, dest, destlen); +} + +/** ngtcp2 get_new_connection_id callback function */ +static int +doq_get_new_connection_id_cb(ngtcp2_conn* ATTR_UNUSED(conn), ngtcp2_cid* cid, + uint8_t* token, size_t cidlen, void* user_data) +{ + struct doq_conn* doq_conn = (struct doq_conn*)user_data; + /* Lock the conid tree, so we can check for duplicates while + * generating the id, and then insert it, whilst keeping the tree + * locked against other modifications, guaranteeing uniqueness. 
*/ + lock_rw_wrlock(&doq_conn->table->conid_lock); + if(!doq_conn_generate_new_conid(doq_conn, cid->data, cidlen)) { + lock_rw_unlock(&doq_conn->table->conid_lock); + return NGTCP2_ERR_CALLBACK_FAILURE; + } + cid->datalen = cidlen; + if(ngtcp2_crypto_generate_stateless_reset_token(token, + doq_conn->doq_socket->static_secret, + doq_conn->doq_socket->static_secret_len, cid) != 0) { + lock_rw_unlock(&doq_conn->table->conid_lock); + return NGTCP2_ERR_CALLBACK_FAILURE; + } + if(!doq_conn_associate_conid(doq_conn, cid->data, cid->datalen)) { + lock_rw_unlock(&doq_conn->table->conid_lock); + return NGTCP2_ERR_CALLBACK_FAILURE; + } + lock_rw_unlock(&doq_conn->table->conid_lock); + return 0; +} + +/** ngtcp2 remove_connection_id callback function */ +static int +doq_remove_connection_id_cb(ngtcp2_conn* ATTR_UNUSED(conn), + const ngtcp2_cid* cid, void* user_data) +{ + struct doq_conn* doq_conn = (struct doq_conn*)user_data; + lock_rw_wrlock(&doq_conn->table->conid_lock); + doq_conn_dissociate_conid(doq_conn, cid->data, cid->datalen); + lock_rw_unlock(&doq_conn->table->conid_lock); + return 0; +} + +/** doq submit a new token */ +static int +doq_submit_new_token(struct doq_conn* conn) +{ + uint8_t token[NGTCP2_CRYPTO_MAX_REGULAR_TOKENLEN]; + ngtcp2_ssize tokenlen; + int ret; + const ngtcp2_path* path = ngtcp2_conn_get_path(conn->conn); + ngtcp2_tstamp ts = doq_get_timestamp_nanosec(); + + tokenlen = ngtcp2_crypto_generate_regular_token(token, + conn->doq_socket->static_secret, + conn->doq_socket->static_secret_len, path->remote.addr, + path->remote.addrlen, ts); + if(tokenlen < 0) { + log_err("doq ngtcp2_crypto_generate_regular_token failed"); + return 1; + } + + verbose(VERB_ALGO, "doq submit new token"); + ret = ngtcp2_conn_submit_new_token(conn->conn, token, tokenlen); + if(ret != 0) { + log_err("doq ngtcp2_conn_submit_new_token failed: %s", + ngtcp2_strerror(ret)); + return 0; + } + return 1; +} + +/** ngtcp2 handshake_completed callback function */ +static int 
+doq_handshake_completed_cb(ngtcp2_conn* ATTR_UNUSED(conn), void* user_data) +{ + struct doq_conn* doq_conn = (struct doq_conn*)user_data; + verbose(VERB_ALGO, "doq handshake_completed callback"); + verbose(VERB_ALGO, "ngtcp2_conn_get_max_data_left is %d", + (int)ngtcp2_conn_get_max_data_left(doq_conn->conn)); +#ifdef HAVE_NGTCP2_CONN_GET_MAX_LOCAL_STREAMS_UNI + verbose(VERB_ALGO, "ngtcp2_conn_get_max_local_streams_uni is %d", + (int)ngtcp2_conn_get_max_local_streams_uni(doq_conn->conn)); +#endif + verbose(VERB_ALGO, "ngtcp2_conn_get_streams_uni_left is %d", + (int)ngtcp2_conn_get_streams_uni_left(doq_conn->conn)); + verbose(VERB_ALGO, "ngtcp2_conn_get_streams_bidi_left is %d", + (int)ngtcp2_conn_get_streams_bidi_left(doq_conn->conn)); + verbose(VERB_ALGO, "negotiated cipher name is %s", + SSL_get_cipher_name(doq_conn->ssl)); + if(verbosity > VERB_ALGO) { + const unsigned char* alpn = NULL; + unsigned int alpnlen = 0; + char alpnstr[128]; + SSL_get0_alpn_selected(doq_conn->ssl, &alpn, &alpnlen); + if(alpnlen > sizeof(alpnstr)-1) + alpnlen = sizeof(alpnstr)-1; + memmove(alpnstr, alpn, alpnlen); + alpnstr[alpnlen]=0; + verbose(VERB_ALGO, "negotiated ALPN is '%s'", alpnstr); + } + + if(!doq_submit_new_token(doq_conn)) + return -1; + return 0; +} + +/** ngtcp2 stream_open callback function */ +static int +doq_stream_open_cb(ngtcp2_conn* ATTR_UNUSED(conn), int64_t stream_id, + void* user_data) +{ + struct doq_conn* doq_conn = (struct doq_conn*)user_data; + struct doq_stream* stream; + verbose(VERB_ALGO, "doq new stream %x", (int)stream_id); + if(doq_stream_find(doq_conn, stream_id)) { + verbose(VERB_ALGO, "doq: stream with this id already exists"); + return 0; + } + if(stream_id != 0 && stream_id != 4 && /* allow one stream on a new connection */ + !doq_table_quic_size_available(doq_conn->doq_socket->table, + doq_conn->doq_socket->cfg, sizeof(*stream) + + 100 /* estimated query in */ + + 512 /* estimated response out */ + )) { + int rv; + verbose(VERB_ALGO, "doq: no mem 
for new stream"); + rv = ngtcp2_conn_shutdown_stream(doq_conn->conn, +#ifdef HAVE_NGTCP2_CONN_SHUTDOWN_STREAM4 + 0, +#endif + stream_id, NGTCP2_CONNECTION_REFUSED); + if(rv != 0) { + log_err("ngtcp2_conn_shutdown_stream failed: %s", + ngtcp2_strerror(rv)); + return NGTCP2_ERR_CALLBACK_FAILURE; + } + return 0; + } + stream = doq_stream_create(stream_id); + if(!stream) { + log_err("doq: could not doq_stream_create: out of memory"); + return NGTCP2_ERR_CALLBACK_FAILURE; + } + doq_table_quic_size_add(doq_conn->doq_socket->table, sizeof(*stream)); + doq_conn_add_stream(doq_conn, stream); + return 0; +} + +/** ngtcp2 recv_stream_data callback function */ +static int +doq_recv_stream_data_cb(ngtcp2_conn* ATTR_UNUSED(conn), uint32_t flags, + int64_t stream_id, uint64_t offset, const uint8_t* data, + size_t datalen, void* user_data, void* ATTR_UNUSED(stream_user_data)) +{ + int recv_done = 0; + struct doq_conn* doq_conn = (struct doq_conn*)user_data; + struct doq_stream* stream; + verbose(VERB_ALGO, "doq recv stream data stream id %d offset %d " + "datalen %d%s%s", (int)stream_id, (int)offset, (int)datalen, + ((flags&NGTCP2_STREAM_DATA_FLAG_FIN)!=0?" FIN":""), +#ifdef NGTCP2_STREAM_DATA_FLAG_0RTT + ((flags&NGTCP2_STREAM_DATA_FLAG_0RTT)!=0?" 0RTT":"") +#else + ((flags&NGTCP2_STREAM_DATA_FLAG_EARLY)!=0?" 
EARLY":"") +#endif + ); + stream = doq_stream_find(doq_conn, stream_id); + if(!stream) { + verbose(VERB_ALGO, "doq: received stream data for " + "unknown stream %d", (int)stream_id); + return 0; + } + if(stream->is_closed) { + verbose(VERB_ALGO, "doq: stream is closed, ignore recv data"); + return 0; + } + if(datalen != 0) { + if(!doq_stream_recv_data(stream, data, datalen, &recv_done, + doq_conn->doq_socket->table)) + return NGTCP2_ERR_CALLBACK_FAILURE; + } + if((flags&NGTCP2_STREAM_DATA_FLAG_FIN)!=0) { + if(!doq_stream_recv_fin(doq_conn, stream, recv_done)) + return NGTCP2_ERR_CALLBACK_FAILURE; + } + ngtcp2_conn_extend_max_stream_offset(doq_conn->conn, stream_id, + datalen); + ngtcp2_conn_extend_max_offset(doq_conn->conn, datalen); + if(recv_done) { + if(!doq_stream_data_complete(doq_conn, stream)) + return NGTCP2_ERR_CALLBACK_FAILURE; + } + return 0; +} + +/** ngtcp2 stream_close callback function */ +static int +doq_stream_close_cb(ngtcp2_conn* ATTR_UNUSED(conn), uint32_t flags, + int64_t stream_id, uint64_t app_error_code, void* user_data, + void* ATTR_UNUSED(stream_user_data)) +{ + struct doq_conn* doq_conn = (struct doq_conn*)user_data; + struct doq_stream* stream; + if((flags&NGTCP2_STREAM_CLOSE_FLAG_APP_ERROR_CODE_SET)!=0) + verbose(VERB_ALGO, "doq stream close for stream id %d %sapp_error_code %d", + (int)stream_id, + (((flags&NGTCP2_STREAM_CLOSE_FLAG_APP_ERROR_CODE_SET)!=0)? 
+ "APP_ERROR_CODE_SET ":""), + (int)app_error_code); + else + verbose(VERB_ALGO, "doq stream close for stream id %d", + (int)stream_id); + + stream = doq_stream_find(doq_conn, stream_id); + if(!stream) { + verbose(VERB_ALGO, "doq: stream close for " + "unknown stream %d", (int)stream_id); + return 0; + } + if(!doq_stream_close(doq_conn, stream, 0)) + return NGTCP2_ERR_CALLBACK_FAILURE; + return 0; +} + +/** ngtcp2 stream_reset callback function */ +static int +doq_stream_reset_cb(ngtcp2_conn* ATTR_UNUSED(conn), int64_t stream_id, + uint64_t final_size, uint64_t app_error_code, void* user_data, + void* ATTR_UNUSED(stream_user_data)) +{ + struct doq_conn* doq_conn = (struct doq_conn*)user_data; + struct doq_stream* stream; + verbose(VERB_ALGO, "doq stream reset for stream id %d final_size %d " + "app_error_code %d", (int)stream_id, (int)final_size, + (int)app_error_code); + + stream = doq_stream_find(doq_conn, stream_id); + if(!stream) { + verbose(VERB_ALGO, "doq: stream reset for " + "unknown stream %d", (int)stream_id); + return 0; + } + if(!doq_stream_close(doq_conn, stream, 0)) + return NGTCP2_ERR_CALLBACK_FAILURE; + return 0; +} + +/** ngtcp2 acked_stream_data_offset callback function */ +static int +doq_acked_stream_data_offset_cb(ngtcp2_conn* ATTR_UNUSED(conn), + int64_t stream_id, uint64_t offset, uint64_t datalen, void* user_data, + void* ATTR_UNUSED(stream_user_data)) +{ + struct doq_conn* doq_conn = (struct doq_conn*)user_data; + struct doq_stream* stream; + verbose(VERB_ALGO, "doq stream acked data for stream id %d offset %d " + "datalen %d", (int)stream_id, (int)offset, (int)datalen); + + stream = doq_stream_find(doq_conn, stream_id); + if(!stream) { + verbose(VERB_ALGO, "doq: stream acked data for " + "unknown stream %d", (int)stream_id); + return 0; + } + /* Acked the data from [offset .. offset+datalen). 
*/ + if(stream->is_closed) + return 0; + if(offset+datalen >= stream->outlen) { + doq_stream_remove_in_buffer(stream, + doq_conn->doq_socket->table); + doq_stream_remove_out_buffer(stream, + doq_conn->doq_socket->table); + } + return 0; +} + +/** ngtc2p log_printf callback function */ +static void +doq_log_printf_cb(void* ATTR_UNUSED(user_data), const char* fmt, ...) +{ + char buf[1024]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + verbose(VERB_ALGO, "libngtcp2: %s", buf); + va_end(ap); +} + +#ifndef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT +/** the doq application tx key callback, false on failure */ +static int +doq_application_tx_key_cb(struct doq_conn* conn) +{ + verbose(VERB_ALGO, "doq application tx key cb"); + /* The server does not want to open streams to the client, + * the client instead initiates by opening bidi streams. */ + verbose(VERB_ALGO, "doq ngtcp2_conn_get_max_data_left is %d", + (int)ngtcp2_conn_get_max_data_left(conn->conn)); +#ifdef HAVE_NGTCP2_CONN_GET_MAX_LOCAL_STREAMS_UNI + verbose(VERB_ALGO, "doq ngtcp2_conn_get_max_local_streams_uni is %d", + (int)ngtcp2_conn_get_max_local_streams_uni(conn->conn)); +#endif + verbose(VERB_ALGO, "doq ngtcp2_conn_get_streams_uni_left is %d", + (int)ngtcp2_conn_get_streams_uni_left(conn->conn)); + verbose(VERB_ALGO, "doq ngtcp2_conn_get_streams_bidi_left is %d", + (int)ngtcp2_conn_get_streams_bidi_left(conn->conn)); + return 1; +} + +/** quic_method set_encryption_secrets function */ +static int +doq_set_encryption_secrets(SSL *ssl, OSSL_ENCRYPTION_LEVEL ossl_level, + const uint8_t *read_secret, const uint8_t *write_secret, + size_t secret_len) +{ + struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl); +#ifdef HAVE_NGTCP2_ENCRYPTION_LEVEL + ngtcp2_encryption_level +#else + ngtcp2_crypto_level +#endif + level = +#ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_FROM_OSSL_ENCRYPTION_LEVEL + ngtcp2_crypto_quictls_from_ossl_encryption_level(ossl_level); +#else + 
ngtcp2_crypto_openssl_from_ossl_encryption_level(ossl_level); +#endif + + if(read_secret) { + verbose(VERB_ALGO, "doq: ngtcp2_crypto_derive_and_install_rx_key for level %d ossl %d", (int)level, (int)ossl_level); + if(ngtcp2_crypto_derive_and_install_rx_key(doq_conn->conn, + NULL, NULL, NULL, level, read_secret, secret_len) + != 0) { + log_err("ngtcp2_crypto_derive_and_install_rx_key " + "failed"); + return 0; + } + } + + if(write_secret) { + verbose(VERB_ALGO, "doq: ngtcp2_crypto_derive_and_install_tx_key for level %d ossl %d", (int)level, (int)ossl_level); + if(ngtcp2_crypto_derive_and_install_tx_key(doq_conn->conn, + NULL, NULL, NULL, level, write_secret, secret_len) + != 0) { + log_err("ngtcp2_crypto_derive_and_install_tx_key " + "failed"); + return 0; + } + if(level == NGTCP2_CRYPTO_LEVEL_APPLICATION) { + if(!doq_application_tx_key_cb(doq_conn)) + return 0; + } + } + return 1; +} + +/** quic_method add_handshake_data function */ +static int +doq_add_handshake_data(SSL *ssl, OSSL_ENCRYPTION_LEVEL ossl_level, + const uint8_t *data, size_t len) +{ + struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl); +#ifdef HAVE_NGTCP2_ENCRYPTION_LEVEL + ngtcp2_encryption_level +#else + ngtcp2_crypto_level +#endif + level = +#ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_FROM_OSSL_ENCRYPTION_LEVEL + ngtcp2_crypto_quictls_from_ossl_encryption_level(ossl_level); +#else + ngtcp2_crypto_openssl_from_ossl_encryption_level(ossl_level); +#endif + int rv; + + verbose(VERB_ALGO, "doq_add_handshake_data: " + "ngtcp2_con_submit_crypto_data level %d", (int)level); + rv = ngtcp2_conn_submit_crypto_data(doq_conn->conn, level, data, len); + if(rv != 0) { + log_err("ngtcp2_conn_submit_crypto_data failed: %s", + ngtcp2_strerror(rv)); + ngtcp2_conn_set_tls_error(doq_conn->conn, rv); + return 0; + } + return 1; +} + +/** quic_method flush_flight function */ +static int +doq_flush_flight(SSL* ATTR_UNUSED(ssl)) +{ + return 1; +} + +/** quic_method send_alert function */ +static int 
+doq_send_alert(SSL *ssl, enum ssl_encryption_level_t ATTR_UNUSED(level), + uint8_t alert) +{ + struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl); + doq_conn->tls_alert = alert; + return 1; +} +#endif /* HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT */ + +/** ALPN select callback for the doq SSL context */ +static int +doq_alpn_select_cb(SSL* ATTR_UNUSED(ssl), const unsigned char** out, + unsigned char* outlen, const unsigned char* in, unsigned int inlen, + void* ATTR_UNUSED(arg)) +{ + /* select "doq" */ + int ret = SSL_select_next_proto((void*)out, outlen, + (const unsigned char*)"\x03""doq", 4, in, inlen); + if(ret == OPENSSL_NPN_NEGOTIATED) + return SSL_TLSEXT_ERR_OK; + verbose(VERB_ALGO, "doq alpn_select_cb: ALPN from client does " + "not have 'doq'"); + return SSL_TLSEXT_ERR_ALERT_FATAL; +} + +/** create new tls session for server doq connection */ +static SSL_CTX* +doq_ctx_server_setup(struct doq_server_socket* doq_socket) +{ + char* sid_ctx = "unbound server"; +#ifndef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT + SSL_QUIC_METHOD* quic_method; +#endif + SSL_CTX* ctx = SSL_CTX_new(TLS_server_method()); + if(!ctx) { + log_crypto_err("Could not SSL_CTX_new"); + return NULL; + } + SSL_CTX_set_options(ctx, + (SSL_OP_ALL & ~SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS) | + SSL_OP_SINGLE_ECDH_USE | + SSL_OP_CIPHER_SERVER_PREFERENCE | + SSL_OP_NO_ANTI_REPLAY); + SSL_CTX_set_mode(ctx, SSL_MODE_RELEASE_BUFFERS); + SSL_CTX_set_min_proto_version(ctx, TLS1_3_VERSION); + SSL_CTX_set_max_proto_version(ctx, TLS1_3_VERSION); +#ifdef HAVE_SSL_CTX_SET_ALPN_SELECT_CB + SSL_CTX_set_alpn_select_cb(ctx, doq_alpn_select_cb, NULL); +#endif + SSL_CTX_set_default_verify_paths(ctx); + if(!SSL_CTX_use_certificate_chain_file(ctx, + doq_socket->ssl_service_pem)) { + log_err("doq: error for cert file: %s", + doq_socket->ssl_service_pem); + log_crypto_err("doq: error in " + "SSL_CTX_use_certificate_chain_file"); + SSL_CTX_free(ctx); + return NULL; + } + 
if(!SSL_CTX_use_PrivateKey_file(ctx, doq_socket->ssl_service_key, + SSL_FILETYPE_PEM)) { + log_err("doq: error for private key file: %s", + doq_socket->ssl_service_key); + log_crypto_err("doq: error in SSL_CTX_use_PrivateKey_file"); + SSL_CTX_free(ctx); + return NULL; + } + if(!SSL_CTX_check_private_key(ctx)) { + log_err("doq: error for key file: %s", + doq_socket->ssl_service_key); + log_crypto_err("doq: error in SSL_CTX_check_private_key"); + SSL_CTX_free(ctx); + return NULL; + } + SSL_CTX_set_session_id_context(ctx, (void*)sid_ctx, strlen(sid_ctx)); + if(doq_socket->ssl_verify_pem && doq_socket->ssl_verify_pem[0]) { + if(!SSL_CTX_load_verify_locations(ctx, + doq_socket->ssl_verify_pem, NULL)) { + log_err("doq: error for verify pem file: %s", + doq_socket->ssl_verify_pem); + log_crypto_err("doq: error in " + "SSL_CTX_load_verify_locations"); + SSL_CTX_free(ctx); + return NULL; + } + SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file( + doq_socket->ssl_verify_pem)); + SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER| + SSL_VERIFY_CLIENT_ONCE| + SSL_VERIFY_FAIL_IF_NO_PEER_CERT, NULL); + } + + SSL_CTX_set_max_early_data(ctx, 0xffffffff); +#ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT + if(ngtcp2_crypto_quictls_configure_server_context(ctx) != 0) { + log_err("ngtcp2_crypto_quictls_configure_server_context failed"); + SSL_CTX_free(ctx); + return NULL; + } +#else + /* The quic_method needs to remain valid during the SSL_CTX + * lifetime, so we allocate it. It is freed with the + * doq_server_socket. 
*/ + quic_method = calloc(1, sizeof(SSL_QUIC_METHOD)); + if(!quic_method) { + log_err("calloc failed: out of memory"); + SSL_CTX_free(ctx); + return NULL; + } + doq_socket->quic_method = quic_method; + quic_method->set_encryption_secrets = doq_set_encryption_secrets; + quic_method->add_handshake_data = doq_add_handshake_data; + quic_method->flush_flight = doq_flush_flight; + quic_method->send_alert = doq_send_alert; + SSL_CTX_set_quic_method(ctx, doq_socket->quic_method); +#endif + return ctx; +} + +/** Get the ngtcp2_conn from ssl userdata of type ngtcp2_conn_ref */ +static ngtcp2_conn* doq_conn_ref_get_conn(ngtcp2_crypto_conn_ref* conn_ref) +{ + struct doq_conn* conn = (struct doq_conn*)conn_ref->user_data; + return conn->conn; +} + +/** create new SSL session for server connection */ +static SSL* +doq_ssl_server_setup(SSL_CTX* ctx, struct doq_conn* conn) +{ + SSL* ssl = SSL_new(ctx); + if(!ssl) { + log_crypto_err("doq: SSL_new failed"); + return NULL; + } +#ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT + conn->conn_ref.get_conn = &doq_conn_ref_get_conn; + conn->conn_ref.user_data = conn; + SSL_set_app_data(ssl, &conn->conn_ref); +#else + SSL_set_app_data(ssl, conn); +#endif + SSL_set_accept_state(ssl); + SSL_set_quic_early_data_enabled(ssl, 1); + return ssl; +} + +/** setup the doq_socket server tls context */ +int +doq_socket_setup_ctx(struct doq_server_socket* doq_socket) +{ + doq_socket->ctx = doq_ctx_server_setup(doq_socket); + if(!doq_socket->ctx) + return 0; + return 1; +} + +int +doq_conn_setup(struct doq_conn* conn, uint8_t* scid, size_t scidlen, + uint8_t* ocid, size_t ocidlen, const uint8_t* token, size_t tokenlen) +{ + int rv; + struct ngtcp2_cid dcid, sv_scid, scid_cid; + struct ngtcp2_path path; + struct ngtcp2_callbacks callbacks; + struct ngtcp2_settings settings; + struct ngtcp2_transport_params params; + memset(&dcid, 0, sizeof(dcid)); + memset(&sv_scid, 0, sizeof(sv_scid)); + memset(&scid_cid, 0, sizeof(scid_cid)); + memset(&path, 
0, sizeof(path)); + memset(&callbacks, 0, sizeof(callbacks)); + memset(&settings, 0, sizeof(settings)); + memset(&params, 0, sizeof(params)); + + ngtcp2_cid_init(&scid_cid, scid, scidlen); + ngtcp2_cid_init(&dcid, conn->key.dcid, conn->key.dcidlen); + + path.remote.addr = (struct sockaddr*)&conn->key.paddr.addr; + path.remote.addrlen = conn->key.paddr.addrlen; + path.local.addr = (struct sockaddr*)&conn->key.paddr.localaddr; + path.local.addrlen = conn->key.paddr.localaddrlen; + + callbacks.recv_client_initial = ngtcp2_crypto_recv_client_initial_cb; + callbacks.recv_crypto_data = ngtcp2_crypto_recv_crypto_data_cb; + callbacks.encrypt = ngtcp2_crypto_encrypt_cb; + callbacks.decrypt = ngtcp2_crypto_decrypt_cb; + callbacks.hp_mask = ngtcp2_crypto_hp_mask; + callbacks.update_key = ngtcp2_crypto_update_key_cb; + callbacks.delete_crypto_aead_ctx = + ngtcp2_crypto_delete_crypto_aead_ctx_cb; + callbacks.delete_crypto_cipher_ctx = + ngtcp2_crypto_delete_crypto_cipher_ctx_cb; + callbacks.get_path_challenge_data = + ngtcp2_crypto_get_path_challenge_data_cb; + callbacks.version_negotiation = ngtcp2_crypto_version_negotiation_cb; + callbacks.rand = doq_rand_cb; + callbacks.get_new_connection_id = doq_get_new_connection_id_cb; + callbacks.remove_connection_id = doq_remove_connection_id_cb; + callbacks.handshake_completed = doq_handshake_completed_cb; + callbacks.stream_open = doq_stream_open_cb; + callbacks.stream_close = doq_stream_close_cb; + callbacks.stream_reset = doq_stream_reset_cb; + callbacks.acked_stream_data_offset = doq_acked_stream_data_offset_cb; + callbacks.recv_stream_data = doq_recv_stream_data_cb; + + ngtcp2_settings_default(&settings); + if(verbosity >= VERB_ALGO) { + settings.log_printf = doq_log_printf_cb; + } + settings.rand_ctx.native_handle = conn->doq_socket->rnd; + settings.initial_ts = doq_get_timestamp_nanosec(); + settings.max_stream_window = 6*1024*1024; + settings.max_window = 6*1024*1024; +#ifdef HAVE_STRUCT_NGTCP2_SETTINGS_TOKENLEN + settings.token = 
(void*)token; + settings.tokenlen = tokenlen; +#else + settings.token.base = (void*)token; + settings.token.len = tokenlen; +#endif + + ngtcp2_transport_params_default(&params); + params.max_idle_timeout = conn->doq_socket->idle_timeout; + params.active_connection_id_limit = 7; + params.initial_max_stream_data_bidi_local = 256*1024; + params.initial_max_stream_data_bidi_remote = 256*1024; + params.initial_max_data = 1024*1024; + /* DoQ uses bidi streams, so we allow 0 uni streams. */ + params.initial_max_streams_uni = 0; + /* Initial max on number of bidi streams the remote end can open. + * That is the number of queries it can make, at first. */ + params.initial_max_streams_bidi = 10; + if(ocid) { + ngtcp2_cid_init(&params.original_dcid, ocid, ocidlen); + ngtcp2_cid_init(&params.retry_scid, conn->key.dcid, + conn->key.dcidlen); + params.retry_scid_present = 1; + } else { + ngtcp2_cid_init(&params.original_dcid, conn->key.dcid, + conn->key.dcidlen); + } +#ifdef HAVE_STRUCT_NGTCP2_TRANSPORT_PARAMS_ORIGINAL_DCID_PRESENT + params.original_dcid_present = 1; +#endif + doq_fill_rand(conn->doq_socket->rnd, params.stateless_reset_token, + sizeof(params.stateless_reset_token)); + sv_scid.datalen = conn->doq_socket->sv_scidlen; + lock_rw_wrlock(&conn->table->conid_lock); + if(!doq_conn_generate_new_conid(conn, sv_scid.data, sv_scid.datalen)) { + lock_rw_unlock(&conn->table->conid_lock); + return 0; + } + + rv = ngtcp2_conn_server_new(&conn->conn, &scid_cid, &sv_scid, &path, + conn->version, &callbacks, &settings, &params, NULL, conn); + if(rv != 0) { + lock_rw_unlock(&conn->table->conid_lock); + log_err("ngtcp2_conn_server_new failed: %s", + ngtcp2_strerror(rv)); + return 0; + } + if(!doq_conn_setup_conids(conn)) { + lock_rw_unlock(&conn->table->conid_lock); + log_err("doq_conn_setup_conids failed: out of memory"); + return 0; + } + lock_rw_unlock(&conn->table->conid_lock); + conn->ssl = doq_ssl_server_setup((SSL_CTX*)conn->doq_socket->ctx, + conn); + if(!conn->ssl) { + 
log_err("doq_ssl_server_setup failed"); + return 0; + } + ngtcp2_conn_set_tls_native_handle(conn->conn, conn->ssl); + doq_conn_write_enable(conn); + return 1; +} + +struct doq_conid* +doq_conid_find(struct doq_table* table, const uint8_t* data, size_t datalen) +{ + struct rbnode_type* node; + struct doq_conid key; + key.node.key = &key; + key.cid = (void*)data; + key.cidlen = datalen; + node = rbtree_search(table->conid_tree, &key); + if(node) + return (struct doq_conid*)node->key; + return NULL; +} + +/** insert conid in the conid list */ +static void +doq_conid_list_insert(struct doq_conn* conn, struct doq_conid* conid) +{ + conid->prev = NULL; + conid->next = conn->conid_list; + if(conn->conid_list) + conn->conid_list->prev = conid; + conn->conid_list = conid; +} + +/** remove conid from the conid list */ +static void +doq_conid_list_remove(struct doq_conn* conn, struct doq_conid* conid) +{ + if(conid->prev) + conid->prev->next = conid->next; + else conn->conid_list = conid->next; + if(conid->next) + conid->next->prev = conid->prev; +} + +/** create a doq_conid */ +static struct doq_conid* +doq_conid_create(uint8_t* data, size_t datalen, struct doq_conn_key* key) +{ + struct doq_conid* conid; + conid = calloc(1, sizeof(*conid)); + if(!conid) + return NULL; + conid->cid = memdup(data, datalen); + if(!conid->cid) { + free(conid); + return NULL; + } + conid->cidlen = datalen; + conid->node.key = conid; + conid->key = *key; + conid->key.dcid = memdup(key->dcid, key->dcidlen); + if(!conid->key.dcid) { + free(conid->cid); + free(conid); + return NULL; + } + return conid; +} + +void +doq_conid_delete(struct doq_conid* conid) +{ + if(!conid) + return; + free(conid->key.dcid); + free(conid->cid); + free(conid); +} + +/** return true if the conid is for the conn. 
*/ +static int +conid_is_for_conn(struct doq_conn* conn, struct doq_conid* conid) +{ + if(conid->key.dcidlen == conn->key.dcidlen && + memcmp(conid->key.dcid, conn->key.dcid, conid->key.dcidlen)==0 + && conid->key.paddr.addrlen == conn->key.paddr.addrlen && + memcmp(&conid->key.paddr.addr, &conn->key.paddr.addr, + conid->key.paddr.addrlen) == 0 && + conid->key.paddr.localaddrlen == conn->key.paddr.localaddrlen && + memcmp(&conid->key.paddr.localaddr, &conn->key.paddr.localaddr, + conid->key.paddr.localaddrlen) == 0 && + conid->key.paddr.ifindex == conn->key.paddr.ifindex) + return 1; + return 0; +} + +int +doq_conn_associate_conid(struct doq_conn* conn, uint8_t* data, size_t datalen) +{ + struct doq_conid* conid; + conid = doq_conid_find(conn->table, data, datalen); + if(conid && !conid_is_for_conn(conn, conid)) { + verbose(VERB_ALGO, "doq connection id already exists for " + "another doq_conn. Ignoring second connection id."); + /* Already exists to another conn, ignore it. + * This works, in that the conid is listed in the doq_conn + * conid_list element, and removed from there. So our conid + * tree and list are fine, when created and removed. + * The tree now does not have the lookup element pointing + * to this connection. */ + return 1; + } + if(conid) + return 1; /* already inserted */ + conid = doq_conid_create(data, datalen, &conn->key); + if(!conid) + return 0; + doq_conid_list_insert(conn, conid); + (void)rbtree_insert(conn->table->conid_tree, &conid->node); + return 1; +} + +void +doq_conn_dissociate_conid(struct doq_conn* conn, const uint8_t* data, + size_t datalen) +{ + struct doq_conid* conid; + conid = doq_conid_find(conn->table, data, datalen); + if(conid && !conid_is_for_conn(conn, conid)) + return; + if(conid) { + (void)rbtree_delete(conn->table->conid_tree, + conid->node.key); + doq_conid_list_remove(conn, conid); + doq_conid_delete(conid); + } +} + +/** associate the scid array and also the dcid. 
+ * caller must hold the locks on conn and doq_table.conid_lock. */ +static int +doq_conn_setup_id_array_and_dcid(struct doq_conn* conn, + struct ngtcp2_cid* scids, size_t num_scid) +{ + size_t i; + for(i=0; i<num_scid; i++) { + if(!doq_conn_associate_conid(conn, scids[i].data, + scids[i].datalen)) + return 0; + } + if(!doq_conn_associate_conid(conn, conn->key.dcid, conn->key.dcidlen)) + return 0; + return 1; +} + +int +doq_conn_setup_conids(struct doq_conn* conn) +{ + size_t num_scid = +#ifndef HAVE_NGTCP2_CONN_GET_NUM_SCID + ngtcp2_conn_get_scid(conn->conn, NULL); +#else + ngtcp2_conn_get_num_scid(conn->conn); +#endif + if(num_scid <= 4) { + struct ngtcp2_cid ids[4]; + /* Usually there are not that many scids when just accepted, + * like only 2. */ + ngtcp2_conn_get_scid(conn->conn, ids); + return doq_conn_setup_id_array_and_dcid(conn, ids, num_scid); + } else { + struct ngtcp2_cid *scids = calloc(num_scid, + sizeof(struct ngtcp2_cid)); + if(!scids) + return 0; + ngtcp2_conn_get_scid(conn->conn, scids); + if(!doq_conn_setup_id_array_and_dcid(conn, scids, num_scid)) { + free(scids); + return 0; + } + free(scids); + } + return 1; +} + +void +doq_conn_clear_conids(struct doq_conn* conn) +{ + struct doq_conid* p, *next; + if(!conn) + return; + p = conn->conid_list; + while(p) { + next = p->next; + (void)rbtree_delete(conn->table->conid_tree, p->node.key); + doq_conid_delete(p); + p = next; + } + conn->conid_list = NULL; +} + +ngtcp2_tstamp doq_get_timestamp_nanosec(void) +{ +#ifdef CLOCK_REALTIME + struct timespec tp; + memset(&tp, 0, sizeof(tp)); + /* Get a nanosecond time, that can be compared with the event base. 
*/ + if(clock_gettime(CLOCK_REALTIME, &tp) == -1) { + log_err("clock_gettime failed: %s", strerror(errno)); + } + return ((uint64_t)tp.tv_sec)*((uint64_t)1000000000) + + ((uint64_t)tp.tv_nsec); +#else + struct timeval tv; + if(gettimeofday(&tv, NULL) < 0) { + log_err("gettimeofday failed: %s", strerror(errno)); + } + return ((uint64_t)tv.tv_sec)*((uint64_t)1000000000) + + ((uint64_t)tv.tv_usec)*((uint64_t)1000); +#endif /* CLOCK_REALTIME */ +} + +/** doq start the closing period for the connection. */ +static int +doq_conn_start_closing_period(struct comm_point* c, struct doq_conn* conn) +{ + struct ngtcp2_path_storage ps; + struct ngtcp2_pkt_info pi; + ngtcp2_ssize ret; + if(!conn) + return 1; + if( +#ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD + ngtcp2_conn_in_closing_period(conn->conn) +#else + ngtcp2_conn_is_in_closing_period(conn->conn) +#endif + ) + return 1; + if( +#ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD + ngtcp2_conn_in_draining_period(conn->conn) +#else + ngtcp2_conn_is_in_draining_period(conn->conn) +#endif + ) { + doq_conn_write_disable(conn); + return 1; + } + ngtcp2_path_storage_zero(&ps); + sldns_buffer_clear(c->doq_socket->pkt_buf); + /* the call to ngtcp2_conn_write_connection_close causes the + * conn to be closed. It is now in the closing period. */ + ret = ngtcp2_conn_write_connection_close(conn->conn, &ps.path, + &pi, sldns_buffer_begin(c->doq_socket->pkt_buf), + sldns_buffer_remaining(c->doq_socket->pkt_buf), +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + &conn->ccerr +#else + &conn->last_error +#endif + , doq_get_timestamp_nanosec()); + if(ret < 0) { + log_err("doq ngtcp2_conn_write_connection_close failed: %s", + ngtcp2_strerror(ret)); + return 0; + } + if(ret == 0) { + return 0; + } + sldns_buffer_set_position(c->doq_socket->pkt_buf, ret); + sldns_buffer_flip(c->doq_socket->pkt_buf); + + /* The close packet is allocated, because it may have to be repeated. + * When incoming packets have this connection dcid. 
*/ + conn->close_pkt = memdup(sldns_buffer_begin(c->doq_socket->pkt_buf), + sldns_buffer_limit(c->doq_socket->pkt_buf)); + if(!conn->close_pkt) { + log_err("doq: could not allocate close packet: out of memory"); + return 0; + } + conn->close_pkt_len = sldns_buffer_limit(c->doq_socket->pkt_buf); + conn->close_ecn = pi.ecn; + return 1; +} + +/** doq send the close packet for the connection, perhaps again. */ +int +doq_conn_send_close(struct comm_point* c, struct doq_conn* conn) +{ + if(!conn) + return 0; + if(!conn->close_pkt) + return 0; + if(conn->close_pkt_len > sldns_buffer_capacity(c->doq_socket->pkt_buf)) + return 0; + sldns_buffer_clear(c->doq_socket->pkt_buf); + sldns_buffer_write(c->doq_socket->pkt_buf, conn->close_pkt, conn->close_pkt_len); + sldns_buffer_flip(c->doq_socket->pkt_buf); + verbose(VERB_ALGO, "doq send connection close"); + doq_send_pkt(c, &conn->key.paddr, conn->close_ecn); + doq_conn_write_disable(conn); + return 1; +} + +/** doq close the connection on error. If it returns a failure, it + * does not wait to send a close, and the connection can be dropped. 
*/ +static int +doq_conn_close_error(struct comm_point* c, struct doq_conn* conn) +{ +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + if(conn->ccerr.type == NGTCP2_CCERR_TYPE_IDLE_CLOSE) + return 0; +#else + if(conn->last_error.type == + NGTCP2_CONNECTION_CLOSE_ERROR_CODE_TYPE_TRANSPORT_IDLE_CLOSE) + return 0; +#endif + if(!doq_conn_start_closing_period(c, conn)) + return 0; + if( +#ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD + ngtcp2_conn_in_draining_period(conn->conn) +#else + ngtcp2_conn_is_in_draining_period(conn->conn) +#endif + ) { + doq_conn_write_disable(conn); + return 1; + } + doq_conn_write_enable(conn); + if(!doq_conn_send_close(c, conn)) + return 0; + return 1; +} + +int +doq_conn_recv(struct comm_point* c, struct doq_pkt_addr* paddr, + struct doq_conn* conn, struct ngtcp2_pkt_info* pi, int* err_retry, + int* err_drop) +{ + int ret; + ngtcp2_tstamp ts; + struct ngtcp2_path path; + memset(&path, 0, sizeof(path)); + path.remote.addr = (struct sockaddr*)&paddr->addr; + path.remote.addrlen = paddr->addrlen; + path.local.addr = (struct sockaddr*)&paddr->localaddr; + path.local.addrlen = paddr->localaddrlen; + ts = doq_get_timestamp_nanosec(); + + ret = ngtcp2_conn_read_pkt(conn->conn, &path, pi, + sldns_buffer_begin(c->doq_socket->pkt_buf), + sldns_buffer_limit(c->doq_socket->pkt_buf), ts); + if(ret != 0) { + if(err_retry) + *err_retry = 0; + if(err_drop) + *err_drop = 0; + if(ret == NGTCP2_ERR_DRAINING) { + verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s", + ngtcp2_strerror(ret)); + doq_conn_write_disable(conn); + return 0; + } else if(ret == NGTCP2_ERR_DROP_CONN) { + verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s", + ngtcp2_strerror(ret)); + if(err_drop) + *err_drop = 1; + return 0; + } else if(ret == NGTCP2_ERR_RETRY) { + verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s", + ngtcp2_strerror(ret)); + if(err_retry) + *err_retry = 1; + if(err_drop) + *err_drop = 1; + return 0; + } else if(ret == NGTCP2_ERR_CRYPTO) { + if( +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + 
!conn->ccerr.error_code +#else + !conn->last_error.error_code +#endif + ) { + /* in picotls the tls alert may need to be + * copied, but this is with openssl. And there + * is conn->tls_alert. */ +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + ngtcp2_ccerr_set_tls_alert(&conn->ccerr, + conn->tls_alert, NULL, 0); +#else + ngtcp2_connection_close_error_set_transport_error_tls_alert( + &conn->last_error, conn->tls_alert, + NULL, 0); +#endif + } + } else { + if( +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + !conn->ccerr.error_code +#else + !conn->last_error.error_code +#endif + ) { +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + ngtcp2_ccerr_set_liberr(&conn->ccerr, ret, + NULL, 0); +#else + ngtcp2_connection_close_error_set_transport_error_liberr( + &conn->last_error, ret, NULL, 0); +#endif + } + } + log_err("ngtcp2_conn_read_pkt failed: %s", + ngtcp2_strerror(ret)); + if(!doq_conn_close_error(c, conn)) { + if(err_drop) + *err_drop = 1; + } + return 0; + } + doq_conn_write_enable(conn); + return 1; +} + +/** doq stream write is done */ +static void +doq_stream_write_is_done(struct doq_conn* conn, struct doq_stream* stream) +{ + /* Cannot deallocate, the buffer may be needed for resends. 
*/ + doq_stream_off_write_list(conn, stream); +} + +int +doq_conn_write_streams(struct comm_point* c, struct doq_conn* conn, + int* err_drop) +{ + struct doq_stream* stream = conn->stream_write_first; + ngtcp2_path_storage ps; + ngtcp2_tstamp ts = doq_get_timestamp_nanosec(); + size_t num_packets = 0, max_packets = 65535; + ngtcp2_path_storage_zero(&ps); + + for(;;) { + int64_t stream_id; + uint32_t flags = 0; + ngtcp2_pkt_info pi; + ngtcp2_vec datav[2]; + size_t datav_count = 0; + ngtcp2_ssize ret, ndatalen = 0; + int fin; + + if(stream) { + /* data to send */ + verbose(VERB_ALGO, "doq: doq_conn write stream %d", + (int)stream->stream_id); + stream_id = stream->stream_id; + fin = 1; + if(stream->nwrite < 2) { + datav[0].base = ((uint8_t*)&stream-> + outlen_wire) + stream->nwrite; + datav[0].len = 2 - stream->nwrite; + datav[1].base = stream->out; + datav[1].len = stream->outlen; + datav_count = 2; + } else { + datav[0].base = stream->out + + (stream->nwrite-2); + datav[0].len = stream->outlen - + (stream->nwrite-2); + datav_count = 1; + } + } else { + /* no data to send */ + verbose(VERB_ALGO, "doq: doq_conn write stream -1"); + stream_id = -1; + fin = 0; + datav[0].base = NULL; + datav[0].len = 0; + datav_count = 1; + } + + /* if more streams, set it to write more */ + if(stream && stream->write_next) + flags |= NGTCP2_WRITE_STREAM_FLAG_MORE; + if(fin) + flags |= NGTCP2_WRITE_STREAM_FLAG_FIN; + + sldns_buffer_clear(c->doq_socket->pkt_buf); + ret = ngtcp2_conn_writev_stream(conn->conn, &ps.path, &pi, + sldns_buffer_begin(c->doq_socket->pkt_buf), + sldns_buffer_remaining(c->doq_socket->pkt_buf), + &ndatalen, flags, stream_id, datav, datav_count, ts); + if(ret < 0) { + if(ret == NGTCP2_ERR_WRITE_MORE) { + verbose(VERB_ALGO, "doq: write more, ndatalen %d", (int)ndatalen); + if(stream) { + if(ndatalen >= 0) + stream->nwrite += ndatalen; + if(stream->nwrite >= stream->outlen+2) + doq_stream_write_is_done( + conn, stream); + stream = stream->write_next; + } + continue; 
+ } else if(ret == NGTCP2_ERR_STREAM_DATA_BLOCKED) { + verbose(VERB_ALGO, "doq: ngtcp2_conn_writev_stream returned NGTCP2_ERR_STREAM_DATA_BLOCKED"); +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + ngtcp2_ccerr_set_application_error( + &conn->ccerr, -1, NULL, 0); +#else + ngtcp2_connection_close_error_set_application_error(&conn->last_error, -1, NULL, 0); +#endif + if(err_drop) + *err_drop = 0; + if(!doq_conn_close_error(c, conn)) { + if(err_drop) + *err_drop = 1; + } + return 0; + } else if(ret == NGTCP2_ERR_STREAM_SHUT_WR) { + verbose(VERB_ALGO, "doq: ngtcp2_conn_writev_stream returned NGTCP2_ERR_STREAM_SHUT_WR"); +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + ngtcp2_ccerr_set_application_error( + &conn->ccerr, -1, NULL, 0); +#else + ngtcp2_connection_close_error_set_application_error(&conn->last_error, -1, NULL, 0); +#endif + if(err_drop) + *err_drop = 0; + if(!doq_conn_close_error(c, conn)) { + if(err_drop) + *err_drop = 1; + } + return 0; + } + + log_err("doq: ngtcp2_conn_writev_stream failed: %s", + ngtcp2_strerror(ret)); +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + ngtcp2_ccerr_set_liberr(&conn->ccerr, ret, NULL, 0); +#else + ngtcp2_connection_close_error_set_transport_error_liberr( + &conn->last_error, ret, NULL, 0); +#endif + if(err_drop) + *err_drop = 0; + if(!doq_conn_close_error(c, conn)) { + if(err_drop) + *err_drop = 1; + } + return 0; + } + verbose(VERB_ALGO, "doq: writev_stream pkt size %d ndatawritten %d", + (int)ret, (int)ndatalen); + + if(ndatalen >= 0 && stream) { + stream->nwrite += ndatalen; + if(stream->nwrite >= stream->outlen+2) + doq_stream_write_is_done(conn, stream); + } + if(ret == 0) { + /* congestion limited */ + doq_conn_write_disable(conn); + ngtcp2_conn_update_pkt_tx_time(conn->conn, ts); + return 1; + } + sldns_buffer_set_position(c->doq_socket->pkt_buf, ret); + sldns_buffer_flip(c->doq_socket->pkt_buf); + doq_send_pkt(c, &conn->key.paddr, pi.ecn); + + if(c->doq_socket->have_blocked_pkt) + break; + if(++num_packets == max_packets) + break; + if(stream) + stream = 
stream->write_next; + } + ngtcp2_conn_update_pkt_tx_time(conn->conn, ts); + return 1; +} + +void +doq_conn_write_enable(struct doq_conn* conn) +{ + conn->write_interest = 1; +} + +void +doq_conn_write_disable(struct doq_conn* conn) +{ + conn->write_interest = 0; +} + +/** doq append the connection to the write list */ +static void +doq_conn_write_list_append(struct doq_table* table, struct doq_conn* conn) +{ + if(conn->on_write_list) + return; + conn->write_prev = table->write_list_last; + if(table->write_list_last) + table->write_list_last->write_next = conn; + else table->write_list_first = conn; + conn->write_next = NULL; + table->write_list_last = conn; + conn->on_write_list = 1; +} + +void +doq_conn_write_list_remove(struct doq_table* table, struct doq_conn* conn) +{ + if(!conn->on_write_list) + return; + if(conn->write_next) + conn->write_next->write_prev = conn->write_prev; + else table->write_list_last = conn->write_prev; + if(conn->write_prev) + conn->write_prev->write_next = conn->write_next; + else table->write_list_first = conn->write_next; + conn->write_prev = NULL; + conn->write_next = NULL; + conn->on_write_list = 0; +} + +void +doq_conn_set_write_list(struct doq_table* table, struct doq_conn* conn) +{ + if(conn->write_interest && conn->on_write_list) + return; + if(!conn->write_interest && !conn->on_write_list) + return; + if(conn->write_interest) + doq_conn_write_list_append(table, conn); + else doq_conn_write_list_remove(table, conn); +} + +struct doq_conn* +doq_table_pop_first(struct doq_table* table) +{ + struct doq_conn* conn = table->write_list_first; + if(!conn) + return NULL; + lock_basic_lock(&conn->lock); + table->write_list_first = conn->write_next; + if(conn->write_next) + conn->write_next->write_prev = NULL; + else table->write_list_last = NULL; + conn->write_next = NULL; + conn->write_prev = NULL; + conn->on_write_list = 0; + return conn; +} + +int +doq_conn_check_timer(struct doq_conn* conn, struct timeval* tv) +{ + ngtcp2_tstamp 
expiry = ngtcp2_conn_get_expiry(conn->conn); + ngtcp2_tstamp now = doq_get_timestamp_nanosec(); + ngtcp2_tstamp t; + + if(expiry <= now) { + /* The timer has already expired, add with zero timeout. + * This should call the callback straight away. Calling it + * from the event callbacks is cleaner than calling it here, + * because then it is always called with the same locks and + * so on. This routine only has the conn.lock. */ + t = now; + } else { + t = expiry; + } + + /* convert to timeval */ + memset(tv, 0, sizeof(*tv)); + tv->tv_sec = t / NGTCP2_SECONDS; + tv->tv_usec = (t / NGTCP2_MICROSECONDS)%1000000; + + /* If we already have a timer, is it the right value? */ + if(conn->timer.timer_in_tree || conn->timer.timer_in_list) { + if(conn->timer.time.tv_sec == tv->tv_sec && + conn->timer.time.tv_usec == tv->tv_usec) + return 0; + } + return 1; +} + +/* doq print connection log */ +static void +doq_conn_log_line(struct doq_conn* conn, char* s) +{ + char remotestr[256], localstr[256]; + addr_to_str((void*)&conn->key.paddr.addr, conn->key.paddr.addrlen, + remotestr, sizeof(remotestr)); + addr_to_str((void*)&conn->key.paddr.localaddr, + conn->key.paddr.localaddrlen, localstr, sizeof(localstr)); + log_info("doq conn %s %s %s", remotestr, localstr, s); +} + +int +doq_conn_handle_timeout(struct doq_conn* conn) +{ + ngtcp2_tstamp now = doq_get_timestamp_nanosec(); + int rv; + + if(verbosity >= VERB_ALGO) + doq_conn_log_line(conn, "timeout"); + + rv = ngtcp2_conn_handle_expiry(conn->conn, now); + if(rv != 0) { + verbose(VERB_ALGO, "ngtcp2_conn_handle_expiry failed: %s", + ngtcp2_strerror(rv)); +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + ngtcp2_ccerr_set_liberr(&conn->ccerr, rv, NULL, 0); +#else + ngtcp2_connection_close_error_set_transport_error_liberr( + &conn->last_error, rv, NULL, 0); +#endif + if(!doq_conn_close_error(conn->doq_socket->cp, conn)) { + /* failed, return for deletion */ + return 0; + } + return 1; + } + doq_conn_write_enable(conn); + 
if(!doq_conn_write_streams(conn->doq_socket->cp, conn, NULL)) { + /* failed, return for deletion. */ + return 0; + } + return 1; +} + +void +doq_table_quic_size_add(struct doq_table* table, size_t add) +{ + lock_basic_lock(&table->size_lock); + table->current_size += add; + lock_basic_unlock(&table->size_lock); +} + +void +doq_table_quic_size_subtract(struct doq_table* table, size_t subtract) +{ + lock_basic_lock(&table->size_lock); + if(table->current_size < subtract) + table->current_size = 0; + else table->current_size -= subtract; + lock_basic_unlock(&table->size_lock); +} + +int +doq_table_quic_size_available(struct doq_table* table, + struct config_file* cfg, size_t mem) +{ + size_t cur; + lock_basic_lock(&table->size_lock); + cur = table->current_size; + lock_basic_unlock(&table->size_lock); + + if(cur + mem > cfg->quic_size) + return 0; + return 1; +} + +size_t doq_table_quic_size_get(struct doq_table* table) +{ + size_t sz; + if(!table) + return 0; + lock_basic_lock(&table->size_lock); + sz = table->current_size; + lock_basic_unlock(&table->size_lock); + return sz; +} +#endif /* HAVE_NGTCP2 */ diff --git a/services/listen_dnsport.h b/services/listen_dnsport.h index 84ac4b068b1b..c29f4d72b0a2 100644 --- a/services/listen_dnsport.h +++ b/services/listen_dnsport.h @@ -43,10 +43,16 @@ #define LISTEN_DNSPORT_H #include "util/netevent.h" +#include "util/rbtree.h" +#include "util/locks.h" #include "daemon/acl_list.h" #ifdef HAVE_NGHTTP2_NGHTTP2_H #include <nghttp2/nghttp2.h> #endif +#ifdef HAVE_NGTCP2 +#include <ngtcp2/ngtcp2.h> +#include <ngtcp2/ngtcp2_crypto.h> +#endif struct listen_list; struct config_file; struct addrinfo; @@ -100,7 +106,9 @@ enum listen_type { /** udp ipv6 (v4mapped) for use with ancillary data + dnscrypt*/ listen_type_udpancil_dnscrypt, /** HTTP(2) over TLS over TCP */ - listen_type_http + listen_type_http, + /** DNS over QUIC */ + listen_type_doq }; /* @@ -188,6 +196,11 @@ int resolve_interface_names(char** ifs, int num_ifs, * @param 
tcp_conn_limit: TCP connection limit info. * @param sslctx: nonNULL if ssl context. * @param dtenv: nonNULL if dnstap enabled. + * @param doq_table: the doq connection table, with shared information. + * @param rnd: random state. + * @param ssl_service_key: the SSL service key file. + * @param ssl_service_pem: the SSL service pem file. + * @param cfg: config file struct. * @param cb: callback function when a request arrives. It is passed * the packet and user argument. Return true to send a reply. * @param cb_arg: user data argument for callback function. @@ -198,8 +211,10 @@ listen_create(struct comm_base* base, struct listen_port* ports, size_t bufsize, int tcp_accept_count, int tcp_idle_timeout, int harden_large_queries, uint32_t http_max_streams, char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit, - void* sslctx, struct dt_env* dtenv, comm_point_callback_type* cb, - void *cb_arg); + void* sslctx, struct dt_env* dtenv, struct doq_table* doq_table, + struct ub_randstate* rnd, const char* ssl_service_key, + const char* ssl_service_pem, struct config_file* cfg, + comm_point_callback_type* cb, void *cb_arg); /** * delete the listening structure @@ -278,11 +293,12 @@ int create_udp_sock(int family, int socktype, struct sockaddr* addr, * @param freebind: set IP_FREEBIND socket option. * @param use_systemd: if true, fetch sockets from systemd. * @param dscp: DSCP to use. + * @param additional: additional log information for the socket type. * @return: the socket. -1 on error. 
*/ int create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto, int* reuseport, int transparent, int mss, int nodelay, int freebind, - int use_systemd, int dscp); + int use_systemd, int dscp, const char* additional); /** * Create and bind local listening socket @@ -452,6 +468,377 @@ int http2_submit_dns_response(struct http2_session* h2_session); int http2_submit_dns_response(void* v); #endif /* HAVE_NGHTTP2 */ +#ifdef HAVE_NGTCP2 +struct doq_conid; +struct doq_server_socket; + +/** + * DoQ shared connection table. This is the connections for the host. + * And some config parameter values for connections. The host has to + * respond on that ip,port for those connections, so they are shared + * between threads. + */ +struct doq_table { + /** the lock on the tree and config elements. insert and deletion, + * also lookup in the tree needs to hold the lock. */ + lock_rw_type lock; + /** rbtree of doq_conn, the connections to different destination + * addresses, and can be found by dcid. */ + struct rbtree_type* conn_tree; + /** lock for the conid tree, needed for the conid tree and also + * the conid elements */ + lock_rw_type conid_lock; + /** rbtree of doq_conid, connections can be found by their + * connection ids. Lookup by connection id, finds doq_conn. */ + struct rbtree_type* conid_tree; + /** the server scid length */ + int sv_scidlen; + /** the static secret for the server */ + uint8_t* static_secret; + /** length of the static secret */ + size_t static_secret_len; + /** the idle timeout in nanoseconds */ + uint64_t idle_timeout; + /** the list of write interested connections, hold the doq_table.lock + * to change them */ + struct doq_conn* write_list_first, *write_list_last; + /** rbtree of doq_timer. */ + struct rbtree_type* timer_tree; + /** lock on the current_size counter. */ + lock_basic_type size_lock; + /** current use, in bytes, of QUIC buffers. + * The doq_conn ngtcp2_conn structure, SSL structure and conid structs + * are not counted. 
*/ + size_t current_size; +}; + +/** create doq table */ +struct doq_table* doq_table_create(struct config_file* cfg, + struct ub_randstate* rnd); + +/** delete doq table */ +void doq_table_delete(struct doq_table* table); + +/** + * Timer information for doq timer. + */ +struct doq_timer { + /** The rbnode in the tree sorted by timeout value. Key this struct. */ + struct rbnode_type node; + /** The timeout value. Absolute time value. */ + struct timeval time; + /** If the timer is in the time tree, with the node. */ + int timer_in_tree; + /** If there are more timers with the exact same timeout value, + * they form a set of timers. The rbnode timer has a link to the list + * with the other timers in the set. The rbnode timer is not a + * member of the list with the other timers. The other timers are not + * linked into the tree. */ + struct doq_timer* setlist_first, *setlist_last; + /** If the timer is on the setlist. */ + int timer_in_list; + /** If in the setlist, the next and prev element. */ + struct doq_timer* setlist_next, *setlist_prev; + /** The connection that is timeouted. */ + struct doq_conn* conn; + /** The worker that is waiting for the timeout event. + * Set for the rbnode tree linked element. If a worker is waiting + * for the event. If NULL, no worker is waiting for this timeout. */ + struct doq_server_socket* worker_doq_socket; +}; + +/** + * Key information that makes a doq_conn node in the tree lookup. + */ +struct doq_conn_key { + /** the remote endpoint and local endpoint and ifindex */ + struct doq_pkt_addr paddr; + /** the doq connection dcid */ + uint8_t* dcid; + /** length of dcid */ + size_t dcidlen; +}; + +/** + * DoQ connection, for DNS over QUIC. One connection to a remote endpoint + * with a number of streams in it. Every stream is like a tcp stream with + * a uint16_t length, query read, and a uint16_t length and answer written. 
+ */ +struct doq_conn { + /** rbtree node, key is addresses and dcid */ + struct rbnode_type node; + /** lock on the connection */ + lock_basic_type lock; + /** the key information, with dcid and address endpoint */ + struct doq_conn_key key; + /** the doq server socket for inside callbacks */ + struct doq_server_socket* doq_socket; + /** the doq table this connection is part of */ + struct doq_table* table; + /** if the connection is about to be deleted. */ + uint8_t is_deleted; + /** the version, the client chosen version of QUIC */ + uint32_t version; + /** the ngtcp2 connection, a server connection */ + struct ngtcp2_conn* conn; + /** the connection ids that are associated with this doq_conn. + * There can be a number, that can change. They are linked here, + * so that upon removal, the list of actually associated conid + * elements can be removed as well. */ + struct doq_conid* conid_list; + /** the ngtcp2 last error for the connection */ +#ifdef HAVE_NGTCP2_CCERR_DEFAULT + struct ngtcp2_ccerr ccerr; +#else + struct ngtcp2_connection_close_error last_error; +#endif + /** the recent tls alert error code */ + uint8_t tls_alert; + /** the ssl context, SSL* */ + void* ssl; +#ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT + /** the connection reference for ngtcp2_conn and userdata in ssl */ + struct ngtcp2_crypto_conn_ref conn_ref; +#endif + /** closure packet, if any */ + uint8_t* close_pkt; + /** length of closure packet. */ + size_t close_pkt_len; + /** closure ecn */ + uint32_t close_ecn; + /** the streams for this connection, of type doq_stream */ + struct rbtree_type stream_tree; + /** the streams that want write, they have something to write. + * The list is ordered, the last have to wait for the first to + * get their data written. */ + struct doq_stream* stream_write_first, *stream_write_last; + /** the conn has write interest if true, no write interest if false. 
*/ + uint8_t write_interest; + /** if the conn is on the connection write list */ + uint8_t on_write_list; + /** the connection write list prev and next, if on the write list */ + struct doq_conn* write_prev, *write_next; + /** The timer for the connection. If unused, it is not in the tree + * and not in the list. It is alloced here, so that it is prealloced. + * It has to be set after every read and write on the connection, so + * this improves performance, but also the allocation does not fail. */ + struct doq_timer timer; +}; + +/** + * Connection ID and the doq_conn that is that connection. A connection + * has an original dcid, and then more connection ids associated. + */ +struct doq_conid { + /** rbtree node, key is the connection id. */ + struct rbnode_type node; + /** the next and prev in the list of conids for the doq_conn */ + struct doq_conid* next, *prev; + /** key to the doq_conn that is the connection */ + struct doq_conn_key key; + /** the connection id, byte string */ + uint8_t* cid; + /** the length of cid */ + size_t cidlen; +}; + +/** + * DoQ stream, for DNS over QUIC. + */ +struct doq_stream { + /** the rbtree node for the stream, key is the stream_id */ + rbnode_type node; + /** the stream id */ + int64_t stream_id; + /** if the stream is closed */ + uint8_t is_closed; + /** if the query is complete */ + uint8_t is_query_complete; + /** the number of bytes read on the stream, up to querylen+2. */ + size_t nread; + /** the length of the input query bytes */ + size_t inlen; + /** the input bytes */ + uint8_t* in; + /** does the stream have an answer to send */ + uint8_t is_answer_available; + /** the answer bytes sent, up to outlen+2. 
*/ + size_t nwrite; + /** the length of the output answer bytes */ + size_t outlen; + /** the output length in network wireformat */ + uint16_t outlen_wire; + /** the output packet bytes */ + uint8_t* out; + /** if the stream is on the write list */ + uint8_t on_write_list; + /** the prev and next on the write list, if on the list */ + struct doq_stream* write_prev, *write_next; +}; + +/** doq application error code that is sent when a stream is closed */ +#define DOQ_APP_ERROR_CODE 1 + +/** + * Create the doq connection. + * @param c: the comm point for the listening doq socket. + * @param paddr: with remote and local address and ifindex for the + * connection destination. This is where packets are sent. + * @param dcid: the dcid, Destination Connection ID. + * @param dcidlen: length of dcid. + * @param version: client chosen version. + * @return new doq connection or NULL on allocation failure. + */ +struct doq_conn* doq_conn_create(struct comm_point* c, + struct doq_pkt_addr* paddr, const uint8_t* dcid, size_t dcidlen, + uint32_t version); + +/** + * Delete the doq connection structure. + * @param conn: to delete. + * @param table: with memory size. + */ +void doq_conn_delete(struct doq_conn* conn, struct doq_table* table); + +/** compare function of doq_conn */ +int doq_conn_cmp(const void* key1, const void* key2); + +/** compare function of doq_conid */ +int doq_conid_cmp(const void* key1, const void* key2); + +/** compare function of doq_timer */ +int doq_timer_cmp(const void* key1, const void* key2); + +/** compare function of doq_stream */ +int doq_stream_cmp(const void* key1, const void* key2); + +/** setup the doq_socket server tls context */ +int doq_socket_setup_ctx(struct doq_server_socket* doq_socket); + +/** setup the doq connection callbacks, and settings. 
*/ +int doq_conn_setup(struct doq_conn* conn, uint8_t* scid, size_t scidlen, + uint8_t* ocid, size_t ocidlen, const uint8_t* token, size_t tokenlen); + +/** fill a buffer with random data */ +void doq_fill_rand(struct ub_randstate* rnd, uint8_t* buf, size_t len); + +/** delete a doq_conid */ +void doq_conid_delete(struct doq_conid* conid); + +/** add a connection id to the doq_conn. + * caller must hold doq_table.conid_lock. */ +int doq_conn_associate_conid(struct doq_conn* conn, uint8_t* data, + size_t datalen); + +/** remove a connection id from the doq_conn. + * caller must hold doq_table.conid_lock. */ +void doq_conn_dissociate_conid(struct doq_conn* conn, const uint8_t* data, + size_t datalen); + +/** initial setup to link current connection ids to the doq_conn */ +int doq_conn_setup_conids(struct doq_conn* conn); + +/** remove the connection ids from the doq_conn. + * caller must hold doq_table.conid_lock. */ +void doq_conn_clear_conids(struct doq_conn* conn); + +/** find a conid in the doq_conn connection. + * caller must hold table.conid_lock. */ +struct doq_conid* doq_conid_find(struct doq_table* doq_table, + const uint8_t* data, size_t datalen); + +/** receive a packet for a connection */ +int doq_conn_recv(struct comm_point* c, struct doq_pkt_addr* paddr, + struct doq_conn* conn, struct ngtcp2_pkt_info* pi, int* err_retry, + int* err_drop); + +/** send packets for a connection */ +int doq_conn_write_streams(struct comm_point* c, struct doq_conn* conn, + int* err_drop); + +/** send the close packet for the connection, perhaps again. */ +int doq_conn_send_close(struct comm_point* c, struct doq_conn* conn); + +/** delete doq stream */ +void doq_stream_delete(struct doq_stream* stream); + +/** doq read a connection key from repinfo. It is not malloced, but points + * into the repinfo for the dcid. 
*/ +void doq_conn_key_from_repinfo(struct doq_conn_key* key, + struct comm_reply* repinfo); + +/** doq find a stream in the connection */ +struct doq_stream* doq_stream_find(struct doq_conn* conn, int64_t stream_id); + +/** doq shutdown the stream. */ +int doq_stream_close(struct doq_conn* conn, struct doq_stream* stream, + int send_shutdown); + +/** send reply for a connection */ +int doq_stream_send_reply(struct doq_conn* conn, struct doq_stream* stream, + struct sldns_buffer* buf); + +/** the connection has write interest, wants to write packets */ +void doq_conn_write_enable(struct doq_conn* conn); + +/** the connection has no write interest, does not want to write packets */ +void doq_conn_write_disable(struct doq_conn* conn); + +/** set the connection on or off the write list, depending on write interest */ +void doq_conn_set_write_list(struct doq_table* table, struct doq_conn* conn); + +/** doq remove the connection from the write list */ +void doq_conn_write_list_remove(struct doq_table* table, + struct doq_conn* conn); + +/** doq get the first conn from the write list, if any, popped from list. + * Locks the conn that is returned. */ +struct doq_conn* doq_table_pop_first(struct doq_table* table); + +/** + * doq check if the timer for the conn needs to be changed. + * @param conn: connection, caller must hold lock on it. + * @param tv: time value, absolute time, returned. + * @return true if timer needs to be set to tv, false if no change is needed + * to the timer. The timer is already set to the right time in that case. + */ +int doq_conn_check_timer(struct doq_conn* conn, struct timeval* tv); + +/** doq remove timer from tree */ +void doq_timer_tree_remove(struct doq_table* table, struct doq_timer* timer); + +/** doq remove timer from list */ +void doq_timer_list_remove(struct doq_table* table, struct doq_timer* timer); + +/** doq unset the timer if it was set. 
*/ +void doq_timer_unset(struct doq_table* table, struct doq_timer* timer); + +/** doq set the timer and add it. */ +void doq_timer_set(struct doq_table* table, struct doq_timer* timer, + struct doq_server_socket* worker_doq_socket, struct timeval* tv); + +/** doq find a timeout in the timer tree */ +struct doq_timer* doq_timer_find_time(struct doq_table* table, + struct timeval* tv); + +/** doq handle timeout for a connection. Pass conn locked. Returns false for + * deletion. */ +int doq_conn_handle_timeout(struct doq_conn* conn); + +/** doq add size to the current quic buffer counter */ +void doq_table_quic_size_add(struct doq_table* table, size_t add); + +/** doq subtract size from the current quic buffer counter */ +void doq_table_quic_size_subtract(struct doq_table* table, size_t subtract); + +/** doq check if mem is available for quic. */ +int doq_table_quic_size_available(struct doq_table* table, + struct config_file* cfg, size_t mem); + +/** doq get the quic size value */ +size_t doq_table_quic_size_get(struct doq_table* table); +#endif /* HAVE_NGTCP2 */ + char* set_ip_dscp(int socket, int addrfamily, int ds); /** for debug and profiling purposes only @@ -459,4 +846,14 @@ char* set_ip_dscp(int socket, int addrfamily, int ds); */ void verbose_print_unbound_socket(struct unbound_socket* ub_sock); +/** event callback for testcode/doqclient */ +void doq_client_event_cb(int fd, short event, void* arg); + +/** timer event callback for testcode/doqclient */ +void doq_client_timer_cb(int fd, short event, void* arg); + +#ifdef HAVE_NGTCP2 +/** get a timestamp in nanoseconds */ +ngtcp2_tstamp doq_get_timestamp_nanosec(void); +#endif #endif /* LISTEN_DNSPORT_H */ diff --git a/services/mesh.c b/services/mesh.c index 522118844b44..d512ab3d32d4 100644 --- a/services/mesh.c +++ b/services/mesh.c @@ -311,7 +311,7 @@ int mesh_make_new_space(struct mesh_area* mesh, sldns_buffer* qbuf) struct dns_msg* mesh_serve_expired_lookup(struct module_qstate* qstate, - struct 
query_info* lookup_qinfo) + struct query_info* lookup_qinfo, int* is_expired) { hashvalue_type h; struct lruhash_entry* e; @@ -321,6 +321,7 @@ mesh_serve_expired_lookup(struct module_qstate* qstate, time_t timenow = *qstate->env->now; int must_validate = (!(qstate->query_flags&BIT_CD) || qstate->env->cfg->ignore_cd) && qstate->env->need_to_validate; + *is_expired = 0; /* Lookup cache */ h = query_info_hash(lookup_qinfo, qstate->query_flags); e = slabhash_lookup(qstate->env->msg_cache, h, lookup_qinfo, 0); @@ -328,6 +329,7 @@ mesh_serve_expired_lookup(struct module_qstate* qstate, key = (struct msgreply_entry*)e->key; data = (struct reply_info*)e->data; + if(data->ttl < timenow) *is_expired = 1; msg = tomsg(qstate->env, &key->key, data, qstate->region, timenow, qstate->env->cfg->serve_expired, qstate->env->scratch); if(!msg) @@ -2176,6 +2178,7 @@ mesh_serve_expired_callback(void* arg) int must_validate = (!(qstate->query_flags&BIT_CD) || qstate->env->cfg->ignore_cd) && qstate->env->need_to_validate; int i = 0; + int is_expired; if(!qstate->serve_expired_data) return; verbose(VERB_ALGO, "Serve expired: Trying to reply with expired data"); comm_timer_delete(qstate->serve_expired_data->timer); @@ -2193,7 +2196,7 @@ mesh_serve_expired_callback(void* arg) fptr_ok(fptr_whitelist_serve_expired_lookup( qstate->serve_expired_data->get_cached_answer)); msg = (*qstate->serve_expired_data->get_cached_answer)(qstate, - lookup_qinfo); + lookup_qinfo, &is_expired); if(!msg) return; /* Reset these in case we pass a second time from here. */ @@ -2285,8 +2288,10 @@ mesh_serve_expired_callback(void* arg) /* Add EDE Stale Answer (RCF8914). 
Ignore global ede as this is * warning instead of an error */ - if (r->edns.edns_present && qstate->env->cfg->ede_serve_expired && - qstate->env->cfg->ede) { + if(r->edns.edns_present && + qstate->env->cfg->ede_serve_expired && + qstate->env->cfg->ede && + is_expired) { edns_opt_list_append_ede(&r->edns.opt_list_out, mstate->s.region, LDNS_EDE_STALE_ANSWER, NULL); } diff --git a/services/mesh.h b/services/mesh.h index 5bd53e065e8f..26ececbe6210 100644 --- a/services/mesh.h +++ b/services/mesh.h @@ -673,11 +673,12 @@ void mesh_serve_expired_callback(void* arg); * the same behavior as when replying from cache. * @param qstate: the module qstate. * @param lookup_qinfo: the query info to look for in the cache. + * @param is_expired: set if the cached answer is expired. * @return dns_msg if a cached answer was found, otherwise NULL. */ struct dns_msg* mesh_serve_expired_lookup(struct module_qstate* qstate, - struct query_info* lookup_qinfo); + struct query_info* lookup_qinfo, int* is_expired); /** * See if the mesh has space for more queries. 
You can allocate queries diff --git a/services/modstack.c b/services/modstack.c index 6c8af0505b69..fa68cc71d2ff 100644 --- a/services/modstack.c +++ b/services/modstack.c @@ -265,7 +265,7 @@ modstack_call_init(struct module_stack* stack, const char* module_conf, int i, changed = 0; env->need_to_validate = 0; /* set by module init below */ for(i=0; i<stack->num; i++) { - while(*module_conf && isspace(*module_conf)) + while(*module_conf && isspace((unsigned char)*module_conf)) module_conf++; if(strncmp(stack->mod[i]->name, module_conf, strlen(stack->mod[i]->name))) { diff --git a/services/rpz.c b/services/rpz.c index d8999a8a55eb..3b92ee53837e 100644 --- a/services/rpz.c +++ b/services/rpz.c @@ -1969,6 +1969,7 @@ rpz_synthesize_nodata(struct rpz* ATTR_UNUSED(r), struct module_qstate* ms, 0, /* ttl */ 0, /* prettl */ 0, /* expttl */ + 0, /* norecttl */ 0, /* an */ 0, /* ns */ 0, /* ar */ @@ -1999,6 +2000,7 @@ rpz_synthesize_nxdomain(struct rpz* r, struct module_qstate* ms, 0, /* ttl */ 0, /* prettl */ 0, /* expttl */ + 0, /* norecttl */ 0, /* an */ 0, /* ns */ 0, /* ar */ @@ -2031,6 +2033,7 @@ rpz_synthesize_localdata_from_rrset(struct rpz* ATTR_UNUSED(r), struct module_qs 0, /* ttl */ 0, /* prettl */ 0, /* expttl */ + 0, /* norecttl */ 1, /* an */ 0, /* ns */ 0, /* ar */ @@ -2176,6 +2179,7 @@ rpz_synthesize_cname_override_msg(struct rpz* r, struct module_qstate* ms, 0, /* ttl */ 0, /* prettl */ 0, /* expttl */ + 0, /* norecttl */ 1, /* an */ 0, /* ns */ 0, /* ar */ @@ -2288,15 +2292,18 @@ rpz_apply_nsip_trigger(struct module_qstate* ms, struct query_info* qchase, if(action == RPZ_LOCAL_DATA_ACTION && raddr->data == NULL) { verbose(VERB_ALGO, "rpz: bug: nsip local data action but no local data"); ret = rpz_synthesize_nodata(r, ms, qchase, az); + ms->rpz_applied = 1; goto done; } switch(action) { case RPZ_NXDOMAIN_ACTION: ret = rpz_synthesize_nxdomain(r, ms, qchase, az); + ms->rpz_applied = 1; break; case RPZ_NODATA_ACTION: ret = rpz_synthesize_nodata(r, ms, qchase, 
az); + ms->rpz_applied = 1; break; case RPZ_TCP_ONLY_ACTION: /* basically a passthru here but the tcp-only will be @@ -2306,11 +2313,13 @@ rpz_apply_nsip_trigger(struct module_qstate* ms, struct query_info* qchase, break; case RPZ_DROP_ACTION: ret = rpz_synthesize_nodata(r, ms, qchase, az); + ms->rpz_applied = 1; ms->is_drop = 1; break; case RPZ_LOCAL_DATA_ACTION: ret = rpz_synthesize_nsip_localdata(r, ms, qchase, raddr, az); if(ret == NULL) { ret = rpz_synthesize_nodata(r, ms, qchase, az); } + ms->rpz_applied = 1; break; case RPZ_PASSTHRU_ACTION: ret = NULL; @@ -2318,6 +2327,7 @@ rpz_apply_nsip_trigger(struct module_qstate* ms, struct query_info* qchase, break; case RPZ_CNAME_OVERRIDE_ACTION: ret = rpz_synthesize_cname_override_msg(r, ms, qchase); + ms->rpz_applied = 1; break; default: verbose(VERB_ALGO, "rpz: nsip: bug: unhandled or invalid action: '%s'", @@ -2352,9 +2362,11 @@ rpz_apply_nsdname_trigger(struct module_qstate* ms, struct query_info* qchase, switch(action) { case RPZ_NXDOMAIN_ACTION: ret = rpz_synthesize_nxdomain(r, ms, qchase, az); + ms->rpz_applied = 1; break; case RPZ_NODATA_ACTION: ret = rpz_synthesize_nodata(r, ms, qchase, az); + ms->rpz_applied = 1; break; case RPZ_TCP_ONLY_ACTION: /* basically a passthru here but the tcp-only will be @@ -2364,11 +2376,13 @@ rpz_apply_nsdname_trigger(struct module_qstate* ms, struct query_info* qchase, break; case RPZ_DROP_ACTION: ret = rpz_synthesize_nodata(r, ms, qchase, az); + ms->rpz_applied = 1; ms->is_drop = 1; break; case RPZ_LOCAL_DATA_ACTION: ret = rpz_synthesize_nsdname_localdata(r, ms, qchase, z, match, az); if(ret == NULL) { ret = rpz_synthesize_nodata(r, ms, qchase, az); } + ms->rpz_applied = 1; break; case RPZ_PASSTHRU_ACTION: ret = NULL; @@ -2376,6 +2390,7 @@ rpz_apply_nsdname_trigger(struct module_qstate* ms, struct query_info* qchase, break; case RPZ_CNAME_OVERRIDE_ACTION: ret = rpz_synthesize_cname_override_msg(r, ms, qchase); + ms->rpz_applied = 1; break; default: verbose(VERB_ALGO, "rpz: 
nsdname: bug: unhandled or invalid action: '%s'", @@ -2579,9 +2594,11 @@ struct dns_msg* rpz_callback_from_iterator_cname(struct module_qstate* ms, switch(localzone_type_to_rpz_action(lzt)) { case RPZ_NXDOMAIN_ACTION: ret = rpz_synthesize_nxdomain(r, ms, &is->qchase, a); + ms->rpz_applied = 1; break; case RPZ_NODATA_ACTION: ret = rpz_synthesize_nodata(r, ms, &is->qchase, a); + ms->rpz_applied = 1; break; case RPZ_TCP_ONLY_ACTION: /* basically a passthru here but the tcp-only will be @@ -2591,11 +2608,13 @@ struct dns_msg* rpz_callback_from_iterator_cname(struct module_qstate* ms, break; case RPZ_DROP_ACTION: ret = rpz_synthesize_nodata(r, ms, &is->qchase, a); + ms->rpz_applied = 1; ms->is_drop = 1; break; case RPZ_LOCAL_DATA_ACTION: ret = rpz_synthesize_qname_localdata_msg(r, ms, &is->qchase, z, a); if(ret == NULL) { ret = rpz_synthesize_nodata(r, ms, &is->qchase, a); } + ms->rpz_applied = 1; break; case RPZ_PASSTHRU_ACTION: ret = NULL; |
