diff options
author | Doug Barton <dougb@FreeBSD.org> | 2009-01-09 09:58:33 +0000 |
---|---|---|
committer | Doug Barton <dougb@FreeBSD.org> | 2009-01-09 09:58:33 +0000 |
commit | 0da30d61e624fa6fbd4d6b2057244e0c9e0c213b (patch) | |
tree | 76e88c380de063d6c34d9c662df82c2ee84b3bf1 /lib/isc | |
parent | 84c329fd83557f6415f6355007797dbcbaa685b0 (diff) |
Notes
Diffstat (limited to 'lib/isc')
-rw-r--r-- | lib/isc/Makefile.in | 10 | ||||
-rw-r--r-- | lib/isc/api | 4 | ||||
-rw-r--r-- | lib/isc/assertions.c | 8 | ||||
-rw-r--r-- | lib/isc/include/isc/assertions.h | 11 | ||||
-rw-r--r-- | lib/isc/include/isc/mem.h | 76 | ||||
-rw-r--r-- | lib/isc/include/isc/msgs.h | 10 | ||||
-rw-r--r-- | lib/isc/include/isc/platform.h.in | 19 | ||||
-rw-r--r-- | lib/isc/include/isc/portset.h | 141 | ||||
-rw-r--r-- | lib/isc/include/isc/resource.h | 15 | ||||
-rw-r--r-- | lib/isc/include/isc/socket.h | 75 | ||||
-rw-r--r-- | lib/isc/include/isc/timer.h | 2 | ||||
-rw-r--r-- | lib/isc/include/isc/types.h | 7 | ||||
-rw-r--r-- | lib/isc/mem.c | 25 | ||||
-rw-r--r-- | lib/isc/portset.c | 143 | ||||
-rw-r--r-- | lib/isc/print.c | 95 | ||||
-rw-r--r-- | lib/isc/pthreads/mutex.c | 74 | ||||
-rw-r--r-- | lib/isc/timer.c | 2 | ||||
-rw-r--r-- | lib/isc/unix/app.c | 19 | ||||
-rw-r--r-- | lib/isc/unix/include/isc/net.h | 25 | ||||
-rw-r--r-- | lib/isc/unix/net.c | 161 | ||||
-rw-r--r-- | lib/isc/unix/resource.c | 8 | ||||
-rw-r--r-- | lib/isc/unix/socket.c | 1676 | ||||
-rw-r--r-- | lib/isc/unix/socket_p.h | 15 | ||||
-rw-r--r-- | lib/isc/unix/time.c | 10 |
24 files changed, 2148 insertions, 483 deletions
diff --git a/lib/isc/Makefile.in b/lib/isc/Makefile.in index 7e53510c507db..78a820d767aa8 100644 --- a/lib/isc/Makefile.in +++ b/lib/isc/Makefile.in @@ -1,7 +1,7 @@ -# Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") +# Copyright (C) 2004, 2008 Internet Systems Consortium, Inc. ("ISC") # Copyright (C) 1998-2003 Internet Software Consortium. # -# Permission to use, copy, modify, and distribute this software for any +# Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # @@ -13,7 +13,7 @@ # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # PERFORMANCE OF THIS SOFTWARE. -# $Id: Makefile.in,v 1.71.2.2.2.8 2004/07/20 07:01:58 marka Exp $ +# $Id: Makefile.in,v 1.71.2.2.2.10 2008/06/25 23:45:37 tbox Exp $ srcdir = @srcdir@ VPATH = @srcdir@ @@ -55,7 +55,7 @@ OBJS = @ISC_EXTRA_OBJS@ \ hash.@O@ heap.@O@ hex.@O@ hmacmd5.@O@ \ lex.@O@ lfsr.@O@ lib.@O@ log.@O@ md5.@O@ \ mem.@O@ mutexblock.@O@ netaddr.@O@ netscope.@O@ ondestroy.@O@ \ - parseint.@O@ quota.@O@ random.@O@ \ + parseint.@O@ portset.@O@ quota.@O@ random.@O@ \ ratelimiter.@O@ region.@O@ result.@O@ rwlock.@O@ \ serial.@O@ sha1.@O@ sockaddr.@O@ string.@O@ strtoul.@O@ \ symtab.@O@ task.@O@ taskpool.@O@ timer.@O@ version.@O@ \ @@ -68,7 +68,7 @@ SRCS = @ISC_EXTRA_SRCS@ \ heap.c hex.c hmacmd5.c \ lex.c lfsr.c lib.c log.c \ md5.c mem.c mutexblock.c netaddr.c netscope.c ondestroy.c \ - parseint.c quota.c random.c \ + parseint.c portset.c quota.c random.c \ ratelimiter.c result.c rwlock.c \ serial.c sha1.c sockaddr.c string.c strtoul.c symtab.c \ task.c taskpool.c timer.c version.c diff --git a/lib/isc/api b/lib/isc/api index a8be9070823bf..00b7f9242ef56 100644 --- a/lib/isc/api +++ b/lib/isc/api @@ -1,3 +1,3 @@ -LIBINTERFACE = 14 -LIBREVISION = 0 +LIBINTERFACE = 15 +LIBREVISION = 2 LIBAGE = 0 diff --git a/lib/isc/assertions.c b/lib/isc/assertions.c index 94c6732fd8bd1..ce4ea83c850ab 100644 --- a/lib/isc/assertions.c +++ b/lib/isc/assertions.c @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1997-2001 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: assertions.c,v 1.16.206.1 2004/03/06 08:14:27 marka Exp $ */ +/* $Id: assertions.c,v 1.16.206.3 2008/10/15 23:45:34 tbox Exp $ */ #include <config.h> @@ -28,7 +28,7 @@ /* * Forward. */ - +/* coverity[+kill] */ static void default_callback(const char *, int, isc_assertiontype_t, const char *); diff --git a/lib/isc/include/isc/assertions.h b/lib/isc/include/isc/assertions.h index 6091de9a63389..b6f68a6dfd5b1 100644 --- a/lib/isc/include/isc/assertions.h +++ b/lib/isc/include/isc/assertions.h @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1997-2001 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -16,7 +16,7 @@ */ /* - * $Id: assertions.h,v 1.17.206.1 2004/03/06 08:14:38 marka Exp $ + * $Id: assertions.h,v 1.17.206.3 2008/10/15 23:45:34 tbox Exp $ */ #ifndef ISC_ASSERTIONS_H @@ -37,6 +37,7 @@ typedef enum { typedef void (*isc_assertioncallback_t)(const char *, int, isc_assertiontype_t, const char *); +/* coverity[+kill] */ LIBISC_EXTERNAL_DATA extern isc_assertioncallback_t isc_assertion_failed; void @@ -45,14 +46,14 @@ isc_assertion_setcallback(isc_assertioncallback_t); const char * isc_assertion_typetotext(isc_assertiontype_t type); -#ifdef ISC_CHECK_ALL +#if defined(ISC_CHECK_ALL) || defined(__COVERITY__) #define ISC_CHECK_REQUIRE 1 #define ISC_CHECK_ENSURE 1 #define ISC_CHECK_INSIST 1 #define ISC_CHECK_INVARIANT 1 #endif -#ifdef ISC_CHECK_NONE +#if defined(ISC_CHECK_NONE) && !defined(__COVERITY__) #define ISC_CHECK_REQUIRE 0 #define ISC_CHECK_ENSURE 0 #define ISC_CHECK_INSIST 0 diff --git a/lib/isc/include/isc/mem.h b/lib/isc/include/isc/mem.h index bb94f5236b571..979407d89c1c3 100644 --- a/lib/isc/include/isc/mem.h +++ b/lib/isc/include/isc/mem.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004, 2007 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2007, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1997-2001, 2003 Internet Software Consortium. * * Permission to use, copy, modify, and/or distribute this software for any @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.h,v 1.54.12.7 2007/08/28 07:19:15 tbox Exp $ */ +/* $Id: mem.h,v 1.54.12.9 2008/04/28 23:45:38 tbox Exp $ */ #ifndef ISC_MEM_H #define ISC_MEM_H 1 @@ -170,11 +170,11 @@ LIBISC_EXTERNAL_DATA extern unsigned int isc_mem_debugging; #define isc_mempool_put(c, p) isc__mempool_put((c), (p) _ISC_MEM_FILELINE) #endif -isc_result_t +isc_result_t isc_mem_create(size_t max_size, size_t target_size, isc_mem_t **mctxp); -isc_result_t +isc_result_t isc_mem_createx(size_t max_size, size_t target_size, isc_memalloc_t memalloc, isc_memfree_t memfree, void *arg, isc_mem_t **mctxp); @@ -203,9 +203,9 @@ isc_mem_createx(size_t max_size, size_t target_size, * Requires: * mctxp != NULL && *mctxp == NULL */ -void +void isc_mem_attach(isc_mem_t *, isc_mem_t **); -void +void isc_mem_detach(isc_mem_t **); /* * Attach to / detach from a memory context. @@ -213,20 +213,20 @@ isc_mem_detach(isc_mem_t **); * This is intended for applications that use multiple memory contexts * in such a way that it is not obvious when the last allocations from * a given context has been freed and destroying the context is safe. - * + * * Most applications do not need to call these functions as they can * simply create a single memory context at the beginning of main() * and destroy it at the end of main(), thereby guaranteeing that it * is not destroyed while there are outstanding allocations. */ -void +void isc_mem_destroy(isc_mem_t **); /* * Destroy a memory context. */ -isc_result_t +isc_result_t isc_mem_ondestroy(isc_mem_t *ctx, isc_task_t *task, isc_event_t **event); @@ -235,13 +235,13 @@ isc_mem_ondestroy(isc_mem_t *ctx, * been successfully destroyed. */ -void +void isc_mem_stats(isc_mem_t *mctx, FILE *out); /* * Print memory usage statistics for 'mctx' on the stream 'out'. */ -void +void isc_mem_setdestroycheck(isc_mem_t *mctx, isc_boolean_t on); /* @@ -249,9 +249,9 @@ isc_mem_setdestroycheck(isc_mem_t *mctx, * destroyed and abort the program if any are present. */ -void +void isc_mem_setquota(isc_mem_t *, size_t); -size_t +size_t isc_mem_getquota(isc_mem_t *); /* * Set/get the memory quota of 'mctx'. This is a hard limit @@ -259,7 +259,7 @@ isc_mem_getquota(isc_mem_t *); * if it is exceeded, allocations will fail. */ -size_t +size_t isc_mem_inuse(isc_mem_t *mctx); /* * Get an estimate of the number of memory in use in 'mctx', in bytes. @@ -271,10 +271,28 @@ void isc_mem_setwater(isc_mem_t *mctx, isc_mem_water_t water, void *water_arg, size_t hiwater, size_t lowater); /* - * Set high and low water marks for this memory context. When the memory - * usage of 'mctx' exceeds 'hiwater', '(water)(water_arg, ISC_MEM_HIWATER)' - * will be called. When the usage drops below 'lowater', 'water' will - * again be called, this time with ISC_MEM_LOWATER. + * Set high and low water marks for this memory context. + * When the memory usage of 'mctx' exceeds 'hiwater', + * '(water)(water_arg, #ISC_MEM_HIWATER)' will be called. 'water' needs to + * call isc_mem_waterack() with #ISC_MEM_HIWATER to acknowlege the state + * change. 'water' may be called multiple times. + * + * When the usage drops below 'lowater', 'water' will again be called, this + * time with #ISC_MEM_LOWATER. 'water' need to calls isc_mem_waterack() with + * #ISC_MEM_LOWATER to acknowlege the change. + * + * static void + * water(void *arg, int mark) { + * struct foo *foo = arg; + * + * LOCK(&foo->marklock); + * if (foo->mark != mark) { + * foo->mark = mark; + * .... + * isc_mem_waterack(foo->mctx, mark); + * } + * UNLOCK(&foo->marklock); + * } * * If 'water' is NULL then 'water_arg', 'hi_water' and 'lo_water' are * ignored and the state is reset. @@ -285,6 +303,12 @@ isc_mem_setwater(isc_mem_t *mctx, isc_mem_water_t water, void *water_arg, * hi_water >= lo_water */ +void +isc_mem_waterack(isc_mem_t *ctx, int mark); +/*%< + * Called to acknowledge changes in signalled by calls to 'water'. + */ + /* * Memory pools */ @@ -429,22 +453,22 @@ isc_mempool_setfillcount(isc_mempool_t *mpctx, unsigned int limit); /* * Pseudo-private functions for use via macros. Do not call directly. */ -void * +void * isc__mem_get(isc_mem_t *, size_t _ISC_MEM_FLARG); -void +void isc__mem_putanddetach(isc_mem_t **, void *, size_t _ISC_MEM_FLARG); -void +void isc__mem_put(isc_mem_t *, void *, size_t _ISC_MEM_FLARG); -void * +void * isc__mem_allocate(isc_mem_t *, size_t _ISC_MEM_FLARG); -void +void isc__mem_free(isc_mem_t *, void * _ISC_MEM_FLARG); -char * +char * isc__mem_strdup(isc_mem_t *, const char *_ISC_MEM_FLARG); -void * +void * isc__mempool_get(isc_mempool_t * _ISC_MEM_FLARG); -void +void isc__mempool_put(isc_mempool_t *, void * _ISC_MEM_FLARG); ISC_LANG_ENDDECLS diff --git a/lib/isc/include/isc/msgs.h b/lib/isc/include/isc/msgs.h index 967005bf35317..584edcbd881c4 100644 --- a/lib/isc/include/isc/msgs.h +++ b/lib/isc/include/isc/msgs.h @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 2000-2003 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: msgs.h,v 1.5.2.2.8.3 2004/03/06 08:14:44 marka Exp $ */ +/* $Id: msgs.h,v 1.5.2.2.8.5 2008/08/08 06:27:25 tbox Exp $ */ #ifndef ISC_MSGS_H #define ISC_MSGS_H 1 @@ -146,7 +146,9 @@ #define ISC_MSG_ACCEPTRETURNED 1418 /* accept() returned %d/%s */ #define ISC_MSG_TOOMANYFDS 1419 /* %s: too many open file descriptors */ #define ISC_MSG_ZEROPORT 1420 /* dropping source port zero packet */ -#define ISC_MSG_FILTER 1420 /* setsockopt(SO_ACCEPTFILTER): %s */ +#define ISC_MSG_FILTER 1421 /* setsockopt(SO_ACCEPTFILTER): %s */ + +#define ISC_MSG_TOOMANYHANDLES 1422 /*%< %s: too many open WSA event handles: %s */ #define ISC_MSG_AWAKE 1502 /* "awake" */ #define ISC_MSG_WORKING 1503 /* "working" */ diff --git a/lib/isc/include/isc/platform.h.in b/lib/isc/include/isc/platform.h.in index 9c4edabb5facc..d1877931d4252 100644 --- a/lib/isc/include/isc/platform.h.in +++ b/lib/isc/include/isc/platform.h.in @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004, 2007 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2007, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2003 Internet Software Consortium. * * Permission to use, copy, modify, and/or distribute this software for any @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: platform.h.in,v 1.24.2.1.10.13 2007/09/13 23:45:58 tbox Exp $ */ +/* $Id: platform.h.in,v 1.24.2.1.10.15 2008/06/25 23:45:37 tbox Exp $ */ #ifndef ISC_PLATFORM_H #define ISC_PLATFORM_H 1 @@ -142,6 +142,21 @@ *** Printing. ***/ +/*! \brief + * Define if the system supports kqueue multiplexing + */ +@ISC_PLATFORM_HAVEKQUEUE@ + +/*! \brief + * Define if the system supports epoll multiplexing + */ +@ISC_PLATFORM_HAVEEPOLL@ + +/*! \brief + * Define if the system supports /dev/poll multiplexing + */ +@ISC_PLATFORM_HAVEDEVPOLL@ + /* * If this system needs vsnprintf() and snprintf(), ISC_PLATFORM_NEEDVSNPRINTF * will be defined. diff --git a/lib/isc/include/isc/portset.h b/lib/isc/include/isc/portset.h new file mode 100644 index 0000000000000..60cbb790cbe8a --- /dev/null +++ b/lib/isc/include/isc/portset.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2008 Internet Systems Consortium, Inc. ("ISC") + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* $Id: portset.h,v 1.3.6.2 2008/06/25 23:45:37 tbox Exp $ */ + +/*! \file isc/portset.h + * \brief Transport Protocol Port Manipuration Module + * + * This module provides simple utilities to handle a set of transport protocol + * (UDP or TCP) port numbers, e.g., for creating an ACL list. An isc_portset_t + * object is an opaque instance of a port set, for which the user can add or + * remove a specific port or a range of consecutive ports. This object is + * expected to be used as a temporary work space only, and does not protect + * simultaneous access from multiple threads. Therefore it must not be stored + * in a place that can be accessed from multiple threads. + */ + +#ifndef ISC_PORTSET_H +#define ISC_PORTSET_H 1 + +/*** + *** Imports + ***/ + +#include <isc/net.h> + +/*** + *** Functions + ***/ + +ISC_LANG_BEGINDECLS + +isc_result_t +isc_portset_create(isc_mem_t *mctx, isc_portset_t **portsetp); +/*%< + * Create a port set and initialize it as an empty set. + * + * Requires: + *\li 'mctx' to be valid. + *\li 'portsetp' to be non NULL and '*portsetp' to be NULL; + * + * Returns: + *\li #ISC_R_SUCCESS + *\li #ISC_R_NOMEMORY + */ + +void +isc_portset_destroy(isc_mem_t *mctx, isc_portset_t **portsetp); +/*%< + * Destroy a port set. + * + * Requires: + *\li 'mctx' to be valid and must be the same context given when the port set + * was created. + *\li '*portsetp' to be a valid set. + */ + +isc_boolean_t +isc_portset_isset(isc_portset_t *portset, in_port_t port); +/*%< + * Test whether the given port is stored in the portset. + * + * Requires: + *\li 'portset' to be a valid set. + * + * Returns + * \li #ISC_TRUE if the port is found, ISC_FALSE otherwise. + */ + +unsigned int +isc_portset_nports(isc_portset_t *portset); +/*%< + * Provides the number of ports stored in the given portset. + * + * Requires: + *\li 'portset' to be a valid set. + * + * Returns + * \li the number of ports stored in portset. + */ + +void +isc_portset_add(isc_portset_t *portset, in_port_t port); +/*%< + * Add the given port to the portset. The port may or may not be stored in + * the portset. + * + * Requires: + *\li 'portlist' to be valid. + */ + +void +isc_portset_remove(isc_portset_t *portset, in_port_t port); +/*%< + * Remove the given port to the portset. The port may or may not be stored in + * the portset. + * + * Requires: + *\li 'portlist' to be valid. + */ + +void +isc_portset_addrange(isc_portset_t *portset, in_port_t port_lo, + in_port_t port_hi); +/*%< + * Add a subset of [port_lo, port_hi] (inclusive) to the portset. Ports in the + * subset may or may not be stored in portset. + * + * Requires: + *\li 'portlist' to be valid. + *\li port_lo <= port_hi + */ + +void +isc_portset_removerange(isc_portset_t *portset, in_port_t port_lo, + in_port_t port_hi); +/*%< + * Subtract a subset of [port_lo, port_hi] (inclusive) from the portset. Ports + * in the subset may or may not be stored in portset. + * + * Requires: + *\li 'portlist' to be valid. + *\li port_lo <= port_hi + */ + +ISC_LANG_ENDDECLS + +#endif /* ISC_NETADDR_H */ diff --git a/lib/isc/include/isc/resource.h b/lib/isc/include/isc/resource.h index 1d4cb3023a4a3..fbd9e287ae849 100644 --- a/lib/isc/include/isc/resource.h +++ b/lib/isc/include/isc/resource.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: resource.h,v 1.4.206.1.34.2 2008/07/23 23:47:49 tbox Exp $ */ +/* $Id: resource.h,v 1.4.206.3 2008/08/01 23:45:29 tbox Exp $ */ #ifndef ISC_RESOURCE_H #define ISC_RESOURCE_H 1 @@ -80,16 +80,13 @@ isc_resource_getlimit(isc_resource_t resource, isc_resourcevalue_t *value); */ isc_result_t -isc_resource_curlimit(isc_resource_t resource, isc_resourcevalue_t *value); -/* - * Get the current limit on a resource. - * - * Requires: - * 'resource' is a valid member of the isc_resource_t enumeration. +isc_resource_getcurlimit(isc_resource_t resource, isc_resourcevalue_t *value); +/*%< + * Same as isc_resource_getlimit(), but returns the current (soft) limit. * * Returns: - * ISC_R_SUCCESS Success. - * ISC_R_NOTIMPLEMENTED 'resource' is not a type known by the OS. + *\li #ISC_R_SUCCESS Success. + *\li #ISC_R_NOTIMPLEMENTED 'resource' is not a type known by the OS. */ ISC_LANG_ENDDECLS diff --git a/lib/isc/include/isc/socket.h b/lib/isc/include/isc/socket.h index 0537a0d6693bf..73ffb2daffd91 100644 --- a/lib/isc/include/isc/socket.h +++ b/lib/isc/include/isc/socket.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket.h,v 1.54.12.7.4.4 2008/07/23 23:16:27 marka Exp $ */ +/* $Id: socket.h,v 1.54.12.16 2008/09/11 06:14:46 each Exp $ */ #ifndef ISC_SOCKET_H #define ISC_SOCKET_H 1 @@ -103,6 +103,7 @@ struct isc_socketevent { isc_time_t timestamp; /* timestamp of packet recv */ struct in6_pktinfo pktinfo; /* ipv6 pktinfo */ isc_uint32_t attributes; /* see below */ + isc_eventdestructor_t destroy; /* original destructor */ }; typedef struct isc_socket_newconnev isc_socket_newconnev_t; @@ -312,8 +313,53 @@ isc_socket_detach(isc_socket_t **socketp); */ isc_result_t +isc_socket_open(isc_socket_t *sock); +/* + * Open a new socket file descriptor of the given socket structure. It simply + * opens a new descriptor; all of the other parameters including the socket + * type are inherited from the existing socket. This function is provided to + * avoid overhead of destroying and creating sockets when many short-lived + * sockets are frequently opened and closed. When the efficiency is not an + * issue, it should be safer to detach the unused socket and re-create a new + * one. This optimization may not be available for some systems, in which + * case this function will return ISC_R_NOTIMPLEMENTED and must not be used. + * + * Requires: + * + * \li there must be no other reference to this socket. + * + * \li 'socket' is a valid and previously closed by isc_socket_close() + * + * Returns: + * Same as isc_socket_create(). + * \li ISC_R_NOTIMPLEMENTED + */ + +isc_result_t +isc_socket_close(isc_socket_t *sock); +/* + * Close a socket file descriptor of the given socket structure. This function + * is provided as an alternative to destroying an unused socket when overhead + * destroying/re-creating sockets can be significant, and is expected to be + * used with isc_socket_open(). This optimization may not be available for some + * systems, in which case this function will return ISC_R_NOTIMPLEMENTED and + * must not be used. + * + * Requires: + * + * \li The socket must have a valid descriptor. + * + * \li There must be no other reference to this socket. + * + * \li There must be no pending I/O requests. + * + * Returns: + * \li #ISC_R_NOTIMPLEMENTED + */ + +isc_result_t isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *addressp, - unsigned int reuseaddr); + unsigned int options); /* * Bind 'socket' to '*addressp'. * @@ -624,6 +670,7 @@ isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, * ISC_R_INPROGRESS * ISC_R_NOMEMORY * ISC_R_UNEXPECTED + * ISC_R_NOTIMPLEMENTED * * Event results: * @@ -634,8 +681,14 @@ isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_result_t isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp); + +isc_result_t +isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, + unsigned int maxsocks); /* - * Create a socket manager. + * Create a socket manager. If "maxsocks" is non-zero, it specifies the + * maximum number of sockets that the created manager should handle. + * isc_socketmgr_create() is equivalent of isc_socketmgr_create2() with * * Notes: * @@ -658,6 +711,22 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp); * ISC_R_UNEXPECTED */ +isc_result_t +isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp); +/*%< + * Returns in "*nsockp" the maximum number of sockets this manager may open. + * + * Requires: + * + *\li '*manager' is a valid isc_socketmgr_t. + *\li 'nsockp' is not NULL. + * + * Returns: + * + *\li #ISC_R_SUCCESS + *\li #ISC_R_NOTIMPLEMENTED + */ + void isc_socketmgr_destroy(isc_socketmgr_t **managerp); /* diff --git a/lib/isc/include/isc/timer.h b/lib/isc/include/isc/timer.h index 9d4cc00000a83..4b39c6220b609 100644 --- a/lib/isc/include/isc/timer.h +++ b/lib/isc/include/isc/timer.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: timer.h,v 1.28.12.9.4.2 2008/07/24 23:47:44 tbox Exp $ */ +/* $Id: timer.h,v 1.28.12.11 2008/06/25 23:45:37 tbox Exp $ */ #ifndef ISC_TIMER_H #define ISC_TIMER_H 1 diff --git a/lib/isc/include/isc/types.h b/lib/isc/include/isc/types.h index fad77da99e7be..d5f010092889e 100644 --- a/lib/isc/include/isc/types.h +++ b/lib/isc/include/isc/types.h @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2003 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: types.h,v 1.32.2.3.2.1 2004/03/06 08:14:50 marka Exp $ */ +/* $Id: types.h,v 1.32.2.3.2.3 2008/06/25 23:45:37 tbox Exp $ */ #ifndef ISC_TYPES_H #define ISC_TYPES_H 1 @@ -66,6 +66,7 @@ typedef struct isc_mempool isc_mempool_t; typedef struct isc_msgcat isc_msgcat_t; typedef struct isc_ondestroy isc_ondestroy_t; typedef struct isc_netaddr isc_netaddr_t; +typedef struct isc_portset isc_portset_t; typedef struct isc_quota isc_quota_t; typedef struct isc_random isc_random_t; typedef struct isc_ratelimiter isc_ratelimiter_t; diff --git a/lib/isc/mem.c b/lib/isc/mem.c index 8bfe967295c9b..69f6cab9ea7b5 100644 --- a/lib/isc/mem.c +++ b/lib/isc/mem.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004-2007 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004-2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1997-2003 Internet Software Consortium. * * Permission to use, copy, modify, and/or distribute this software for any @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.c,v 1.98.2.7.2.12 2007/11/26 23:45:51 tbox Exp $ */ +/* $Id: mem.c,v 1.98.2.7.2.14 2008/04/28 23:45:37 tbox Exp $ */ #include <config.h> @@ -604,7 +604,7 @@ mem_get(isc_mem_t *ctx, size_t size) { ret = (ctx->memalloc)(ctx->arg, size); if (ret == NULL) - ctx->memalloc_failures++; + ctx->memalloc_failures++; #if ISC_MEM_FILL if (ret != NULL) @@ -1036,7 +1036,6 @@ isc__mem_get(isc_mem_t *ctx, size_t size FLARG) { ADD_TRACE(ctx, ptr, size, file, line); if (ctx->hi_water != 0U && !ctx->hi_called && ctx->inuse > ctx->hi_water) { - ctx->hi_called = ISC_TRUE; call_water = ISC_TRUE; } if (ctx->inuse > ctx->maxinuse) { @@ -1078,10 +1077,8 @@ isc__mem_put(isc_mem_t *ctx, void *ptr, size_t size FLARG) * when the context was pushed over hi_water but then had * isc_mem_setwater() called with 0 for hi_water and lo_water. */ - if (ctx->hi_called && + if (ctx->hi_called && (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { - ctx->hi_called = ISC_FALSE; - if (ctx->water != NULL) call_water = ISC_TRUE; } @@ -1091,6 +1088,18 @@ isc__mem_put(isc_mem_t *ctx, void *ptr, size_t size FLARG) (ctx->water)(ctx->water_arg, ISC_MEM_LOWATER); } +void +isc_mem_waterack(isc_mem_t *ctx, int flag) { + REQUIRE(VALID_CONTEXT(ctx)); + + LOCK(&ctx->lock); + if (flag == ISC_MEM_LOWATER) + ctx->hi_called = ISC_FALSE; + else if (flag == ISC_MEM_HIWATER) + ctx->hi_called = ISC_TRUE; + UNLOCK(&ctx->lock); +} + #if ISC_MEM_TRACKLINES static void print_active(isc_mem_t *mctx, FILE *out) { @@ -1110,7 +1119,7 @@ print_active(isc_mem_t *mctx, FILE *out) { "\tptr %p size %u file %s line %u\n"); for (i = 0; i <= mctx->max_size; i++) { dl = ISC_LIST_HEAD(mctx->debuglist[i]); - + if (dl != NULL) found = ISC_TRUE; diff --git a/lib/isc/portset.c b/lib/isc/portset.c new file mode 100644 index 0000000000000..90e95968fc3b6 --- /dev/null +++ b/lib/isc/portset.c @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2008 Internet Systems Consortium, Inc. ("ISC") + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* $Id: portset.c,v 1.4.2.1 2008/06/25 00:03:29 jinmei Exp $ */ + +/*! \file */ + +#include <config.h> + +#include <isc/mem.h> +#include <isc/portset.h> +#include <isc/string.h> +#include <isc/types.h> +#include <isc/util.h> + +#define ISC_PORTSET_BUFSIZE (65536 / (sizeof(isc_uint32_t) * 8)) + +/*% + * Internal representation of portset. It's an array of 32-bit integers, each + * bit corresponding to a single port in the ascending order. For example, + * the second most significant bit of buf[0] corresponds to port 1. + */ +struct isc_portset { + unsigned int nports; /*%< number of ports in the set */ + isc_uint32_t buf[ISC_PORTSET_BUFSIZE]; +}; + +static inline isc_boolean_t +portset_isset(isc_portset_t *portset, in_port_t port) { + return (ISC_TF((portset->buf[port >> 5] & (1 << (port & 31))) != 0)); +} + +static inline void +portset_add(isc_portset_t *portset, in_port_t port) { + if (!portset_isset(portset, port)) { + portset->nports++; + portset->buf[port >> 5] |= (1 << (port & 31)); + } +} + +static inline void +portset_remove(isc_portset_t *portset, in_port_t port) { + if (portset_isset(portset, port)) { + portset->nports--; + portset->buf[port >> 5] &= ~(1 << (port & 31)); + } +} + +isc_result_t +isc_portset_create(isc_mem_t *mctx, isc_portset_t **portsetp) { + isc_portset_t *portset; + + REQUIRE(portsetp != NULL && *portsetp == NULL); + + portset = isc_mem_get(mctx, sizeof(*portset)); + if (portset == NULL) + return (ISC_R_NOMEMORY); + + /* Make the set 'empty' by default */ + memset(portset, 0, sizeof(*portset)); + *portsetp = portset; + + return (ISC_R_SUCCESS); +} + +void +isc_portset_destroy(isc_mem_t *mctx, isc_portset_t **portsetp) { + isc_portset_t *portset; + + REQUIRE(portsetp != NULL); + portset = *portsetp; + + isc_mem_put(mctx, portset, sizeof(*portset)); +} + +isc_boolean_t +isc_portset_isset(isc_portset_t *portset, in_port_t port) { + REQUIRE(portset != NULL); + + return (portset_isset(portset, port)); +} + +unsigned int +isc_portset_nports(isc_portset_t *portset) { + REQUIRE(portset != NULL); + + return (portset->nports); +} + +void +isc_portset_add(isc_portset_t *portset, in_port_t port) { + REQUIRE(portset != NULL); + + portset_add(portset, port); +} + +void +isc_portset_remove(isc_portset_t *portset, in_port_t port) { + portset_remove(portset, port); +} + +void +isc_portset_addrange(isc_portset_t *portset, in_port_t port_lo, + in_port_t port_hi) +{ + in_port_t p; + + REQUIRE(portset != NULL); + REQUIRE(port_lo <= port_hi); + + p = port_lo; + do { + portset_add(portset, p); + } while (p++ < port_hi); +} + +void +isc_portset_removerange(isc_portset_t *portset, in_port_t port_lo, + in_port_t port_hi) +{ + in_port_t p; + + REQUIRE(portset != NULL); + REQUIRE(port_lo <= port_hi); + + p = port_lo; + do { + portset_remove(portset, p); + } while (p++ < port_hi); +} diff --git a/lib/isc/print.c b/lib/isc/print.c index ee50b29e5d6d1..e4e4a709f7c47 100644 --- a/lib/isc/print.c +++ b/lib/isc/print.c @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004-2006 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004-2006, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2001, 2003 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: print.c,v 1.22.2.3.2.4 2006/04/17 18:27:20 explorer Exp $ */ +/* $Id: print.c,v 1.22.2.3.2.6 2008/04/28 23:45:38 tbox Exp $ */ /*! \file */ @@ -246,8 +246,24 @@ isc_print_vsnprintf(char *str, size_t size, const char *format, va_list ap) { head = ""; tmpui = tmpi; } - sprintf(buf, "%" ISC_PRINT_QUADFORMAT "u", - tmpui); + if (tmpui <= 0xffffffffU) + sprintf(buf, "%lu", + (unsigned long)tmpui); + else { + unsigned long mid; + unsigned long lo; + unsigned long hi; + lo = tmpui % 1000000000; + tmpui /= 1000000000; + mid = tmpui % 1000000000; + hi = tmpui / 1000000000; + if (hi != 0) + sprintf(buf, "%lu", hi); + else + buf[0] = '\n'; + sprintf(buf + strlen(buf), "%lu", mid); + sprintf(buf + strlen(buf), "%lu", lo); + } goto printint; case 'o': if (q) @@ -256,10 +272,29 @@ isc_print_vsnprintf(char *str, size_t size, const char *format, va_list ap) { tmpui = va_arg(ap, long int); else tmpui = va_arg(ap, int); - sprintf(buf, - alt ? "%#" ISC_PRINT_QUADFORMAT "o" - : "%" ISC_PRINT_QUADFORMAT "o", - tmpui); + if (tmpui <= 0xffffffffU) + sprintf(buf, alt ? "%#lo" : "%lo", + (unsigned long)tmpui); + else { + unsigned long mid; + unsigned long lo; + unsigned long hi; + lo = tmpui % 010000000000; + tmpui /= 010000000000; + mid = tmpui % 010000000000; + hi = tmpui / 010000000000; + if (hi != 0) { + sprintf(buf, + alt ? "%#lo" : "%lo", + hi); + sprintf(buf + strlen(buf), + "%lo", mid); + } else + sprintf(buf, + alt ? "%#lo" : "%lo", + mid); + sprintf(buf + strlen(buf), "%lo", lo); + } goto printint; case 'u': if (q) @@ -268,8 +303,24 @@ isc_print_vsnprintf(char *str, size_t size, const char *format, va_list ap) { tmpui = va_arg(ap, unsigned long int); else tmpui = va_arg(ap, unsigned int); - sprintf(buf, "%" ISC_PRINT_QUADFORMAT "u", - tmpui); + if (tmpui <= 0xffffffffU) + sprintf(buf, "%lu", + (unsigned long)tmpui); + else { + unsigned long mid; + unsigned long lo; + unsigned long hi; + lo = tmpui % 1000000000; + tmpui /= 1000000000; + mid = tmpui % 1000000000; + hi = tmpui / 1000000000; + if (hi != 0) + sprintf(buf, "%lu", hi); + else + buf[0] = '\n'; + sprintf(buf + strlen(buf), "%lu", mid); + sprintf(buf + strlen(buf), "%lu", lo); + } goto printint; case 'x': if (q) @@ -283,8 +334,15 @@ isc_print_vsnprintf(char *str, size_t size, const char *format, va_list ap) { if (precision > 2) precision -= 2; } - sprintf(buf, "%" ISC_PRINT_QUADFORMAT "x", - tmpui); + if (tmpui <= 0xffffffffU) + sprintf(buf, "%lx", + (unsigned long)tmpui); + else { + unsigned long hi = tmpui>>32; + unsigned long lo = tmpui & 0xffffffff; + sprintf(buf, "%lx", hi); + sprintf(buf + strlen(buf), "%lx", lo); + } goto printint; case 'X': if (q) @@ -298,8 +356,15 @@ isc_print_vsnprintf(char *str, size_t size, const char *format, va_list ap) { if (precision > 2) precision -= 2; } - sprintf(buf, "%" ISC_PRINT_QUADFORMAT "X", - tmpui); + if (tmpui <= 0xffffffffU) + sprintf(buf, "%lX", + (unsigned long)tmpui); + else { + unsigned long hi = tmpui>>32; + unsigned long lo = tmpui & 0xffffffff; + sprintf(buf, "%lX", hi); + sprintf(buf + strlen(buf), "%lX", lo); + } goto printint; printint: if (precision != 0 || width != 0) { diff --git a/lib/isc/pthreads/mutex.c b/lib/isc/pthreads/mutex.c index 17cca53fc6a72..ef7db0de9709d 100644 --- a/lib/isc/pthreads/mutex.c +++ b/lib/isc/pthreads/mutex.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004, 2005, 2007 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2005, 2007, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 2000-2003 Internet Software Consortium. * * Permission to use, copy, modify, and/or distribute this software for any @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mutex.c,v 1.6.26.8 2007/08/28 07:19:17 tbox Exp $ */ +/* $Id: mutex.c,v 1.6.26.10 2008/04/04 23:45:32 tbox Exp $ */ #include <config.h> @@ -31,23 +31,23 @@ /* Operations on timevals; adapted from FreeBSD's sys/time.h */ #define timevalclear(tvp) ((tvp)->tv_sec = (tvp)->tv_usec = 0) #define timevaladd(vvp, uvp) \ - do { \ - (vvp)->tv_sec += (uvp)->tv_sec; \ - (vvp)->tv_usec += (uvp)->tv_usec; \ - if ((vvp)->tv_usec >= 1000000) { \ - (vvp)->tv_sec++; \ - (vvp)->tv_usec -= 1000000; \ - } \ - } while (0) + do { \ + (vvp)->tv_sec += (uvp)->tv_sec; \ + (vvp)->tv_usec += (uvp)->tv_usec; \ + if ((vvp)->tv_usec >= 1000000) { \ + (vvp)->tv_sec++; \ + (vvp)->tv_usec -= 1000000; \ + } \ + } while (0) #define timevalsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_usec -= (uvp)->tv_usec; \ - if ((vvp)->tv_usec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_usec += 1000000; \ - } \ - } while (0) + do { \ + (vvp)->tv_sec -= (uvp)->tv_sec; \ + (vvp)->tv_usec -= (uvp)->tv_usec; \ + if ((vvp)->tv_usec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_usec += 1000000; \ + } \ + } while (0) #define ISC_MUTEX_MAX_LOCKERS 32 @@ -70,8 +70,11 @@ struct isc_mutexstats { isc_mutexlocker_t lockers[ISC_MUTEX_MAX_LOCKERS]; }; -#define TABLESIZE (8 * 1024) -static isc_mutexstats_t stats[TABLESIZE]; +#ifndef ISC_MUTEX_PROFTABLESIZE +#define ISC_MUTEX_PROFTABLESIZE (16 * 1024) +#endif +static isc_mutexstats_t stats[ISC_MUTEX_PROFTABLESIZE]; +static int stats_next = 0; static isc_boolean_t stats_init = ISC_FALSE; static pthread_mutex_t statslock = PTHREAD_MUTEX_INITIALIZER; @@ -85,21 +88,19 @@ isc_mutex_init_profile(isc_mutex_t *mp, const char *file, int line) { RUNTIME_CHECK(pthread_mutex_lock(&statslock) == 0); - if (stats_init == ISC_FALSE) { - for (i = 0; i < TABLESIZE; i++) { - stats[i].file = NULL; - } + if (stats_init == ISC_FALSE) stats_init = ISC_TRUE; - } - mp->stats = NULL; - for (i = 0; i < TABLESIZE; i++) { - if (stats[i].file == NULL) { - mp->stats = &stats[i]; - break; - } - } - RUNTIME_CHECK(mp->stats != NULL); + /* + * If all statistics entries have been used, give up and trigger an + * assertion failure. There would be no other way to deal with this + * because we'd like to keep record of all locks for the purpose of + * debugging and the number of necessary locks is unpredictable. + * If this failure is triggered while debugging, named should be + * rebuilt with an increased ISC_MUTEX_PROFTABLESIZE. + */ + RUNTIME_CHECK(stats_next < ISC_MUTEX_PROFTABLESIZE); + mp->stats = &stats[stats_next++]; RUNTIME_CHECK(pthread_mutex_unlock(&statslock) == 0); @@ -186,10 +187,9 @@ void isc_mutex_statsprofile(FILE *fp) { isc_mutexlocker_t *locker; int i, j; + fprintf(fp, "Mutex stats (in us)\n"); - for (i = 0; i < TABLESIZE; i++) { - if (stats[i].file == NULL) - continue; + for (i = 0; i < stats_next; i++) { fprintf(fp, "%-12s %4d: %10u %lu.%06lu %lu.%06lu\n", stats[i].file, stats[i].line, stats[i].count, stats[i].locked_total.tv_sec, @@ -225,7 +225,7 @@ isc_mutex_init_errcheck(isc_mutex_t *mp) if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK) != 0) return ISC_R_UNEXPECTED; - + if (pthread_mutex_init(mp, &attr) != 0) return ISC_R_UNEXPECTED; diff --git a/lib/isc/timer.c b/lib/isc/timer.c index 990f2943cd6cf..75508563ca1df 100644 --- a/lib/isc/timer.c +++ b/lib/isc/timer.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: timer.c,v 1.64.12.17.4.3 2008/07/29 18:34:29 jinmei Exp $ */ +/* $Id: timer.c,v 1.64.12.20 2008/08/22 05:59:24 marka Exp $ */ #include <config.h> diff --git a/lib/isc/unix/app.c b/lib/isc/unix/app.c index 382e445ead5c9..daf4b2f495e76 100644 --- a/lib/isc/unix/app.c +++ b/lib/isc/unix/app.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: app.c,v 1.43.2.3.8.8.4.1 2008/07/29 04:43:57 each Exp $ */ +/* $Id: app.c,v 1.43.2.3.8.11 2008/10/15 03:41:17 marka Exp $ */ #include <config.h> @@ -28,6 +28,9 @@ #include <unistd.h> #include <signal.h> #include <sys/time.h> +#ifdef HAVE_EPOLL +#include <sys/epoll.h> +#endif #include <isc/app.h> #include <isc/boolean.h> @@ -295,14 +298,13 @@ isc_app_onrun(isc_mem_t *mctx, isc_task_t *task, isc_taskaction_t action, * Event loop for nonthreaded programs. */ static isc_result_t -evloop() { +evloop(void) { isc_result_t result; while (!want_shutdown) { int n; isc_time_t when, now; struct timeval tv, *tvp; - fd_set *readfds, *writefds; - int maxfd; + isc_socketwait_t *swait; isc_boolean_t readytasks; isc_boolean_t call_timer_dispatch = ISC_FALSE; @@ -329,8 +331,8 @@ evloop() { } } - isc__socketmgr_getfdsets(&readfds, &writefds, &maxfd); - n = select(maxfd, readfds, writefds, NULL, tvp); + swait = NULL; + n = isc__socketmgr_waitevents(tvp, &swait); if (n == 0 || call_timer_dispatch) { /* @@ -350,8 +352,7 @@ evloop() { isc__timermgr_dispatch(); } if (n > 0) - (void)isc__socketmgr_dispatch(readfds, writefds, - maxfd); + (void)isc__socketmgr_dispatch(swait); (void)isc__taskmgr_dispatch(); if (want_reload) { @@ -432,10 +433,10 @@ isc_app_run(void) { #ifdef ISC_PLATFORM_USETHREADS sigset_t sset; char strbuf[ISC_STRERRORSIZE]; -#endif /* ISC_PLATFORM_USETHREADS */ #ifdef HAVE_SIGWAIT int sig; #endif +#endif /* ISC_PLATFORM_USETHREADS */ #ifdef HAVE_LINUXTHREADS REQUIRE(main_thread == pthread_self()); diff --git a/lib/isc/unix/include/isc/net.h b/lib/isc/unix/include/isc/net.h index f1a015f5bb1d7..01c808fea611e 100644 --- a/lib/isc/unix/include/isc/net.h +++ b/lib/isc/unix/include/isc/net.h @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2003 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: net.h,v 1.31.2.2.10.8 2004/04/29 01:31:23 marka Exp $ */ +/* $Id: net.h,v 1.31.2.2.10.10 2008/06/25 23:45:37 tbox Exp $ */ #ifndef ISC_NET_H #define ISC_NET_H 1 @@ -101,7 +101,7 @@ /* * Required for some pre RFC2133 implementations. * IN6ADDR_ANY_INIT and IN6ADDR_LOOPBACK_INIT were added in - * draft-ietf-ipngwg-bsd-api-04.txt or draft-ietf-ipngwg-bsd-api-05.txt. + * draft-ietf-ipngwg-bsd-api-04.txt or draft-ietf-ipngwg-bsd-api-05.txt. * If 's6_addr' is defined then assume that there is a union and three * levels otherwise assume two levels required. */ @@ -303,6 +303,23 @@ isc_net_enableipv4(void); void isc_net_enableipv6(void); +isc_result_t +isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high); +/* + * Returns system's default range of ephemeral UDP ports, if defined. + * If the range is not available or unknown, ISC_NET_PORTRANGELOW and + * ISC_NET_PORTRANGEHIGH will be returned. + * + * Requires: + * + *\li 'low' and 'high' must be non NULL. + * + * Returns: + * + *\li *low and *high will be the ports specifying the low and high ends of + * the range. + */ + #ifdef ISC_PLATFORM_NEEDNTOP const char * isc_net_ntop(int af, const void *src, char *dst, size_t size); diff --git a/lib/isc/unix/net.c b/lib/isc/unix/net.c index 42cadec7d7617..cd3abc64ed00e 100644 --- a/lib/isc/unix/net.c +++ b/lib/isc/unix/net.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004, 2005, 2007 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2005, 2007, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2003 Internet Software Consortium. * * Permission to use, copy, modify, and/or distribute this software for any @@ -15,10 +15,19 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: net.c,v 1.22.2.2.10.11 2007/09/13 23:45:58 tbox Exp $ */ +/* $Id: net.c,v 1.22.2.2.10.15 2008/07/04 23:45:39 tbox Exp $ */ #include <config.h> +#include <sys/types.h> + +#if defined(HAVE_SYS_SYSCTL_H) +#if defined(HAVE_SYS_PARAM_H) +#include <sys/param.h> +#endif +#include <sys/sysctl.h> +#endif + #include <errno.h> #include <unistd.h> @@ -30,6 +39,59 @@ #include <isc/string.h> #include <isc/util.h> +/* + * Definitions about UDP port range specification. This is a total mess of + * portability variants: some use sysctl (but the sysctl names vary), some use + * system-specific interfaces, some have the same interface for IPv4 and IPv6, + * some separate them, etc... + */ + +/* + * The last resort defaults: use all non well known port space + */ +#ifndef ISC_NET_PORTRANGELOW +#define ISC_NET_PORTRANGELOW 1024 +#endif /* ISC_NET_PORTRANGELOW */ +#ifndef ISC_NET_PORTRANGEHIGH +#define ISC_NET_PORTRANGEHIGH 65535 +#endif /* ISC_NET_PORTRANGEHIGH */ + +#ifdef HAVE_SYSCTLBYNAME + +/* + * sysctl variants + */ +#if defined(__FreeBSD__) || defined(__APPLE__) || defined(__DragonFly__) +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portrange.hifirst" +#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.portrange.hilast" +#define SYSCTL_V6PORTRANGE_LOW "net.inet.ip.portrange.hifirst" +#define SYSCTL_V6PORTRANGE_HIGH "net.inet.ip.portrange.hilast" +#endif + +#ifdef __NetBSD__ +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.anonportmin" +#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.anonportmax" +#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.anonportmin" +#define SYSCTL_V6PORTRANGE_HIGH "net.inet6.ip6.anonportmax" +#endif + +#else /* !HAVE_SYSCTLBYNAME */ + +#ifdef __OpenBSD__ +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW { CTL_NET, PF_INET, IPPROTO_IP, \ + IPCTL_IPPORT_HIFIRSTAUTO } +#define SYSCTL_V4PORTRANGE_HIGH { CTL_NET, PF_INET, IPPROTO_IP, \ + IPCTL_IPPORT_HILASTAUTO } +/* Same for IPv6 */ +#define SYSCTL_V6PORTRANGE_LOW SYSCTL_V4PORTRANGE_LOW +#define SYSCTL_V6PORTRANGE_HIGH SYSCTL_V4PORTRANGE_HIGH +#endif + +#endif /* HAVE_SYSCTLBYNAME */ + #if defined(ISC_PLATFORM_HAVEIPV6) # if defined(ISC_PLATFORM_NEEDIN6ADDRANY) const struct in6_addr isc_net_in6addrany = IN6ADDR_ANY_INIT; @@ -328,6 +390,101 @@ isc_net_probe_ipv6pktinfo(void) { return (ipv6pktinfo_result); } +#if defined(USE_SYSCTL_PORTRANGE) +#if defined(HAVE_SYSCTLBYNAME) +static isc_result_t +getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { + int port_low, port_high; + size_t portlen; + const char *sysctlname_lowport, *sysctlname_hiport; + + if (af == AF_INET) { + sysctlname_lowport = SYSCTL_V4PORTRANGE_LOW; + sysctlname_hiport = SYSCTL_V4PORTRANGE_HIGH; + } else { + sysctlname_lowport = SYSCTL_V6PORTRANGE_LOW; + sysctlname_hiport = SYSCTL_V6PORTRANGE_HIGH; + } + portlen = sizeof(portlen); + if (sysctlbyname(sysctlname_lowport, &port_low, &portlen, + NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + portlen = sizeof(portlen); + if (sysctlbyname(sysctlname_hiport, &port_high, &portlen, + NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0) + return (ISC_R_RANGE); + + *low = (in_port_t)port_low; + *high = (in_port_t)port_high; + + return (ISC_R_SUCCESS); +} +#else /* !HAVE_SYSCTLBYNAME */ +static isc_result_t +getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { + int mib_lo4[4] = SYSCTL_V4PORTRANGE_LOW; + int mib_hi4[4] = SYSCTL_V4PORTRANGE_HIGH; + int mib_lo6[4] = SYSCTL_V6PORTRANGE_LOW; + int mib_hi6[4] = SYSCTL_V6PORTRANGE_HIGH; + int *mib_lo, *mib_hi, miblen; + int port_low, port_high; + size_t portlen; + + if (af == AF_INET) { + mib_lo = mib_lo4; + mib_hi = mib_hi4; + miblen = sizeof(mib_lo4) / sizeof(mib_lo4[0]); + } else { + mib_lo = mib_lo6; + mib_hi = mib_hi6; + miblen = sizeof(mib_lo6) / sizeof(mib_lo6[0]); + } + + portlen = sizeof(portlen); + if (sysctl(mib_lo, miblen, &port_low, &portlen, NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + + portlen = sizeof(portlen); + if (sysctl(mib_hi, miblen, &port_high, &portlen, NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + + if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0) + return (ISC_R_RANGE); + + *low = (in_port_t) port_low; + *high = (in_port_t) port_high; + + return (ISC_R_SUCCESS); +} +#endif /* HAVE_SYSCTLBYNAME */ +#endif /* USE_SYSCTL_PORTRANGE */ + +isc_result_t +isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) { + int result = ISC_R_FAILURE; + + REQUIRE(low != NULL && high != NULL); + +#if defined(USE_SYSCTL_PORTRANGE) + result = getudpportrange_sysctl(af, low, high); +#else + UNUSED(af); +#endif + + if (result != ISC_R_SUCCESS) { + *low = ISC_NET_PORTRANGELOW; + *high = ISC_NET_PORTRANGEHIGH; + } + + return (ISC_R_SUCCESS); /* we currently never fail in this function */ +} + void isc_net_disableipv4(void) { initialize(); diff --git a/lib/isc/unix/resource.c b/lib/isc/unix/resource.c index a6280b7477e8a..dac9fcaa6c08c 100644 --- a/lib/isc/unix/resource.c +++ b/lib/isc/unix/resource.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: resource.c,v 1.11.206.3.4.3 2008/07/28 22:45:53 marka Exp $ */ +/* $Id: resource.c,v 1.11.206.7 2008/08/05 07:23:56 marka Exp $ */ #include <config.h> @@ -32,7 +32,7 @@ #include <linux/fs.h> /* To get the large NR_OPEN. */ #endif -#ifdef __hpux +#if defined(__hpux) && defined(HAVE_SYS_DYNTUNE_H) #include <sys/dyntune.h> #endif @@ -170,7 +170,7 @@ isc_resource_setlimit(isc_resource_t resource, isc_resourcevalue_t value) { if (unixresult == 0) return (ISC_R_SUCCESS); } -#elif defined(__hpux) +#elif defined(__hpux) && defined(HAVE_SYS_DYNTUNE_H) if (resource == isc_resource_openfiles && rlim_value == RLIM_INFINITY) { uint64_t maxfiles; if (gettune("maxfiles_lim", &maxfiles) == 0) { @@ -210,7 +210,7 @@ isc_resource_getlimit(isc_resource_t resource, isc_resourcevalue_t *value) { } isc_result_t -isc_resource_curlimit(isc_resource_t resource, isc_resourcevalue_t *value) { +isc_resource_getcurlimit(isc_resource_t resource, isc_resourcevalue_t *value) { int unixresult; int unixresource; struct rlimit rl; diff --git a/lib/isc/unix/socket.c b/lib/isc/unix/socket.c index 3ee796c24bbaa..1508487945beb 100644 --- a/lib/isc/unix/socket.c +++ b/lib/isc/unix/socket.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket.c,v 1.207.2.19.2.35.4.6 2008/07/29 04:43:57 each Exp $ */ +/* $Id: socket.c,v 1.207.2.19.2.62 2008/11/20 00:07:38 jinmei Exp $ */ #include <config.h> @@ -52,20 +52,66 @@ #include <isc/thread.h> #include <isc/util.h> +#ifdef ISC_PLATFORM_HAVESYSUNH +#include <sys/un.h> +#endif +#ifdef ISC_PLATFORM_HAVEKQUEUE +#include <sys/event.h> +#endif +#ifdef ISC_PLATFORM_HAVEEPOLL +#include <sys/epoll.h> +#endif +#ifdef ISC_PLATFORM_HAVEDEVPOLL +#include <sys/devpoll.h> +#endif + #include "errno2result.h" #ifndef ISC_PLATFORM_USETHREADS #include "socket_p.h" #endif /* ISC_PLATFORM_USETHREADS */ -#if defined(SO_BSDCOMPAT) && defined(__linux__) -#include <sys/utsname.h> -#endif +/* + * Choose the most preferable multiplex method. + */ +#ifdef ISC_PLATFORM_HAVEKQUEUE +#define USE_KQUEUE +#elif defined (ISC_PLATFORM_HAVEEPOLL) +#define USE_EPOLL +#elif defined (ISC_PLATFORM_HAVEDEVPOLL) +#define USE_DEVPOLL +typedef struct { + unsigned int want_read : 1, + want_write : 1; +} pollinfo_t; +#else +#define USE_SELECT +#endif /* ISC_PLATFORM_HAVEKQUEUE */ -/*% - * Max number of open sockets. In the vast majority of cases the default size - * of FD_SETSIZE should be fine, and this constant should be increased only - * when absolutely necessary and possible, i.e., the server is exhausting all +#ifndef ISC_PLATFORM_USETHREADS +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +struct isc_socketwait { + int nevents; +}; +#elif defined (USE_SELECT) +struct isc_socketwait { + fd_set *readset; + fd_set *writeset; + int nfds; + int maxfd; +}; +#endif /* USE_KQUEUE */ +#endif /* !ISC_PLATFORM_USETHREADS */ + +/* + * Maximum number of allowable open sockets. This is also the maximum + * allowable socket file descriptor. + * + * Care should be taken before modifying this value for select(): + * The API standard doesn't ensure select() accept more than (the system default + * of) FD_SETSIZE descriptors, and the default size should in fact be fine in + * the vast majority of cases. This constant should therefore be increased only + * when absolutely necessary and possible, i.e., the server is exhausting all * available file descriptors (up to FD_SETSIZE) and the select() function * and FD_xxx macros support larger values than FD_SETSIZE (which may not * always by true, but we keep using some of them to ensure as much @@ -76,21 +122,79 @@ * As a special note, this value shouldn't have to be touched if * this is a build for an authoritative only DNS server. */ - -#ifndef ISC_SOCKET_FDSETSIZE -#define ISC_SOCKET_FDSETSIZE FD_SETSIZE -#endif - +#ifndef ISC_SOCKET_MAXSOCKETS +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +#define ISC_SOCKET_MAXSOCKETS 4096 +#elif defined(USE_SELECT) +#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE +#endif /* USE_KQUEUE... */ +#endif /* ISC_SOCKET_MAXSOCKETS */ + +#ifdef USE_SELECT /*% - * Mac OS X needs a special definition to support larger values in select() + * Mac OS X needs a special definition to support larger values in select(). + * We always define this because a larger value can be specified run-time. */ -#if ISC_SOCKET_FDSETSIZE > FD_SETSIZE #ifdef __APPLE__ #define _DARWIN_UNLIMITED_SELECT #endif /* __APPLE__ */ +#endif /* USE_SELECT */ + +#ifdef ISC_SOCKET_USE_POLLWATCH +/* + * If this macro is defined, enable workaround for a Solaris /dev/poll kernel + * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for + * some of the specified FD. The idea is based on the observation that it's + * likely for a busy server to keep receiving packets. It specifically works + * as follows: the socket watcher is first initialized with the state of + * "poll_idle". While it's in the idle state it keeps sleeping until a socket + * event occurs. When it wakes up for a socket I/O event, it moves to the + * poll_active state, and sets the poll timeout to a short period + * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the + * watcher goes to the poll_checking state with the same timeout period. + * In this state, the watcher tries to detect whether this is a break + * during intermittent events or the kernel bug is triggered. If the next + * polling reports an event within the short period, the previous timeout is + * likely to be a kernel bug, and so the watcher goes back to the active state. + * Otherwise, it moves to the idle state again. + * + * It's not clear whether this is a thread-related bug, but since we've only + * seen this with threads, this workaround is used only when enabling threads. + */ + +typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; + +#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT +#define ISC_SOCKET_POLLWATCH_TIMEOUT 10 +#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */ +#endif /* ISC_SOCKET_USE_POLLWATCH */ + +/* + * Size of per-FD lock buckets. + */ +#ifdef ISC_PLATFORM_USETHREADS +#define FDLOCK_COUNT 1024 +#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT) +#else +#define FDLOCK_COUNT 1 +#define FDLOCK_ID(fd) 0 +#endif /* ISC_PLATFORM_USETHREADS */ + +/* + * Maximum number of events communicated with the kernel. There should normally + * be no need for having a large number. + */ +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +#ifndef ISC_SOCKET_MAXEVENTS +#define ISC_SOCKET_MAXEVENTS 64 +#endif #endif -/*% +#if defined(SO_BSDCOMPAT) && defined(__linux__) +#include <sys/utsname.h> +#endif + +/* * Some systems define the socket length argument as an int, some as size_t, * some as socklen_t. This is here so it can be easily changed if needed. */ @@ -218,22 +322,50 @@ struct isc_socketmgr { unsigned int magic; isc_mem_t *mctx; isc_mutex_t lock; + isc_mutex_t *fdlock; +#ifdef USE_KQUEUE + int kqueue_fd; + int nevents; + struct kevent *events; +#endif /* USE_KQUEUE */ +#ifdef USE_EPOLL + int epoll_fd; + int nevents; + struct epoll_event *events; +#endif /* USE_EPOLL */ +#ifdef USE_DEVPOLL + int devpoll_fd; + int nevents; + struct pollfd *events; +#endif /* USE_DEVPOLL */ +#ifdef USE_SELECT int fd_bufsize; - int fdsize; +#endif /* USE_SELECT */ + unsigned int maxsocks; +#ifdef ISC_PLATFORM_USETHREADS + int pipe_fds[2]; +#endif + + /* Locked by fdlock. */ + isc_socket_t **fds; + int *fdstate; +#ifdef USE_DEVPOLL + pollinfo_t *fdpollinfo; +#endif + /* Locked by manager lock. */ ISC_LIST(isc_socket_t) socklist; +#ifdef USE_SELECT fd_set *read_fds; fd_set *read_fds_copy; fd_set *write_fds; fd_set *write_fds_copy; - isc_socket_t **fds; - int *fdstate; int maxfd; - int reserved; /* unlocked */ +#endif /* USE_SELECT */ + int reserved; /* unlocked */ #ifdef ISC_PLATFORM_USETHREADS isc_thread_t watcher; isc_condition_t shutdown_ok; - int pipe_fds[2]; #else /* ISC_PLATFORM_USETHREADS */ unsigned int refs; #endif /* ISC_PLATFORM_USETHREADS */ @@ -272,8 +404,9 @@ static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *, struct msghdr *, struct iovec *, size_t *); static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *, struct msghdr *, struct iovec *, size_t *); -static void cleanup_fdsets(isc_socketmgr_t *, isc_mem_t *); -static isc_result_t create_fdsets(isc_socketmgr_t *, isc_mem_t *); +#ifdef ISC_PLATFORM_USETHREADS +static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager); +#endif #define SELECT_POKE_SHUTDOWN (-1) #define SELECT_POKE_NOTHING (-2) @@ -342,9 +475,195 @@ socket_log(isc_socket_t *sock, isc_sockaddr_t *address, } } +#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \ + defined(USE_CMSG) && defined(IPV6_RECVPKTINFO) +/* + * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by + * setting IPV6_V6ONLY. + */ +static void +FIX_IPV6_RECVPKTINFO(isc_socket_t *sock) +{ + char strbuf[ISC_STRERRORSIZE]; + int on = 1; + + if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp) + return; + + if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, + (void *)&on, sizeof(on)) < 0) { + + UNEXPECTED_ERROR(__FILE__, __LINE__, + "setsockopt(%d, IPV6_RECVPKTINFO) " + "%s: %s", sock->fd, + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed"), + strbuf); + } +} +#else +#define FIX_IPV6_RECVPKTINFO(sock) (void)0 +#endif + +static inline isc_result_t +watch_fd(isc_socketmgr_t *manager, int fd, int msg) { + isc_result_t result = ISC_R_SUCCESS; + +#ifdef USE_KQUEUE + struct kevent evchange; + + memset(&evchange, 0, sizeof(evchange)); + if (msg == SELECT_POKE_READ) + evchange.filter = EVFILT_READ; + else + evchange.filter = EVFILT_WRITE; + evchange.flags = EV_ADD; + evchange.ident = fd; + if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) + result = isc__errno2result(errno); + + return (result); +#elif defined(USE_EPOLL) + struct epoll_event event; + + if (msg == SELECT_POKE_READ) + event.events = EPOLLIN; + else + event.events = EPOLLOUT; + event.data.fd = fd; + if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 && + errno != EEXIST) { + result = isc__errno2result(errno); + } + + return (result); +#elif defined(USE_DEVPOLL) + struct pollfd pfd; + int lockid = FDLOCK_ID(fd); + + memset(&pfd, 0, sizeof(pfd)); + if (msg == SELECT_POKE_READ) + pfd.events = POLLIN; + else + pfd.events = POLLOUT; + pfd.fd = fd; + pfd.revents = 0; + LOCK(&manager->fdlock[lockid]); + if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1) + result = isc__errno2result(errno); + else { + if (msg == SELECT_POKE_READ) + manager->fdpollinfo[fd].want_read = 1; + else + manager->fdpollinfo[fd].want_write = 1; + } + UNLOCK(&manager->fdlock[lockid]); + + return (result); +#elif defined(USE_SELECT) + LOCK(&manager->lock); + if (msg == SELECT_POKE_READ) + FD_SET(fd, manager->read_fds); + if (msg == SELECT_POKE_WRITE) + FD_SET(fd, manager->write_fds); + UNLOCK(&manager->lock); + + return (result); +#endif +} + +static inline isc_result_t +unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) { + isc_result_t result = ISC_R_SUCCESS; + +#ifdef USE_KQUEUE + struct kevent evchange; + + memset(&evchange, 0, sizeof(evchange)); + if (msg == SELECT_POKE_READ) + evchange.filter = EVFILT_READ; + else + evchange.filter = EVFILT_WRITE; + evchange.flags = EV_DELETE; + evchange.ident = fd; + if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) + result = isc__errno2result(errno); + + return (result); +#elif defined(USE_EPOLL) + struct epoll_event event; + + if (msg == SELECT_POKE_READ) + event.events = EPOLLIN; + else + event.events = EPOLLOUT; + event.data.fd = fd; + if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 && + errno != ENOENT) { + char strbuf[ISC_STRERRORSIZE]; + isc__strerror(errno, strbuf, sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "epoll_ctl(DEL), %d: %s", fd, strbuf); + result = ISC_R_UNEXPECTED; + } + return (result); +#elif defined(USE_DEVPOLL) + struct pollfd pfds[2]; + size_t writelen = sizeof(pfds[0]); + int lockid = FDLOCK_ID(fd); + + memset(pfds, 0, sizeof(pfds)); + pfds[0].events = POLLREMOVE; + pfds[0].fd = fd; + + /* + * Canceling read or write polling via /dev/poll is tricky. Since it + * only provides a way of canceling per FD, we may need to re-poll the + * socket for the other operation. + */ + LOCK(&manager->fdlock[lockid]); + if (msg == SELECT_POKE_READ && + manager->fdpollinfo[fd].want_write == 1) { + pfds[1].events = POLLOUT; + pfds[1].fd = fd; + writelen += sizeof(pfds[1]); + } + if (msg == SELECT_POKE_WRITE && + manager->fdpollinfo[fd].want_read == 1) { + pfds[1].events = POLLIN; + pfds[1].fd = fd; + writelen += sizeof(pfds[1]); + } + + if (write(manager->devpoll_fd, pfds, writelen) == -1) + result = isc__errno2result(errno); + else { + if (msg == SELECT_POKE_READ) + manager->fdpollinfo[fd].want_read = 0; + else + manager->fdpollinfo[fd].want_write = 0; + } + UNLOCK(&manager->fdlock[lockid]); + + return (result); +#elif defined(USE_SELECT) + LOCK(&manager->lock); + if (msg == SELECT_POKE_READ) + FD_CLR(fd, manager->read_fds); + else if (msg == SELECT_POKE_WRITE) + FD_CLR(fd, manager->write_fds); + UNLOCK(&manager->lock); + + return (result); +#endif +} + static void wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) { - isc_socket_t *sock; + isc_result_t result; + int lockid = FDLOCK_ID(fd); /* * This is a wakeup on a socket. If the socket is not in the @@ -352,27 +671,54 @@ wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) { * or writes. */ - INSIST(fd >= 0 && fd < manager->fdsize); + INSIST(fd >= 0 && fd < (int)manager->maxsocks); - if (manager->fdstate[fd] == CLOSE_PENDING) { + if (msg == SELECT_POKE_CLOSE) { + /* No one should be updating fdstate, so no need to lock it */ + INSIST(manager->fdstate[fd] == CLOSE_PENDING); manager->fdstate[fd] = CLOSED; - FD_CLR(fd, manager->read_fds); - FD_CLR(fd, manager->write_fds); + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); (void)close(fd); return; } - if (manager->fdstate[fd] != MANAGED) - return; - sock = manager->fds[fd]; + LOCK(&manager->fdlock[lockid]); + if (manager->fdstate[fd] == CLOSE_PENDING) { + UNLOCK(&manager->fdlock[lockid]); + /* + * We accept (and ignore) any error from unwatch_fd() as we are + * closing the socket, hoping it doesn't leave dangling state in + * the kernel. + * Note that unwatch_fd() must be called after releasing the + * fdlock; otherwise it could cause deadlock due to a lock order + * reversal. + */ + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + return; + } + if (manager->fdstate[fd] != MANAGED) { + UNLOCK(&manager->fdlock[lockid]); + return; + } + UNLOCK(&manager->fdlock[lockid]); /* * Set requested bit. */ - if (msg == SELECT_POKE_READ) - FD_SET(sock->fd, manager->read_fds); - if (msg == SELECT_POKE_WRITE) - FD_SET(sock->fd, manager->write_fds); + result = watch_fd(manager, fd, msg); + if (result != ISC_R_SUCCESS) { + /* + * XXXJT: what should we do? Ignoring the failure of watching + * a socket will make the application dysfunctional, but there + * seems to be no reasonable recovery process. + */ + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, + "failed to start watching FD (%d): %s", + fd, isc_result_totext(result)); + } } #ifdef ISC_PLATFORM_USETHREADS @@ -667,7 +1013,7 @@ build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev, memset(msg, 0, sizeof(*msg)); - if (sock->type == isc_sockettype_udp) { + if (!sock->connected) { msg->msg_name = (void *)&dev->address.type.sa; msg->msg_namelen = dev->address.length; } else { @@ -941,15 +1287,17 @@ dump_msg(struct msghdr *msg) { unsigned int i; printf("MSGHDR %p\n", msg); - printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen); - printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen); + printf("\tname %p, namelen %ld\n", msg->msg_name, + (long) msg->msg_namelen); + printf("\tiov %p, iovlen %ld\n", msg->msg_iov, + (long) msg->msg_iovlen); for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) - printf("\t\t%d\tbase %p, len %d\n", i, + printf("\t\t%d\tbase %p, len %ld\n", i, msg->msg_iov[i].iov_base, - msg->msg_iov[i].iov_len); + (long) msg->msg_iov[i].iov_len); #ifdef ISC_NET_BSD44MSGHDR - printf("\tcontrol %p, controllen %d\n", msg->msg_control, - msg->msg_controllen); + printf("\tcontrol %p, controllen %ld\n", msg->msg_control, + (long) msg->msg_controllen); #endif } #endif @@ -1017,6 +1365,14 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { /* HPUX 11.11 can return EADDRNOTAVAIL. */ SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES); + /* + * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6 + * errors. + */ +#ifdef EPROTO + SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH); +#endif + SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH); #undef SOFT_OR_HARD #undef ALWAYS_HARD @@ -1222,7 +1578,54 @@ doio_send(isc_socket_t *sock, isc_socketevent_t *dev) { * references exist. */ static void +closesocket(isc_socketmgr_t *manager, isc_sockettype_t type, int fd) { + int lockid = FDLOCK_ID(fd); + + UNUSED(type); + + /* + * No one has this socket open, so the watcher doesn't have to be + * poked, and the socket doesn't have to be locked. + */ + LOCK(&manager->fdlock[lockid]); + manager->fds[fd] = NULL; + manager->fdstate[fd] = CLOSE_PENDING; + UNLOCK(&manager->fdlock[lockid]); + select_poke(manager, fd, SELECT_POKE_CLOSE); + + /* + * update manager->maxfd here (XXX: this should be implemented more + * efficiently) + */ +#ifdef USE_SELECT + LOCK(&manager->lock); + if (manager->maxfd == fd) { + int i; + + manager->maxfd = 0; + for (i = fd - 1; i >= 0; i--) { + lockid = FDLOCK_ID(i); + + LOCK(&manager->fdlock[lockid]); + if (manager->fdstate[i] == MANAGED) { + manager->maxfd = i; + UNLOCK(&manager->fdlock[lockid]); + break; + } + UNLOCK(&manager->fdlock[lockid]); + } +#ifdef ISC_PLATFORM_USETHREADS + if (manager->maxfd < manager->pipe_fds[0]) + manager->maxfd = manager->pipe_fds[0]; +#endif + } + UNLOCK(&manager->lock); +#endif /* USE_SELECT */ +} + +static void destroy(isc_socket_t **sockp) { + int fd; isc_socket_t *sock = *sockp; isc_socketmgr_t *manager = sock->manager; @@ -1233,17 +1636,16 @@ destroy(isc_socket_t **sockp) { INSIST(ISC_LIST_EMPTY(sock->recv_list)); INSIST(ISC_LIST_EMPTY(sock->send_list)); INSIST(sock->connect_ev == NULL); - REQUIRE(sock->fd >= 0 && sock->fd < (int)manager->fdsize); + REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks); + + if (sock->fd >= 0) { + fd = sock->fd; + sock->fd = -1; + closesocket(manager, sock->type, fd); + } LOCK(&manager->lock); - /* - * No one has this socket open, so the watcher doesn't have to be - * poked, and the socket doesn't have to be locked. - */ - manager->fds[sock->fd] = NULL; - manager->fdstate[sock->fd] = CLOSE_PENDING; - select_poke(manager, sock->fd, SELECT_POKE_CLOSE); ISC_LIST_UNLINK(manager->socklist, sock, link); #ifdef ISC_PLATFORM_USETHREADS @@ -1251,10 +1653,6 @@ destroy(isc_socket_t **sockp) { SIGNAL(&manager->shutdown_ok); #endif /* ISC_PLATFORM_USETHREADS */ - /* - * XXX should reset manager->maxfd here - */ - UNLOCK(&manager->lock); free_socket(sockp); @@ -1446,40 +1844,22 @@ clear_bsdcompat(void) { } #endif -/*% - * Create a new 'type' socket managed by 'manager'. Events - * will be posted to 'task' and when dispatched 'action' will be - * called with 'arg' as the arg value. The new socket is returned - * in 'socketp'. - */ -isc_result_t -isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, - isc_socket_t **socketp) -{ - isc_socket_t *sock = NULL; - isc_result_t ret; -#if defined(USE_CMSG) || defined(SO_BSDCOMPAT) - int on = 1; -#endif +static isc_result_t +opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) { char strbuf[ISC_STRERRORSIZE]; const char *err = "socket"; int tries = 0; +#if defined(USE_CMSG) || defined(SO_BSDCOMPAT) + int on = 1; +#endif - REQUIRE(VALID_MANAGER(manager)); - REQUIRE(socketp != NULL && *socketp == NULL); - - ret = allocate_socket(manager, type, &sock); - if (ret != ISC_R_SUCCESS) - return (ret); - - sock->pf = pf; again: - switch (type) { + switch (sock->type) { case isc_sockettype_udp: - sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP); + sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP); break; case isc_sockettype_tcp: - sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP); + sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP); break; } if (sock->fd == -1 && errno == EINTR && tries++ < 42) @@ -1489,7 +1869,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, /* * Leave a space for stdio and TCP to work in. */ - if (manager->reserved != 0 && type == isc_sockettype_udp && + if (manager->reserved != 0 && sock->type == isc_sockettype_udp && sock->fd >= 0 && sock->fd < manager->reserved) { int new, tmp; new = fcntl(sock->fd, F_DUPFD, manager->reserved); @@ -1509,23 +1889,28 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #endif - if (sock->fd >= (int)manager->fdsize) { + if (sock->fd >= (int)manager->maxsocks) { (void)close(sock->fd); isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_TOOMANYFDS, - "%s: too many open file descriptors", "socket"); - free_socket(&sock); + "socket: file descriptor exceeds limit (%d/%u)", + sock->fd, manager->maxsocks); return (ISC_R_NORESOURCES); } if (sock->fd < 0) { - free_socket(&sock); - switch (errno) { case EMFILE: case ENFILE: + isc__strerror(errno, strbuf, sizeof(strbuf)); + isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, + isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_TOOMANYFDS, + "%s: %s", err, strbuf); + /* fallthrough */ case ENOBUFS: return (ISC_R_NORESOURCES); @@ -1554,7 +1939,6 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, if (make_nonblock(sock->fd) != ISC_R_SUCCESS) { (void)close(sock->fd); - free_socket(&sock); return (ISC_R_UNEXPECTED); } @@ -1562,7 +1946,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, RUNTIME_CHECK(isc_once_do(&bsdcompat_once, clear_bsdcompat) == ISC_R_SUCCESS); if (bsdcompat && setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT, - (void *)&on, sizeof(on)) < 0) { + (void *)&on, sizeof(on)) < 0) { isc__strerror(errno, strbuf, sizeof(strbuf)); UNEXPECTED_ERROR(__FILE__, __LINE__, "setsockopt(%d, SO_BSDCOMPAT) %s: %s", @@ -1588,9 +1972,10 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #endif -#if defined(USE_CMSG) - if (type == isc_sockettype_udp) { +#if defined(USE_CMSG) || defined(SO_RCVBUF) + if (sock->type == isc_sockettype_udp) { +#if defined(USE_CMSG) #if defined(SO_TIMESTAMP) if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on, sizeof(on)) < 0 @@ -1609,7 +1994,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #endif /* SO_TIMESTAMP */ #if defined(ISC_PLATFORM_HAVEIPV6) - if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) { + if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) { /* * Warn explicitly because this anomaly can be hidden * in usual operation (and unexpectedly appear later). @@ -1621,7 +2006,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #ifdef ISC_PLATFORM_HAVEIN6PKTINFO #ifdef IPV6_RECVPKTINFO /* RFC 3542 */ - if ((pf == AF_INET6) + if ((sock->pf == AF_INET6) && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, (void *)&on, sizeof(on)) < 0)) { isc__strerror(errno, strbuf, sizeof(strbuf)); @@ -1636,7 +2021,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #else /* RFC 2292 */ - if ((pf == AF_INET6) + if ((sock->pf == AF_INET6) && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO, (void *)&on, sizeof(on)) < 0)) { isc__strerror(errno, strbuf, sizeof(strbuf)); @@ -1653,33 +2038,93 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */ #ifdef IPV6_USE_MIN_MTU /* RFC 3542, not too common yet*/ /* use minimum MTU */ - if (pf == AF_INET6) { + if (sock->pf == AF_INET6) { (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU, (void *)&on, sizeof(on)); } #endif #endif /* ISC_PLATFORM_HAVEIPV6 */ +#endif /* defined(USE_CMSG) */ +#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT) + /* + * Turn off Path MTU discovery on IPv4/UDP sockets. + */ + if (sock->pf == AF_INET) { + int action = IP_PMTUDISC_DONT; + (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER, + &action, sizeof(action)); + } +#endif +#if defined(IP_DONTFRAG) + /* + * Turn off Path MTU discovery on IPv4/UDP sockets. + */ + if (sock->pf == AF_INET) { + int off = 0; + (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, + &off, sizeof(off)); + } +#endif + } +#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */ + + return (ISC_R_SUCCESS); +} + +/* + * Create a new 'type' socket managed by 'manager'. Events + * will be posted to 'task' and when dispatched 'action' will be + * called with 'arg' as the arg value. The new socket is returned + * in 'socketp'. + */ +isc_result_t +isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, + isc_socket_t **socketp) +{ + isc_socket_t *sock = NULL; + isc_result_t result; + int lockid; + + REQUIRE(VALID_MANAGER(manager)); + REQUIRE(socketp != NULL && *socketp == NULL); + + result = allocate_socket(manager, type, &sock); + if (result != ISC_R_SUCCESS) + return (result); + + sock->pf = pf; + result = opensocket(manager, sock); + if (result != ISC_R_SUCCESS) { + free_socket(&sock); + return (result); } -#endif /* USE_CMSG */ sock->references = 1; *socketp = sock; - LOCK(&manager->lock); - /* * Note we don't have to lock the socket like we normally would because * there are no external references to it yet. */ + lockid = FDLOCK_ID(sock->fd); + LOCK(&manager->fdlock[lockid]); manager->fds[sock->fd] = sock; manager->fdstate[sock->fd] = MANAGED; +#ifdef USE_DEVPOLL + INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && + sock->manager->fdpollinfo[sock->fd].want_write == 0); +#endif + UNLOCK(&manager->fdlock[lockid]); + + LOCK(&manager->lock); ISC_LIST_APPEND(manager->socklist, sock, link); +#ifdef USE_SELECT if (manager->maxfd < sock->fd) manager->maxfd = sock->fd; - +#endif UNLOCK(&manager->lock); socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, @@ -1688,6 +2133,48 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, return (ISC_R_SUCCESS); } +isc_result_t +isc_socket_open(isc_socket_t *sock) { + isc_result_t result; + + REQUIRE(VALID_SOCKET(sock)); + + LOCK(&sock->lock); + REQUIRE(sock->references == 1); + UNLOCK(&sock->lock); + /* + * We don't need to retain the lock hereafter, since no one else has + * this socket. + */ + REQUIRE(sock->fd == -1); + + result = opensocket(sock->manager, sock); + if (result != ISC_R_SUCCESS) + sock->fd = -1; + + if (result == ISC_R_SUCCESS) { + int lockid = FDLOCK_ID(sock->fd); + + LOCK(&sock->manager->fdlock[lockid]); + sock->manager->fds[sock->fd] = sock; + sock->manager->fdstate[sock->fd] = MANAGED; +#ifdef USE_DEVPOLL + INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && + sock->manager->fdpollinfo[sock->fd].want_write == 0); +#endif + UNLOCK(&sock->manager->fdlock[lockid]); + +#ifdef USE_SELECT + LOCK(&sock->manager->lock); + if (sock->manager->maxfd < sock->fd) + sock->manager->maxfd = sock->fd; + UNLOCK(&sock->manager->lock); +#endif + } + + return (result); +} + /* * Attach to a socket. Caller must explicitly detach when it is done. */ @@ -1729,6 +2216,44 @@ isc_socket_detach(isc_socket_t **socketp) { *socketp = NULL; } +isc_result_t +isc_socket_close(isc_socket_t *sock) { + int fd; + + REQUIRE(VALID_SOCKET(sock)); + + LOCK(&sock->lock); + REQUIRE(sock->references == 1); + UNLOCK(&sock->lock); + /* + * We don't need to retain the lock hereafter, since no one else has + * this socket. + */ + + REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); + + INSIST(!sock->connecting); + INSIST(!sock->pending_recv); + INSIST(!sock->pending_send); + INSIST(!sock->pending_accept); + INSIST(ISC_LIST_EMPTY(sock->recv_list)); + INSIST(ISC_LIST_EMPTY(sock->send_list)); + INSIST(ISC_LIST_EMPTY(sock->accept_list)); + INSIST(sock->connect_ev == NULL); + + fd = sock->fd; + sock->fd = -1; + sock->listener = 0; + sock->connected = 0; + sock->connecting = 0; + sock->bound = 0; + isc_sockaddr_any(&sock->address); + + closesocket(sock->manager, sock->type, fd); + + return (ISC_R_SUCCESS); +} + /* * I/O is possible on a given socket. Schedule an event to this task that * will call an internal function to do the I/O. This will charge the @@ -2041,13 +2566,14 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { sock->pf); (void)close(fd); goto soft_error; - } else if (fd >= (int)manager->fdsize) { + } else if (fd >= (int)manager->maxsocks) { isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_TOOMANYFDS, - "%s: too many open file descriptors", - "accept"); + "accept: " + "file descriptor exceeds limit (%d/%u)", + fd, manager->maxsocks); (void)close(fd); goto soft_error; } @@ -2081,6 +2607,13 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { * -1 means the new socket didn't happen. */ if (fd != -1) { + int lockid = FDLOCK_ID(fd); + + LOCK(&manager->fdlock[lockid]); + manager->fds[fd] = dev->newsocket; + manager->fdstate[fd] = MANAGED; + UNLOCK(&manager->fdlock[lockid]); + LOCK(&manager->lock); ISC_LIST_APPEND(manager->socklist, dev->newsocket, link); @@ -2093,10 +2626,10 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { */ dev->address = dev->newsocket->address; - manager->fds[fd] = dev->newsocket; - manager->fdstate[fd] = MANAGED; +#ifdef USE_SELECT if (manager->maxfd < fd) manager->maxfd = fd; +#endif socket_log(sock, &dev->newsocket->address, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, @@ -2245,77 +2778,256 @@ internal_send(isc_task_t *me, isc_event_t *ev) { UNLOCK(&sock->lock); } +/* + * Process read/writes on each fd here. Avoid locking + * and unlocking twice if both reads and writes are possible. + */ static void -process_fds(isc_socketmgr_t *manager, int maxfd, - fd_set *readfds, fd_set *writefds) +process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable, + isc_boolean_t writeable) { - int i; isc_socket_t *sock; isc_boolean_t unlock_sock; - - REQUIRE(maxfd <= (int)manager->fdsize); + int lockid = FDLOCK_ID(fd); /* - * Process read/writes on other fds here. Avoid locking - * and unlocking twice if both reads and writes are possible. + * If the socket is going to be closed, don't do more I/O. */ - for (i = 0; i < maxfd; i++) { + LOCK(&manager->fdlock[lockid]); + if (manager->fdstate[fd] == CLOSE_PENDING) { + UNLOCK(&manager->fdlock[lockid]); + + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + return; + } + + sock = manager->fds[fd]; + UNLOCK(&manager->fdlock[lockid]); + unlock_sock = ISC_FALSE; + if (readable) { + if (sock == NULL) { + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + goto check_write; + } + unlock_sock = ISC_TRUE; + LOCK(&sock->lock); + if (!SOCK_DEAD(sock)) { + if (sock->listener) + dispatch_accept(sock); + else + dispatch_recv(sock); + } + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + } +check_write: + if (writeable) { + if (sock == NULL) { + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + return; + } + if (!unlock_sock) { + unlock_sock = ISC_TRUE; + LOCK(&sock->lock); + } + if (!SOCK_DEAD(sock)) { + if (sock->connecting) + dispatch_connect(sock); + else + dispatch_send(sock); + } + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + } + if (unlock_sock) + UNLOCK(&sock->lock); +} + +#ifdef USE_KQUEUE +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) { + int i; + isc_boolean_t readable, writable; + isc_boolean_t done = ISC_FALSE; #ifdef ISC_PLATFORM_USETHREADS - if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1]) + isc_boolean_t have_ctlevent = ISC_FALSE; +#endif + + if (nevents == manager->nevents) { + /* + * This is not an error, but something unexpected. If this + * happens, it may indicate the need for increasing + * ISC_SOCKET_MAXEVENTS. + */ + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].ident < manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) { + have_ctlevent = ISC_TRUE; continue; -#endif /* ISC_PLATFORM_USETHREADS */ + } +#endif + readable = ISC_TF(events[i].filter == EVFILT_READ); + writable = ISC_TF(events[i].filter == EVFILT_WRITE); + process_fd(manager, events[i].ident, readable, writable); + } - if (manager->fdstate[i] == CLOSE_PENDING) { - manager->fdstate[i] = CLOSED; - FD_CLR(i, manager->read_fds); - FD_CLR(i, manager->write_fds); +#ifdef ISC_PLATFORM_USETHREADS + if (have_ctlevent) + done = process_ctlfd(manager); +#endif - (void)close(i); + return (done); +} +#elif defined(USE_EPOLL) +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) { + int i; + isc_boolean_t done = ISC_FALSE; +#ifdef ISC_PLATFORM_USETHREADS + isc_boolean_t have_ctlevent = ISC_FALSE; +#endif + if (nevents == manager->nevents) { + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].data.fd < (int)manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].data.fd == manager->pipe_fds[0]) { + have_ctlevent = ISC_TRUE; continue; } - - sock = manager->fds[i]; - unlock_sock = ISC_FALSE; - if (FD_ISSET(i, readfds)) { - if (sock == NULL) { - FD_CLR(i, manager->read_fds); - goto check_write; - } - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - if (!SOCK_DEAD(sock)) { - if (sock->listener) - dispatch_accept(sock); - else - dispatch_recv(sock); - } - FD_CLR(i, manager->read_fds); +#endif + if ((events[i].events & EPOLLERR) != 0 || + (events[i].events & EPOLLHUP) != 0) { + /* + * epoll does not set IN/OUT bits on an erroneous + * condition, so we need to try both anyway. This is a + * bit inefficient, but should be okay for such rare + * events. Note also that the read or write attempt + * won't block because we use non-blocking sockets. + */ + events[i].events |= (EPOLLIN | EPOLLOUT); } - check_write: - if (FD_ISSET(i, writefds)) { - if (sock == NULL) { - FD_CLR(i, manager->write_fds); - continue; - } - if (!unlock_sock) { - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - } - if (!SOCK_DEAD(sock)) { - if (sock->connecting) - dispatch_connect(sock); - else - dispatch_send(sock); - } - FD_CLR(i, manager->write_fds); + process_fd(manager, events[i].data.fd, + (events[i].events & EPOLLIN) != 0, + (events[i].events & EPOLLOUT) != 0); + } + +#ifdef ISC_PLATFORM_USETHREADS + if (have_ctlevent) + done = process_ctlfd(manager); +#endif + + return (done); +} +#elif defined(USE_DEVPOLL) +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) { + int i; + isc_boolean_t done = ISC_FALSE; +#ifdef ISC_PLATFORM_USETHREADS + isc_boolean_t have_ctlevent = ISC_FALSE; +#endif + + if (nevents == manager->nevents) { + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].fd < (int)manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].fd == manager->pipe_fds[0]) { + have_ctlevent = ISC_TRUE; + continue; } - if (unlock_sock) - UNLOCK(&sock->lock); +#endif + process_fd(manager, events[i].fd, + (events[i].events & POLLIN) != 0, + (events[i].events & POLLOUT) != 0); + } + +#ifdef ISC_PLATFORM_USETHREADS + if (have_ctlevent) + done = process_ctlfd(manager); +#endif + + return (done); +} +#elif defined(USE_SELECT) +static void +process_fds(isc_socketmgr_t *manager, int maxfd, + fd_set *readfds, fd_set *writefds) +{ + int i; + + REQUIRE(maxfd <= (int)manager->maxsocks); + + for (i = 0; i < maxfd; i++) { +#ifdef ISC_PLATFORM_USETHREADS + if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1]) + continue; +#endif /* ISC_PLATFORM_USETHREADS */ + process_fd(manager, i, FD_ISSET(i, readfds), + FD_ISSET(i, writefds)); } } +#endif #ifdef ISC_PLATFORM_USETHREADS +static isc_boolean_t +process_ctlfd(isc_socketmgr_t *manager) { + int msg, fd; + + for (;;) { + select_readmsg(manager, &fd, &msg); + + manager_log(manager, IOEVENT, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_WATCHERMSG, + "watcher got message %d " + "for socket %d"), msg, fd); + + /* + * Nothing to read? + */ + if (msg == SELECT_POKE_NOTHING) + break; + + /* + * Handle shutdown message. We really should + * jump out of this loop right away, but + * it doesn't matter if we have to do a little + * more work first. + */ + if (msg == SELECT_POKE_SHUTDOWN) + return (ISC_TRUE); + + /* + * This is a wakeup on a socket. Look + * at the event queue for both read and write, + * and decide if we need to watch on it now + * or not. + */ + wakeup_socket(manager, fd, msg); + } + + return (ISC_FALSE); +} + /* * This is the thread that will loop forever, always in a select or poll * call. @@ -2329,98 +3041,115 @@ watcher(void *uap) { isc_boolean_t done; int ctlfd; int cc; - int msg, fd; +#ifdef USE_KQUEUE + const char *fnname = "kevent()"; +#elif defined (USE_EPOLL) + const char *fnname = "epoll_wait()"; +#elif defined(USE_DEVPOLL) + const char *fnname = "ioctl(DP_POLL)"; + struct dvpoll dvp; +#elif defined (USE_SELECT) + const char *fnname = "select()"; int maxfd; +#endif char strbuf[ISC_STRERRORSIZE]; +#ifdef ISC_SOCKET_USE_POLLWATCH + pollstate_t pollstate = poll_idle; +#endif /* * Get the control fd here. This will never change. */ - LOCK(&manager->lock); ctlfd = manager->pipe_fds[0]; - done = ISC_FALSE; while (!done) { do { +#ifdef USE_KQUEUE + cc = kevent(manager->kqueue_fd, NULL, 0, + manager->events, manager->nevents, NULL); +#elif defined(USE_EPOLL) + cc = epoll_wait(manager->epoll_fd, manager->events, + manager->nevents, -1); +#elif defined(USE_DEVPOLL) + dvp.dp_fds = manager->events; + dvp.dp_nfds = manager->nevents; +#ifndef ISC_SOCKET_USE_POLLWATCH + dvp.dp_timeout = -1; +#else + if (pollstate == poll_idle) + dvp.dp_timeout = -1; + else + dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT; +#endif /* ISC_SOCKET_USE_POLLWATCH */ + cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp); +#elif defined(USE_SELECT) + LOCK(&manager->lock); memcpy(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize); memcpy(manager->write_fds_copy, manager->write_fds, manager->fd_bufsize); maxfd = manager->maxfd + 1; - UNLOCK(&manager->lock); cc = select(maxfd, manager->read_fds_copy, manager->write_fds_copy, NULL, NULL); - if (cc < 0) { - if (!SOFT_ERROR(errno)) { - isc__strerror(errno, strbuf, - sizeof(strbuf)); - FATAL_ERROR(__FILE__, __LINE__, - "select() %s: %s", - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, - "failed"), - strbuf); - } +#endif /* USE_KQUEUE */ + + if (cc < 0 && !SOFT_ERROR(errno)) { + isc__strerror(errno, strbuf, sizeof(strbuf)); + FATAL_ERROR(__FILE__, __LINE__, + "%s %s: %s", fnname, + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed"), strbuf); } - LOCK(&manager->lock); +#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) + if (cc == 0) { + if (pollstate == poll_active) + pollstate = poll_checking; + else if (pollstate == poll_checking) + pollstate = poll_idle; + } else if (cc > 0) { + if (pollstate == poll_checking) { + /* + * XXX: We'd like to use a more + * verbose log level as it's actually an + * unexpected event, but the kernel bug + * reportedly happens pretty frequently + * (and it can also be a false positive) + * so it would be just too noisy. + */ + manager_log(manager, + ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, + ISC_LOG_DEBUG(1), + "unexpected POLL timeout"); + } + pollstate = poll_active; + } +#endif } while (cc < 0); +#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL) + done = process_fds(manager, manager->events, cc); +#elif defined(USE_SELECT) + process_fds(manager, maxfd, manager->read_fds_copy, + manager->write_fds_copy); /* * Process reads on internal, control fd. */ - if (FD_ISSET(ctlfd, manager->read_fds_copy)) { - for (;;) { - select_readmsg(manager, &fd, &msg); - - manager_log(manager, IOEVENT, - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_SOCKET, - ISC_MSG_WATCHERMSG, - "watcher got message %d"), - msg); - - /* - * Nothing to read? - */ - if (msg == SELECT_POKE_NOTHING) - break; - - /* - * Handle shutdown message. We really should - * jump out of this loop right away, but - * it doesn't matter if we have to do a little - * more work first. - */ - if (msg == SELECT_POKE_SHUTDOWN) { - done = ISC_TRUE; - - break; - } - - /* - * This is a wakeup on a socket. Look - * at the event queue for both read and write, - * and decide if we need to watch on it now - * or not. - */ - wakeup_socket(manager, fd, msg); - } - } - - process_fds(manager, maxfd, manager->read_fds_copy, - manager->write_fds_copy); + if (FD_ISSET(ctlfd, manager->read_fds_copy)) + done = process_ctlfd(manager); +#endif } manager_log(manager, TRACE, isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_EXITING, "watcher exiting")); - UNLOCK(&manager->lock); return ((isc_threadresult_t)0); } #endif /* ISC_PLATFORM_USETHREADS */ @@ -2434,69 +3163,208 @@ isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) { } /* - * Initialize fdsets in socketmgr structure. + * Create a new socket manager. */ + static isc_result_t -create_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) { -#if ISC_SOCKET_FDSETSIZE > FD_SETSIZE - manager->fdsize = ISC_SOCKET_FDSETSIZE; - manager->fd_bufsize = howmany(ISC_SOCKET_FDSETSIZE, NFDBITS) * +setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { + isc_result_t result; +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) + char strbuf[ISC_STRERRORSIZE]; +#endif + +#ifdef USE_KQUEUE + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct kevent) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + manager->kqueue_fd = kqueue(); + if (manager->kqueue_fd == -1) { + result = isc__errno2result(errno); + isc__strerror(errno, strbuf, sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "kqueue %s: %s", + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed"), + strbuf); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); + return (result); + } + +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->kqueue_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_EPOLL) + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + manager->epoll_fd = epoll_create(manager->nevents); + if (manager->epoll_fd == -1) { + result = isc__errno2result(errno); + isc__strerror(errno, strbuf, sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "epoll_create %s: %s", + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed"), + strbuf); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); + return (result); + } +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->epoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_DEVPOLL) + /* + * XXXJT: /dev/poll seems to reject large numbers of events, + * so we should be careful about redefining ISC_SOCKET_MAXEVENTS. + */ + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct pollfd) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + /* + * Note: fdpollinfo should be able to support all possible FDs, so + * it must have maxsocks entries (not nevents). + */ + manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) * + manager->maxsocks); + if (manager->fdpollinfo == NULL) { + isc_mem_put(mctx, manager->events, + sizeof(pollinfo_t) * manager->maxsocks); + return (ISC_R_NOMEMORY); + } + memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks); + manager->devpoll_fd = open("/dev/poll", O_RDWR); + if (manager->devpoll_fd == -1) { + result = isc__errno2result(errno); + isc__strerror(errno, strbuf, sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "open(/dev/poll) %s: %s", + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed"), + strbuf); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); + return (result); + } +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->devpoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_SELECT) + UNUSED(result); + +#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE + /* + * Note: this code should also cover the case of MAXSOCKETS <= + * FD_SETSIZE, but we separate the cases to avoid possible portability + * issues regarding howmany() and the actual representation of fd_set. + */ + manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) * sizeof(fd_mask); #else - manager->fdsize = FD_SETSIZE; manager->fd_bufsize = sizeof(fd_set); #endif - manager->fds = NULL; - manager->fdstate = NULL; manager->read_fds = NULL; manager->read_fds_copy = NULL; manager->write_fds = NULL; manager->write_fds_copy = NULL; - manager->fds = isc_mem_get(mctx, - manager->fdsize * sizeof(manager->fds[0])); - if (manager->fds == NULL) - goto fail; - - manager->fdstate = isc_mem_get(mctx, manager->fdsize * - sizeof(manager->fdstate[0])); - if (manager->fdstate == NULL) - goto fail; - manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize); - if (manager->read_fds == NULL) - goto fail; - manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize); - if (manager->read_fds_copy == NULL) - goto fail; - manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize); - if (manager->write_fds == NULL) - goto fail; - manager->write_fds_copy = isc_mem_get(mctx, manager->fd_bufsize); - if (manager->write_fds_copy == NULL) - goto fail; + if (manager->read_fds != NULL) + manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize); + if (manager->read_fds_copy != NULL) + manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize); + if (manager->write_fds != NULL) { + manager->write_fds_copy = isc_mem_get(mctx, + manager->fd_bufsize); + } + if (manager->write_fds_copy == NULL) { + if (manager->write_fds != NULL) { + isc_mem_put(mctx, manager->write_fds, + manager->fd_bufsize); + } + if (manager->read_fds_copy != NULL) { + isc_mem_put(mctx, manager->read_fds_copy, + manager->fd_bufsize); + } + if (manager->read_fds != NULL) { + isc_mem_put(mctx, manager->read_fds, + manager->fd_bufsize); + } + return (ISC_R_NOMEMORY); + } + memset(manager->read_fds, 0, manager->fd_bufsize); + memset(manager->write_fds, 0, manager->fd_bufsize); - return (ISC_R_SUCCESS); +#ifdef ISC_PLATFORM_USETHREADS + (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + manager->maxfd = manager->pipe_fds[0]; +#else /* ISC_PLATFORM_USETHREADS */ + manager->maxfd = 0; +#endif /* ISC_PLATFORM_USETHREADS */ +#endif /* USE_KQUEUE */ - fail: - cleanup_fdsets(manager, mctx); - return (ISC_R_NOMEMORY); + return (ISC_R_SUCCESS); } -/* - * Clean up fdsets in socketmgr structure. - */ static void -cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) { - if (manager->fds != NULL) { - isc_mem_put(mctx, manager->fds, - manager->fdsize * sizeof(manager->fds[0])); - } - if (manager->fdstate != NULL) { - isc_mem_put(mctx, manager->fdstate, - manager->fdsize * sizeof(manager->fdstate[0])); +cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { +#ifdef ISC_PLATFORM_USETHREADS + isc_result_t result; + + result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + UNEXPECTED_ERROR(__FILE__, __LINE__, + "epoll_ctl(DEL) %s", + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed")); } +#endif /* ISC_PLATFORM_USETHREADS */ + +#ifdef USE_KQUEUE + close(manager->kqueue_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); +#elif defined(USE_EPOLL) + close(manager->epoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); +#elif defined(USE_DEVPOLL) + close(manager->devpoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); +#elif defined(USE_SELECT) if (manager->read_fds != NULL) isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize); if (manager->read_fds_copy != NULL) @@ -2505,13 +3373,19 @@ cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) { isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize); if (manager->write_fds_copy != NULL) isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize); +#endif /* USE_KQUEUE */ } -/* - * Create a new socket manager. - */ isc_result_t isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { + return (isc_socketmgr_create2(mctx, managerp, 0)); +} + +isc_result_t +isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, + unsigned int maxsocks) +{ + int i; isc_socketmgr_t *manager; #ifdef ISC_PLATFORM_USETHREADS char strbuf[ISC_STRERRORSIZE]; @@ -2522,46 +3396,71 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { #ifndef ISC_PLATFORM_USETHREADS if (socketmgr != NULL) { + /* Don't allow maxsocks to be updated */ + if (maxsocks > 0 && socketmgr->maxsocks != maxsocks) + return (ISC_R_EXISTS); + socketmgr->refs++; *managerp = socketmgr; return (ISC_R_SUCCESS); } #endif /* ISC_PLATFORM_USETHREADS */ + if (maxsocks == 0) + maxsocks = ISC_SOCKET_MAXSOCKETS; + manager = isc_mem_get(mctx, sizeof(*manager)); if (manager == NULL) return (ISC_R_NOMEMORY); - result = create_fdsets(manager, mctx); - if (result != ISC_R_SUCCESS) { - cleanup_fdsets(manager, mctx); - isc_mem_put(mctx, manager, sizeof(*manager)); - return (result); + /* zero-clear so that necessary cleanup on failure will be easy */ + memset(manager, 0, sizeof(*manager)); + manager->maxsocks = maxsocks; + manager->reserved = 0; + manager->fds = isc_mem_get(mctx, + manager->maxsocks * sizeof(isc_socket_t *)); + if (manager->fds == NULL) { + result = ISC_R_NOMEMORY; + goto free_manager; + } + manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int)); + if (manager->fds == NULL) { + result = ISC_R_NOMEMORY; + goto free_manager; } manager->magic = SOCKET_MANAGER_MAGIC; manager->mctx = NULL; + memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *)); ISC_LIST_INIT(manager->socklist); - result = isc_mutex_init(&manager->lock); - if (result != ISC_R_SUCCESS) { - cleanup_fdsets(manager, mctx); - isc_mem_put(mctx, manager, sizeof(*manager)); - UNEXPECTED_ERROR(__FILE__, __LINE__, - "isc_mutex_init() %s", - isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, "failed")); - return (ISC_R_UNEXPECTED); + result = isc_mutex_init(&manager->lock); + if (result != ISC_R_SUCCESS) + goto free_manager; + manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t)); + if (manager->fdlock == NULL) { + result = ISC_R_NOMEMORY; + goto cleanup_lock; } + for (i = 0; i < FDLOCK_COUNT; i++) { + result = isc_mutex_init(&manager->fdlock[i]); + if (result != ISC_R_SUCCESS) { + while (--i >= 0) + DESTROYLOCK(&manager->fdlock[i]); + isc_mem_put(mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + manager->fdlock = NULL; + goto cleanup_lock; + } + } + #ifdef ISC_PLATFORM_USETHREADS if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) { - cleanup_fdsets(manager, mctx); - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_condition_init() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed")); - return (ISC_R_UNEXPECTED); + result = ISC_R_UNEXPECTED; + goto cleanup_lock; } /* @@ -2569,17 +3468,14 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { * select/poll loop when something internal needs to be done. */ if (pipe(manager->pipe_fds) != 0) { - cleanup_fdsets(manager, mctx); - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); isc__strerror(errno, strbuf, sizeof(strbuf)); UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() %s: %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed"), strbuf); - - return (ISC_R_UNEXPECTED); + result = ISC_R_UNEXPECTED; + goto cleanup_condition; } RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS); @@ -2593,33 +3489,23 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { /* * Set up initial state for the select loop */ - memset(manager->read_fds, 0, manager->fd_bufsize); - memset(manager->write_fds, 0, manager->fd_bufsize); -#ifdef ISC_PLATFORM_USETHREADS - FD_SET(manager->pipe_fds[0], manager->read_fds); - manager->maxfd = manager->pipe_fds[0]; -#else /* ISC_PLATFORM_USETHREADS */ - manager->maxfd = 0; -#endif /* ISC_PLATFORM_USETHREADS */ - manager->reserved = 0; - memset(manager->fdstate, 0, - manager->fdsize * sizeof(manager->fdstate[0])); - + result = setup_watcher(mctx, manager); + if (result != ISC_R_SUCCESS) + goto cleanup; + memset(manager->fdstate, 0, manager->maxsocks * sizeof(int)); #ifdef ISC_PLATFORM_USETHREADS /* * Start up the select/poll thread. */ if (isc_thread_create(watcher, manager, &manager->watcher) != ISC_R_SUCCESS) { - (void)close(manager->pipe_fds[0]); - (void)close(manager->pipe_fds[1]); - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_thread_create() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed")); - return (ISC_R_UNEXPECTED); + cleanup_watcher(mctx, manager); + result = ISC_R_UNEXPECTED; + goto cleanup; } #endif /* ISC_PLATFORM_USETHREADS */ isc_mem_attach(mctx, &manager->mctx); @@ -2630,6 +3516,52 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { *managerp = manager; return (ISC_R_SUCCESS); + +cleanup: +#ifdef ISC_PLATFORM_USETHREADS + (void)close(manager->pipe_fds[0]); + (void)close(manager->pipe_fds[1]); +#endif /* ISC_PLATFORM_USETHREADS */ + +#ifdef ISC_PLATFORM_USETHREADS +cleanup_condition: + (void)isc_condition_destroy(&manager->shutdown_ok); +#endif /* ISC_PLATFORM_USETHREADS */ + + +cleanup_lock: + if (manager->fdlock != NULL) { + for (i = 0; i < FDLOCK_COUNT; i++) + DESTROYLOCK(&manager->fdlock[i]); + } + DESTROYLOCK(&manager->lock); + +free_manager: + if (manager->fdlock != NULL) { + isc_mem_put(mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + } + if (manager->fdstate != NULL) { + isc_mem_put(mctx, manager->fdstate, + manager->maxsocks * sizeof(int)); + } + if (manager->fds != NULL) { + isc_mem_put(mctx, manager->fds, + manager->maxsocks * sizeof(isc_socket_t *)); + } + isc_mem_put(mctx, manager, sizeof(*manager)); + + return (result); +} + +isc_result_t +isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) { + REQUIRE(VALID_MANAGER(manager)); + REQUIRE(nsockp != NULL); + + *nsockp = manager->maxsocks; + + return (ISC_R_SUCCESS); } void @@ -2703,18 +3635,30 @@ isc_socketmgr_destroy(isc_socketmgr_t **managerp) { /* * Clean up. */ + cleanup_watcher(manager->mctx, manager); + #ifdef ISC_PLATFORM_USETHREADS (void)close(manager->pipe_fds[0]); (void)close(manager->pipe_fds[1]); (void)isc_condition_destroy(&manager->shutdown_ok); #endif /* ISC_PLATFORM_USETHREADS */ - for (i = 0; i < (int)manager->fdsize; i++) - if (manager->fdstate[i] == CLOSE_PENDING) + for (i = 0; i < (int)manager->maxsocks; i++) + if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */ (void)close(i); + isc_mem_put(manager->mctx, manager->fds, + manager->maxsocks * sizeof(isc_socket_t *)); + isc_mem_put(manager->mctx, manager->fdstate, + manager->maxsocks * sizeof(int)); + + if (manager->fdlock != NULL) { + for (i = 0; i < FDLOCK_COUNT; i++) + DESTROYLOCK(&manager->fdlock[i]); + isc_mem_put(manager->mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + } DESTROYLOCK(&manager->lock); - cleanup_fdsets(manager, manager->mctx); manager->magic = 0; mctx= manager->mctx; isc_mem_put(mctx, manager, sizeof(*manager)); @@ -2767,7 +3711,7 @@ socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, * Enqueue the request. If the socket was previously not being * watched, poke the watcher to start paying attention to it. */ - if (ISC_LIST_EMPTY(sock->recv_list)) + if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv) select_poke(sock->manager, sock->fd, SELECT_POKE_READ); ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link); @@ -2964,7 +3908,8 @@ socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, * not being watched, poke the watcher to start * paying attention to it. */ - if (ISC_LIST_EMPTY(sock->send_list)) + if (ISC_LIST_EMPTY(sock->send_list) && + !sock->pending_send) select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link); @@ -3222,7 +4167,7 @@ isc_socket_listen(isc_socket_t *sock, unsigned int backlog) { } /* - * This should try to do agressive accept() XXXMLG + * This should try to do aggressive accept() XXXMLG */ isc_result_t isc_socket_accept(isc_socket_t *sock, @@ -3333,6 +4278,16 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, sock->address = *addr; cc = connect(sock->fd, &addr->type.sa, addr->length); if (cc < 0) { + /* + * HP-UX "fails" to connect a UDP socket and sets errno to + * EINPROGRESS if it's non-blocking. We'd rather regard this as + * a success and let the user detect it if it's really an error + * at the time of sending a packet on the socket. + */ + if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) { + cc = 0; + goto success; + } if (SOFT_ERROR(errno) || errno == EINPROGRESS) goto queue; @@ -3374,6 +4329,7 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, /* * If connect completed, fire off the done event. */ + success: if (cc == 0) { sock->connected = 1; sock->bound = 1; @@ -3733,37 +4689,107 @@ isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) { #ifdef IPV6_V6ONLY if (sock->pf == AF_INET6) { - (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, - (void *)&onoff, sizeof(onoff)); + if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, + (void *)&onoff, sizeof(int)) < 0) { + char strbuf[ISC_STRERRORSIZE]; + + UNEXPECTED_ERROR(__FILE__, __LINE__, + "setsockopt(%d, IPV6_V6ONLY) " + "%s: %s", sock->fd, + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed"), + strbuf); + } } + FIX_IPV6_RECVPKTINFO(sock); /* AIX */ #endif } #ifndef ISC_PLATFORM_USETHREADS -void -isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd) { +/* In our assumed scenario, we can simply use a single static object. */ +static isc_socketwait_t swait_private; + +int +isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) { + int n; +#ifdef USE_KQUEUE + struct timespec ts, *tsp; +#endif +#ifdef USE_EPOLL + int timeout; +#endif +#ifdef USE_DEVPOLL + struct dvpoll dvp; +#endif + + REQUIRE(swaitp != NULL && *swaitp == NULL); + if (socketmgr == NULL) - *maxfd = 0; - else { - /* Prepare duplicates of fd_sets, as select() will modify */ - memcpy(socketmgr->read_fds_copy, socketmgr->read_fds, - socketmgr->fd_bufsize); - memcpy(socketmgr->write_fds_copy, socketmgr->write_fds, - socketmgr->fd_bufsize); - *readset = socketmgr->read_fds_copy; - *writeset = socketmgr->write_fds_copy; - *maxfd = socketmgr->maxfd + 1; - } + return (0); + +#ifdef USE_KQUEUE + if (tvp != NULL) { + ts.tv_sec = tvp->tv_sec; + ts.tv_nsec = tvp->tv_usec * 1000; + tsp = &ts; + } else + tsp = NULL; + swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0, + socketmgr->events, socketmgr->nevents, + tsp); + n = swait_private.nevents; +#elif defined(USE_EPOLL) + if (tvp != NULL) + timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000; + else + timeout = -1; + swait_private.nevents = epoll_wait(socketmgr->epoll_fd, + socketmgr->events, + socketmgr->nevents, timeout); + n = swait_private.nevents; +#elif defined(USE_DEVPOLL) + dvp.dp_fds = socketmgr->events; + dvp.dp_nfds = socketmgr->nevents; + if (tvp != NULL) { + dvp.dp_timeout = tvp->tv_sec * 1000 + + (tvp->tv_usec + 999) / 1000; + } else + dvp.dp_timeout = -1; + swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp); + n = swait_private.nevents; +#elif defined(USE_SELECT) + memcpy(socketmgr->read_fds_copy, socketmgr->read_fds, + socketmgr->fd_bufsize); + memcpy(socketmgr->write_fds_copy, socketmgr->write_fds, + socketmgr->fd_bufsize); + + swait_private.readset = socketmgr->read_fds_copy; + swait_private.writeset = socketmgr->write_fds_copy; + swait_private.maxfd = socketmgr->maxfd + 1; + + n = select(swait_private.maxfd, swait_private.readset, + swait_private.writeset, NULL, tvp); +#endif + + *swaitp = &swait_private; + return (n); } isc_result_t -isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) { - isc_socketmgr_t *manager = socketmgr; +isc__socketmgr_dispatch(isc_socketwait_t *swait) { + REQUIRE(swait == &swait_private); - if (manager == NULL) + if (socketmgr == NULL) return (ISC_R_NOTFOUND); - process_fds(manager, maxfd, readset, writeset); +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) + (void)process_fds(socketmgr, socketmgr->events, swait->nevents); return (ISC_R_SUCCESS); +#elif defined(USE_SELECT) + process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset); + return (ISC_R_SUCCESS); +#endif } #endif /* ISC_PLATFORM_USETHREADS */ diff --git a/lib/isc/unix/socket_p.h b/lib/isc/unix/socket_p.h index 07720eef656c7..5786facad9c89 100644 --- a/lib/isc/unix/socket_p.h +++ b/lib/isc/unix/socket_p.h @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 2000, 2001 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket_p.h,v 1.6.206.1.34.1 2008/07/29 04:43:57 each Exp $ */ +/* $Id: socket_p.h,v 1.6.206.3 2008/06/25 23:45:37 tbox Exp $ */ #ifndef ISC_SOCKET_P_H #define ISC_SOCKET_P_H @@ -24,10 +24,7 @@ #include <sys/select.h> #endif -void -isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd); - -isc_result_t -isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd); - +typedef struct isc_socketwait isc_socketwait_t; +int isc__socketmgr_waitevents(struct timeval *, isc_socketwait_t **); +isc_result_t isc__socketmgr_dispatch(isc_socketwait_t *); #endif /* ISC_SOCKET_P_H */ diff --git a/lib/isc/unix/time.c b/lib/isc/unix/time.c index 39c851cebe9a9..c4b7f13600eba 100644 --- a/lib/isc/unix/time.c +++ b/lib/isc/unix/time.c @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1998-2001, 2003 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: time.c,v 1.34.2.6.2.4 2004/03/06 08:15:03 marka Exp $ */ +/* $Id: time.c,v 1.34.2.6.2.6 2008/04/28 23:45:41 tbox Exp $ */ #include <config.h> @@ -225,7 +225,7 @@ isc_time_nowplusinterval(isc_time_t *t, const isc_interval_t *i) { t->seconds = tv.tv_sec + i->seconds; t->nanoseconds = tv.tv_usec * NS_PER_US + i->nanoseconds; - if (t->nanoseconds > NS_PER_S) { + if (t->nanoseconds >= NS_PER_S) { t->seconds++; t->nanoseconds -= NS_PER_S; } @@ -408,5 +408,5 @@ isc_time_formattimestamp(const isc_time_t *t, char *buf, unsigned int len) { snprintf(buf + flen, len - flen, ".%03u", t->nanoseconds / 1000000); else - snprintf(buf, len, "99-Bad-9999 99:99:99.999"); + snprintf(buf, len, "99-Bad-9999 99:99:99.999"); } |