diff options
74 files changed, 1039 insertions, 560 deletions
@@ -27,6 +27,9 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 16.x IS SLOW: world, or to merely disable the most expensive debugging functionality at runtime, run "ln -s 'abort:false,junk:false' /etc/malloc.conf".) +20251105: + pf(4) now supports nat64 via the af-to keyword. + 20251102: Commit e5aa60d06958 changed the internal KAPI between the NFS modules. As such, they all need to be rebuilt diff --git a/bin/date/date.1 b/bin/date/date.1 index b86a660a924d..f68892bd408d 100644 --- a/bin/date/date.1 +++ b/bin/date/date.1 @@ -29,7 +29,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd September 1, 2025 +.Dd November 5, 2025 .Dt DATE 1 .Os .Sh NAME @@ -143,7 +143,8 @@ values are .Cm minutes , .Cm seconds , and -.Cm ns No Pq for nanoseconds . +.Cm ns +.Pq for nanoseconds . The date and time is formatted to the specified precision. When .Ar FMT @@ -172,7 +173,7 @@ Obsolete flag, accepted and ignored for compatibility. .It Fl R Use RFC 2822 date and time output format. This is equivalent to using -.Dq Li %a, %d %b %Y \&%T %z +.Ql %a, %d %b %Y \&%T %z as .Ar output_fmt while @@ -194,9 +195,7 @@ and can be specified in decimal, octal, or hex. Print the date and time of the last modification of .Ar filename . .It Fl u -Display or set the date in -.Tn UTC -(Coordinated Universal) time. +Display or set the date in UTC (Coordinated Universal) time. By default .Nm displays the time in the time zone described by @@ -328,7 +327,7 @@ The format string may contain any of the conversion specifications described in the .Xr strftime 3 manual page and -.Ql %N +.Ql \&%N for nanoseconds, as well as any arbitrary text. A newline .Pq Ql \en @@ -468,7 +467,7 @@ will display: .Dl "Sun Jan 4 04:15:24 GMT 1998" .Pp where it is currently -.Li "Mon Aug 4 04:15:24 BST 1997" . +.Ql "Mon Aug 4 04:15:24 BST 1997" . .Pp The command: .Pp @@ -493,29 +492,31 @@ will display the last Friday of the month: .Dl "Fri Aug 29 04:31:11 BST 1997" .Pp where it is currently -.Li "Mon Aug 4 04:31:11 BST 1997" . +.Ql "Mon Aug 4 04:31:11 BST 1997" . .Pp The command: .Pp .Dl "date 8506131627" .Pp sets the date to -.Dq Li "June 13, 1985, 4:27 PM" . +.Ql "June 13, 1985, 4:27 PM" . .Pp .Dl "date ""+%Y%m%d%H%M.%S""" .Pp may be used on one machine to print out the date suitable for setting on another. -.Qq ( Li "+%m%d%H%M%Y.%S" -for use on -.Tn Linux . ) +.Po Use +.Ql "+%m%d%H%M%Y.%S" +with GNU date on +Linux . +.Pc .Pp The command: .Pp .Dl "date 1432" .Pp sets the time to -.Li "2:32 PM" , +.Ql "2:32 PM" , without modifying the date. .Pp The command @@ -591,10 +592,10 @@ flag is compatible with .St -iso8601 . .Pp The -.Ql %N +.Ql \&%N conversion specification for nanoseconds is a non-standard extension. It is compatible with GNU date's -.Ql %N . +.Ql \&%N . .Sh HISTORY A .Nm @@ -615,6 +616,6 @@ flag was added in .Fx 12.0 . .Pp The -.Ql %N +.Ql \&%N conversion specification was added in .Fx 14.1 . diff --git a/cddl/contrib/opensolaris/cmd/dtrace/dtrace.1 b/cddl/contrib/opensolaris/cmd/dtrace/dtrace.1 index f09cbe1ac27b..456a9e319987 100644 --- a/cddl/contrib/opensolaris/cmd/dtrace/dtrace.1 +++ b/cddl/contrib/opensolaris/cmd/dtrace/dtrace.1 @@ -20,7 +20,7 @@ .\" .\" $FreeBSD$ .\" -.Dd November 3, 2025 +.Dd November 4, 2025 .Dt DTRACE 1 .Os .Sh NAME @@ -1292,6 +1292,7 @@ in .Xr cpp 1 , .Xr dwatch 1 , .Xr dtrace_audit 4 , +.Xr dtrace_callout_execute 4 , .Xr dtrace_dtrace 4 , .Xr dtrace_fbt 4 , .Xr dtrace_io 4 , diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index ea333a38d889..97f2194a3fa1 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -2,17 +2,17 @@ # Please see the file src/etc/mtree/README before making changes to this file. # -/set type=dir uname=root gname=wheel mode=0755 +/set type=dir uname=root gname=wheel mode=0755 tags=package=clibs-dev . arpa .. - atf-c + atf-c tags=package=atf-dev .. - atf-c++ + atf-c++ tags=package=atf-dev .. - bsm + bsm tags=package=audit-dev .. - bsnmp + bsnmp tags=package=bsnmp-dev .. c++ v1 @@ -118,7 +118,7 @@ scsi .. .. - casper + casper tags=package=libcasper-dev .. crypto .. @@ -190,10 +190,10 @@ wg .. .. - devdctl + devdctl tags=package=utilities-dev .. - edit - readline + edit tags=package=runtime-dev + readline tags=package=runtime-dev .. .. fs @@ -252,39 +252,39 @@ virstor .. .. - gssapi + gssapi tags=package=kerberos-dev .. - gssrpc + gssrpc tags=package=kerberos-dev .. - infiniband - complib + infiniband tags=package=utilities-dev + complib tags=package=utilities-dev .. - iba + iba tags=package=utilities-dev .. - opensm + opensm tags=package=utilities-dev .. - vendor + vendor tags=package=utilities-dev .. .. isofs cd9660 .. .. - kadm5 + kadm5 tags=package=kerberos-dev .. krb5 tags=package=kerberos-dev .. - lib80211 + lib80211 tags=package=runtime-dev .. - lib9p + lib9p tags=package=lib9p-dev .. - libipt + libipt tags=package=libipt-dev .. - libmilter + libmilter tags=package=libmilter-dev .. - libxo + libxo tags=package=runtime-dev .. - lzma + lzma tags=package=xz-dev .. machine pc @@ -323,7 +323,7 @@ .. .. netpfil - pf + pf tags=package=pf-dev .. .. netsmb @@ -348,20 +348,20 @@ stm .. .. - openssl + openssl tags=package=openssl-dev .. - pcap + pcap tags=package=utilities-dev .. protocols .. - rdma + rdma tags=package=utilities-dev .. rpc .. rpcsvc .. security - audit + audit tags=package=audit-dev .. mac_biba .. diff --git a/etc/mtree/BSD.usr.dist b/etc/mtree/BSD.usr.dist index 6a8c155e5e73..54d408865fa5 100644 --- a/etc/mtree/BSD.usr.dist +++ b/etc/mtree/BSD.usr.dist @@ -6,35 +6,35 @@ . bin .. - include - private - bsddialog + include tags=package=clibs-dev + private tags=package=clibs-dev + bsddialog tags=package=utilities-dev .. - bsdstat + bsdstat tags=package=libbsdstat-dev .. - event1 + event1 tags=package=libevent1-dev .. - gmock - internal - custom + gmock tags=package=utilities-dev + internal tags=package=utilities-dev + custom tags=package=utilities-dev .. .. .. - gtest - internal - custom + gtest tags=package=utilities-dev + internal tags=package=utilities-dev + custom tags=package=utilities-dev .. .. .. - samplerate + samplerate tags=package=sound-dev .. - sqlite3 + sqlite3 tags=package=libsqlite3-dev .. - ucl + ucl tags=package=libucl-dev .. - yaml + yaml tags=package=libyaml-dev .. - zstd + zstd tags=package=runtime-dev .. .. .. diff --git a/krb5/lib/kadm5clnt/Makefile b/krb5/lib/kadm5clnt/Makefile index 52a7187cf9bb..ef01a5f779e3 100644 --- a/krb5/lib/kadm5clnt/Makefile +++ b/krb5/lib/kadm5clnt/Makefile @@ -86,7 +86,8 @@ ${CHPASS_UTIL_STRINGS_ERR_C}: ${CHPASS_UTIL_STRINGS_ERR} rm -f et-c-${.PREFIX}.et et-c-${.PREFIX}.c afterinstall: - ${INSTALL_LIBSYMLINK} ${SHLIB_LINK} ${DESTDIR}${LIBDIR}/libkadm5clnt.so + ${INSTALL_LIBSYMLINK} ${DEV_TAG_ARGS} ${SHLIB_LINK} \ + ${DESTDIR}${LIBDIR}/libkadm5clnt.so .include <bsd.lib.mk> diff --git a/lib/libcasper/Makefile.inc b/lib/libcasper/Makefile.inc index 00bd221feb27..73a761ba4ce3 100644 --- a/lib/libcasper/Makefile.inc +++ b/lib/libcasper/Makefile.inc @@ -1,5 +1,7 @@ .include <src.opts.mk> +PACKAGE?= libcasper + .if ${MK_CASPER} != "no" CFLAGS+=-DWITH_CASPER .endif diff --git a/lib/libcasper/libcasper/Makefile b/lib/libcasper/libcasper/Makefile index 4db26f665f19..1a794791570f 100644 --- a/lib/libcasper/libcasper/Makefile +++ b/lib/libcasper/libcasper/Makefile @@ -1,5 +1,3 @@ -PACKAGE= runtime - SHLIBDIR?= /lib .include <src.opts.mk> diff --git a/lib/libcasper/services/cap_dns/Makefile b/lib/libcasper/services/cap_dns/Makefile index 4b11c97d29e5..b090c553bd28 100644 --- a/lib/libcasper/services/cap_dns/Makefile +++ b/lib/libcasper/services/cap_dns/Makefile @@ -2,8 +2,6 @@ SHLIBDIR?= /lib .include <src.opts.mk> -PACKAGE= runtime - SHLIB_MAJOR= 2 INCSDIR?= ${INCLUDEDIR}/casper diff --git a/lib/libcasper/services/cap_fileargs/Makefile b/lib/libcasper/services/cap_fileargs/Makefile index 2c52d0887a48..9d70d0ab9237 100644 --- a/lib/libcasper/services/cap_fileargs/Makefile +++ b/lib/libcasper/services/cap_fileargs/Makefile @@ -2,8 +2,6 @@ SHLIBDIR?= /lib .include <src.opts.mk> -PACKAGE= runtime - SHLIB_MAJOR= 1 INCSDIR?= ${INCLUDEDIR}/casper diff --git a/lib/libcasper/services/cap_grp/Makefile b/lib/libcasper/services/cap_grp/Makefile index a921dfa87e7c..13e695813bcf 100644 --- a/lib/libcasper/services/cap_grp/Makefile +++ b/lib/libcasper/services/cap_grp/Makefile @@ -2,8 +2,6 @@ SHLIBDIR?= /lib .include <src.opts.mk> -PACKAGE= runtime - SHLIB_MAJOR= 1 INCSDIR?= ${INCLUDEDIR}/casper diff --git a/lib/libcasper/services/cap_net/Makefile b/lib/libcasper/services/cap_net/Makefile index 1ba35a674a05..4e9814118c41 100644 --- a/lib/libcasper/services/cap_net/Makefile +++ b/lib/libcasper/services/cap_net/Makefile @@ -2,8 +2,6 @@ SHLIBDIR?= /lib .include <src.opts.mk> -PACKAGE=libcasper - SHLIB_MAJOR= 1 INCSDIR?= ${INCLUDEDIR}/casper diff --git a/lib/libcasper/services/cap_netdb/Makefile b/lib/libcasper/services/cap_netdb/Makefile index 853052e78d04..a330eeedeb11 100644 --- a/lib/libcasper/services/cap_netdb/Makefile +++ b/lib/libcasper/services/cap_netdb/Makefile @@ -2,8 +2,6 @@ SHLIBDIR?= /lib .include <src.opts.mk> -PACKAGE= runtime - SHLIB_MAJOR= 1 INCSDIR?= ${INCLUDEDIR}/casper diff --git a/lib/libcasper/services/cap_pwd/Makefile b/lib/libcasper/services/cap_pwd/Makefile index a1e97845c736..ba8df80d5ad7 100644 --- a/lib/libcasper/services/cap_pwd/Makefile +++ b/lib/libcasper/services/cap_pwd/Makefile @@ -2,8 +2,6 @@ SHLIBDIR?= /lib .include <src.opts.mk> -PACKAGE= runtime - SHLIB_MAJOR= 1 INCSDIR?= ${INCLUDEDIR}/casper diff --git a/lib/libcasper/services/cap_sysctl/Makefile b/lib/libcasper/services/cap_sysctl/Makefile index 522313df4ffc..4408bad4efb4 100644 --- a/lib/libcasper/services/cap_sysctl/Makefile +++ b/lib/libcasper/services/cap_sysctl/Makefile @@ -2,8 +2,6 @@ SHLIBDIR?= /lib .include <src.opts.mk> -PACKAGE= runtime - SHLIB_MAJOR= 2 INCSDIR?= ${INCLUDEDIR}/casper diff --git a/lib/libcasper/services/cap_syslog/Makefile b/lib/libcasper/services/cap_syslog/Makefile index 88979d8bed23..d18ad6d76ede 100644 --- a/lib/libcasper/services/cap_syslog/Makefile +++ b/lib/libcasper/services/cap_syslog/Makefile @@ -2,8 +2,6 @@ SHLIBDIR?= /lib .include <src.opts.mk> -PACKAGE= runtime - SHLIB_MAJOR= 1 INCSDIR?= ${INCLUDEDIR}/casper diff --git a/lib/libcasper/services/tests/Makefile b/lib/libcasper/services/tests/Makefile index 29b1b564beca..4b6c72fd86e8 100644 --- a/lib/libcasper/services/tests/Makefile +++ b/lib/libcasper/services/tests/Makefile @@ -1,4 +1,6 @@ .PATH: ${SRCTOP}/tests + +PACKAGE= tests KYUAFILE= yes .include <bsd.test.mk> diff --git a/lib/libcasper/tests/Makefile b/lib/libcasper/tests/Makefile index 29b1b564beca..4b6c72fd86e8 100644 --- a/lib/libcasper/tests/Makefile +++ b/lib/libcasper/tests/Makefile @@ -1,4 +1,6 @@ .PATH: ${SRCTOP}/tests + +PACKAGE= tests KYUAFILE= yes .include <bsd.test.mk> diff --git a/lib/libpfctl/libpfctl.c b/lib/libpfctl/libpfctl.c index e739e55033e2..e747763ae6ef 100644 --- a/lib/libpfctl/libpfctl.c +++ b/lib/libpfctl/libpfctl.c @@ -1491,7 +1491,7 @@ snl_attr_get_pf_rule_labels(struct snl_state *ss, struct nlattr *nla, bool ret; if (l->i >= PF_RULE_MAX_LABEL_COUNT) - return (E2BIG); + return (false); ret = snl_attr_copy_string(ss, nla, (void *)PF_RULE_LABEL_SIZE, l->labels[l->i]); @@ -1561,7 +1561,7 @@ snl_attr_get_pf_timeout(struct snl_state *ss, struct nlattr *nla, bool ret; if (t->i >= PFTM_MAX) - return (E2BIG); + return (false); ret = snl_attr_get_uint32(ss, nla, NULL, &t->timeouts[t->i]); if (ret) @@ -2742,6 +2742,88 @@ int pfctl_table_get_addrs(int dev, struct pfr_table *tbl, struct pfr_addr *addr, return (0); } +struct nl_addrs { + size_t max; + struct pfr_addr *addrs; + size_t count; + size_t total_count; +}; + +#define _OUT(_field) offsetof(struct pfr_addr, _field) +static const struct snl_attr_parser ap_pfr_addr[] = { + { .type = PFR_A_AF, .off = _OUT(pfra_af), .cb = snl_attr_get_uint32 }, + { .type = PFR_A_NET, .off = _OUT(pfra_net), .cb = snl_attr_get_uint8 }, + { .type = PFR_A_NOT, .off = _OUT(pfra_not), .cb = snl_attr_get_bool }, + { .type = PFR_A_ADDR, .off = _OUT(pfra_ip6addr), .cb = snl_attr_get_in6_addr }, +}; +#undef _OUT +SNL_DECLARE_ATTR_PARSER(pfr_addr_parser, ap_pfr_addr); + +static bool +snl_attr_get_pfr_addrs(struct snl_state *ss, struct nlattr *nla, + const void *arg __unused, void *target) +{ + struct nl_addrs *a = (struct nl_addrs *)target; + bool ret; + + if (a->count >= a->max) + return (false); + + ret = snl_parse_header(ss, NLA_DATA(nla), NLA_DATA_LEN(nla), + &pfr_addr_parser, &a->addrs[a->count]); + if (ret) + a->count++; + + return (ret); +} + +#define _OUT(_field) offsetof(struct nl_addrs, _field) +static struct snl_attr_parser ap_table_get_addr[] = { + { .type = PF_TA_ADDR, .off = 0, .cb = snl_attr_get_pfr_addrs }, + { .type = PF_TA_ADDR_COUNT, .off = _OUT(total_count), .cb = snl_attr_get_uint32 }, +}; +#undef _OUT +SNL_DECLARE_PARSER(table_get_addr_parser, struct genlmsghdr, snl_f_p_empty, ap_table_get_addr); +int +pfctl_table_get_addrs_h(struct pfctl_handle *h, struct pfr_table *tbl, + struct pfr_addr *addr, int *size, int flags) +{ + struct nl_addrs addrs = { 0 }; + struct snl_writer nw; + struct snl_errmsg_data e = {}; + struct nlmsghdr *hdr; + uint32_t seq_id; + int family_id; + + family_id = snl_get_genl_family(&h->ss, PFNL_FAMILY_NAME); + if (family_id == 0) + return (ENOTSUP); + + snl_init_writer(&h->ss, &nw); + hdr = snl_create_genl_msg_request(&nw, family_id, PFNL_CMD_TABLE_GET_ADDR); + + snl_add_msg_attr_table(&nw, PF_TA_TABLE, tbl); + snl_add_msg_attr_u32(&nw, PF_TA_FLAGS, flags); + + if ((hdr = snl_finalize_msg(&nw)) == NULL) + return (ENXIO); + + seq_id = hdr->nlmsg_seq; + if (! snl_send_message(&h->ss, hdr)) + return (ENXIO); + + addrs.addrs = addr; + addrs.max = *size; + while ((hdr = snl_read_reply_multi(&h->ss, seq_id, &e)) != NULL) { + if (! snl_parse_nlmsg(&h->ss, hdr, &table_get_addr_parser, &addrs)) + continue; + } + + *size = addrs.total_count; + + return (e.error); +} + int pfctl_set_statusif(struct pfctl_handle *h, const char *ifname) { diff --git a/lib/libpfctl/libpfctl.h b/lib/libpfctl/libpfctl.h index ae4b18dabe75..a5b7e1c23bd0 100644 --- a/lib/libpfctl/libpfctl.h +++ b/lib/libpfctl/libpfctl.h @@ -529,6 +529,8 @@ int pfctl_table_set_addrs_h(struct pfctl_handle *h, struct pfr_table *tbl, int pfctl_table_set_addrs(int dev, struct pfr_table *tbl, struct pfr_addr *addr, int size, int *size2, int *nadd, int *ndel, int *nchange, int flags); +int pfctl_table_get_addrs_h(struct pfctl_handle *h, struct pfr_table *tbl, struct pfr_addr *addr, + int *size, int flags); int pfctl_table_get_addrs(int dev, struct pfr_table *tbl, struct pfr_addr *addr, int *size, int flags); int pfctl_set_statusif(struct pfctl_handle *h, const char *ifname); diff --git a/lib/libsys/Makefile.sys b/lib/libsys/Makefile.sys index bd65b58083c2..1d1a4f1136ce 100644 --- a/lib/libsys/Makefile.sys +++ b/lib/libsys/Makefile.sys @@ -243,6 +243,7 @@ MAN+= abort2.2 \ jail.2 \ kcmp.2 \ kenv.2 \ + kexec_load.2 \ kill.2 \ kldfind.2 \ kldfirstmod.2 \ diff --git a/lib/libsys/kexec_load.2 b/lib/libsys/kexec_load.2 new file mode 100644 index 000000000000..c3d458f34100 --- /dev/null +++ b/lib/libsys/kexec_load.2 @@ -0,0 +1,119 @@ +.\" +.\" SPDX-License-Identifier: BSD-3-Clause +.\" +.\" Copyright (c) 2025 Juniper Networks, Inc. +.\" +.Dd October 29, 2025 +.Dt KEXEC_LOAD 2 +.Os +.Sh NAME +.Nm kexec_load +.Nd prepare new kernel to reboot into +.Sh SYNOPSIS +.Lb libc +.In sys/kexec.h +.Ft int +.Fn kexec_load "uint64_t entry" "unsigned long count" \ + "struct kexec_segment *segments" "unsigned long flags" +.Sh DESCRIPTION +The +.Fn kexec_load +system call loads a new kernel that can be executed later by +.Xr reboot 2 . +Subsequent calls will replace previously loaded images. +.Pp +The +.Fa flags +argument is a bitmask of flags that control the operation of the call. +This argument is present for compatibility with Linux, although it is currently +unused and must be 0. +.Pp +The +.Fa entry +argument is the physical address of the entry point of the new kernel image. +.Pp +The +.Fa count +argument is the number of segments in the image, currently limited to 16. +A value of 0 will unload the currently staged image, if one exists, without +staging a new image. +.Pp +The +.Fa segments +argument is an array of +.Fa count +members of the following structure: +.Bd -literal -offset indent +struct kexec_segment { + void *buf; + size_t bufsz; + vm_paddr_t mem; + vm_size_t memsz; +}; +.Ed +.Pp +The +.Va buf +and +.Va bufsz +members specify a memory region in the caller's address space containing the +source of the segment. +The +.Va mem +and +.Va memsz +members specify the target physical region of the segment. +.Va bufsz +must be less than or equal to +.Va memsz , +and +.Va mem +and +.Va memsz +must be page aligned. +The region covered by +.Va mem +must be in the list covered by the +.Va vm.phys_segs +sysctl. +.Pp +The +.Fn kexec_load +system call stages the kernel image in safe memory along with all +machine-dependent image data until +.Xr reboot 2 +is called with the +.Va RB_KEXEC +flag to load the image and execute the new kernel. +.Sh RETURN VALUES +The +.Fn kexec_load +system call returns 0 on success. +On failure, -1 is returned, and +.Va errno +is set to indicate the error. +On success any previously loaded image is unloaded and replaced with the new +image. +On failure, the previously loaded image is unchanged. +.Sh ERRORS +The following errors may be returned: +.Bl -tag -width Er +.It Bq Er EINVAL +Too many segments in image. +.It Bq Er EINVAL +The value of +.Va bufsz +is larger than +.Va memsz +in one or more segments. +.It Bq Er EINVAL +Machine-dependent load error. +.It Bq Er EBUSY +Another +.Fn kexec_load +call is in progress. +.Sh HISTORY +The +.Nm +system call appeared in +.Fx 16.0 . diff --git a/lib/libsys/posix_fallocate.2 b/lib/libsys/posix_fallocate.2 index 8be075b41331..94858c4a0f90 100644 --- a/lib/libsys/posix_fallocate.2 +++ b/lib/libsys/posix_fallocate.2 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd March 30, 2020 +.Dd November 2, 2025 .Dt POSIX_FALLOCATE 2 .Os .Sh NAME @@ -105,8 +105,7 @@ The .Fa len argument was less than or equal to zero, the .Fa offset -argument was less than zero, -or the operation is not supported by the file system. +argument was less than zero. .It Bq Er EIO An I/O error occurred while reading from or writing to a file system. .It Bq Er EINTEGRITY @@ -123,6 +122,8 @@ media. The file descriptor .Fa fd has insufficient rights. +.It Bq Er EOPNOTSUPP +The operation is not supported by the file system. .It Bq Er ESPIPE The .Fa fd @@ -137,12 +138,29 @@ argument is associated with a pipe or FIFO. The .Fn posix_fallocate system call conforms to -.St -p1003.1-2004 . +.St -p1003.1-2024 . .Sh HISTORY The .Fn posix_fallocate function appeared in .Fx 9.0 . +.Pp +Previous versions of +.Nm +used +.Er EINVAL +to indicate that the operation is not supported by the file system, as specified +in +.St -p1003.1 +Base Specifications, Issue 7. +.St -p1003.1 +Base Specifications, Issue 8 switched to requiring +.Er EOPNOTSUPP +for this error case. +ZFS adopted the latter convention in +.Fx 15.0 , +and the remaining filesystems in base adopted it in +.Fx 15.1 . .Sh AUTHORS .Fn posix_fallocate and this manual page were initially written by diff --git a/lib/libsys/reboot.2 b/lib/libsys/reboot.2 index f6c7bf6c83cc..54fa8b599cd5 100644 --- a/lib/libsys/reboot.2 +++ b/lib/libsys/reboot.2 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd July 10, 2018 +.Dd October 29, 2025 .Dt REBOOT 2 .Os .Sh NAME @@ -126,6 +126,10 @@ on the console. is actually interpreted by the .Xr init 8 program in the newly booted system. +.It Dv RB_KEXEC +Execute a new kernel loaded via +.Fn kexec_load 2 . +If no kernel was loaded, reboot as normal. .El .Pp When no options are given (i.e., @@ -149,6 +153,7 @@ variable The caller is not the super-user. .El .Sh SEE ALSO +.Xr kexec_load 2 , .Xr crash 8 , .Xr halt 8 , .Xr init 8 , diff --git a/release/packages/sets/base-dbg.ucl b/release/packages/sets/base-dbg.ucl index 79e5de22522e..c96b10416dcb 100644 --- a/release/packages/sets/base-dbg.ucl +++ b/release/packages/sets/base-dbg.ucl @@ -28,5 +28,8 @@ deps { }, "set-devel-dbg" { version = "${VERSION}" + }, + "set-optional-dbg" { + version = "${VERSION}" } } diff --git a/sbin/ifconfig/ifbridge.c b/sbin/ifconfig/ifbridge.c index eff443447c13..8bcf4a638adf 100644 --- a/sbin/ifconfig/ifbridge.c +++ b/sbin/ifconfig/ifbridge.c @@ -811,7 +811,7 @@ unsetbridge_private(if_ctx *ctx, const char *val, int dummy __unused) static int parse_vlans(ifbvlan_set_t *set, const char *str) { - char *s, *token; + char *s, *free_s, *token; /* "none" means the empty vlan set */ if (strcmp(str, "none") == 0) { @@ -829,6 +829,8 @@ parse_vlans(ifbvlan_set_t *set, const char *str) if ((s = strdup(str)) == NULL) return (-1); + /* Keep the original value of s, since strsep() will modify it */ + free_s = s; while ((token = strsep(&s, ",")) != NULL) { unsigned long first, last; @@ -856,11 +858,11 @@ parse_vlans(ifbvlan_set_t *set, const char *str) BRVLAN_SET(set, vlan); } - free(s); + free(free_s); return (0); err: - free(s); + free(free_s); return (-1); } diff --git a/sbin/ipf/libipf/interror.c b/sbin/ipf/libipf/interror.c index 981823ca6bb9..a8dc3be2d5d1 100644 --- a/sbin/ipf/libipf/interror.c +++ b/sbin/ipf/libipf/interror.c @@ -17,7 +17,7 @@ typedef struct { static ipf_error_entry_t *find_error(int); -#define IPF_NUM_ERRORS 477 +#define IPF_NUM_ERRORS sizeof(ipf_errors) / sizeof(ipf_error_entry_t) /* * NO REUSE OF NUMBERS! @@ -25,7 +25,7 @@ static ipf_error_entry_t *find_error(int); * IF YOU WANT TO ADD AN ERROR TO THIS TABLE, _ADD_ A NEW NUMBER. * DO _NOT_ USE AN EMPTY NUMBER OR FILL IN A GAP. */ -static ipf_error_entry_t ipf_errors[IPF_NUM_ERRORS] = { +static ipf_error_entry_t ipf_errors[] = { { 1, "auth table locked/full" }, { 2, "" }, { 3, "copyinptr received bad address" }, @@ -228,6 +228,8 @@ static ipf_error_entry_t ipf_errors[IPF_NUM_ERRORS] = { { 30024, "object size incorrect for hash table" }, { 30025, "hash table size must be at least 1"}, { 30026, "cannot allocate memory for hash table context" }, + { 30027, "hash table larger than maximum allowed" }, + { 30028, "hash table multiplication overflow" }, /* -------------------------------------------------------------------------- */ { 40001, "invalid minor device number for log read" }, { 40002, "read size too small" }, diff --git a/sbin/pfctl/pfctl_radix.c b/sbin/pfctl/pfctl_radix.c index 3b7161420e33..823921953eaf 100644 --- a/sbin/pfctl/pfctl_radix.c +++ b/sbin/pfctl/pfctl_radix.c @@ -182,7 +182,7 @@ pfr_get_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int *size, { int ret; - ret = pfctl_table_get_addrs(dev, tbl, addr, size, flags); + ret = pfctl_table_get_addrs_h(pfh, tbl, addr, size, flags); if (ret) { errno = ret; return (-1); diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile index 95618227a010..34edf6ad455d 100644 --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -1005,6 +1005,7 @@ _ccd.4= ccd.4 .if ${MK_CDDL} != "no" _dtrace_provs= dtrace_audit.4 \ + dtrace_callout_execute.4 \ dtrace_dtrace.4 \ dtrace_fbt.4 \ dtrace_io.4 \ diff --git a/share/man/man4/dtrace_callout_execute.4 b/share/man/man4/dtrace_callout_execute.4 new file mode 100644 index 000000000000..1154ed066b97 --- /dev/null +++ b/share/man/man4/dtrace_callout_execute.4 @@ -0,0 +1,68 @@ +.\" +.\" Copyright (c) 2025 Mateusz Piotrowski <0mp@FreeBSD.org> +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.Dd November 4, 2025 +.Dt DTRACE_CALLOUT_EXECUTE 4 +.Os +.Sh NAME +.Nm dtrace_callout_execute +.Nd a DTrace provider for the callout API +.Sh SYNOPSIS +.Nm callout_execute Ns Cm :kernel::callout_start +.Nm callout_execute Ns Cm :kernel::callout_end +.Sh DESCRIPTION +The +.Nm callout_execute +provider allows for tracing the +.Xr callout 9 +mechanism. +.Pp +The +.Nm callout_execute Ns Cm :kernel::callout_start +probe fires just before a callout. +.Pp +The +.Nm callout_execute Ns Cm :kernel::callout_end +probe fires right after a callout. +.Pp +The only argument to the +.Nm callout_execute +probes, +.Fa args[0] , +is a callout handler +.Ft struct callout * +of the invoked callout. +.Sh EXAMPLES +.Ss Example 1: Graph of Callout Execution Time +The following +.Xr d 7 +script generates a distribution graph of +.Xr callout 9 +execution times: +.Bd -literal -offset 2n +callout_execute:::callout_start +{ + self->cstart = timestamp; +} + +callout_execute:::callout_end +{ + @length = quantize(timestamp - self->cstart); +} +.Ed +.Sh SEE ALSO +.Xr dtrace 1 , +.Xr tracing 7 , +.Xr callout 9 , +.Xr SDT 9 +.Sh AUTHORS +.An -nosplit +The +.Nm callout_execute +provider was written by +.An Robert N. M. Watson Aq Mt rwatson@FreeBSD.org . +.Pp +This manual page was written by +.An Mateusz Piotrowski Aq Mt 0mp@FreeBSD.org . diff --git a/share/man/man9/buf.9 b/share/man/man9/buf.9 index ecd4a1487735..ff9a1d0d46e0 100644 --- a/share/man/man9/buf.9 +++ b/share/man/man9/buf.9 @@ -36,44 +36,70 @@ The kernel implements a KVM abstraction of the buffer cache which allows it to map potentially disparate vm_page's into contiguous KVM for use by (mainly file system) devices and device I/O. This abstraction supports -block sizes from DEV_BSIZE (usually 512) to upwards of several pages or more. +block sizes from +.Dv DEV_BSIZE +(usually 512) to upwards of several pages or more. It also supports a relatively primitive byte-granular valid range and dirty range currently hardcoded for use by NFS. The code implementing the VM Buffer abstraction is mostly concentrated in -.Pa /usr/src/sys/kern/vfs_bio.c . +.Pa sys/kern/vfs_bio.c +in the +.Fx +source tree. .Pp One of the most important things to remember when dealing with buffer pointers -(struct buf) is that the underlying pages are mapped directly from the buffer +.Pq Vt struct buf +is that the underlying pages are mapped directly from the buffer cache. No data copying occurs in the scheme proper, though some file systems such as UFS do have to copy a little when dealing with file fragments. The second most important thing to remember is that due to the underlying page -mapping, the b_data base pointer in a buf is always *page* aligned, not -*block* aligned. -When you have a VM buffer representing some b_offset and -b_size, the actual start of the buffer is (b_data + (b_offset & PAGE_MASK)) -and not just b_data. +mapping, the +.Va b_data +base pointer in a buf is always +.Em page Ns -aligned , +not +.Em block Ns -aligned . +When you have a VM buffer representing some +.Va b_offset +and +.Va b_size , +the actual start of the buffer is +.Ql b_data + (b_offset & PAGE_MASK) +and not just +.Ql b_data . Finally, the VM system's core buffer cache supports -valid and dirty bits (m->valid, m->dirty) for pages in DEV_BSIZE chunks. +valid and dirty bits +.Pq Va m->valid , m->dirty +for pages in +.Dv DEV_BSIZE +chunks. Thus a platform with a hardware page size of 4096 bytes has 8 valid and 8 dirty bits. These bits are generally set and cleared in groups based on the device block size of the device backing the page. Complete page's worth are often -referred to using the VM_PAGE_BITS_ALL bitmask (i.e., 0xFF if the hardware page +referred to using the +.Dv VM_PAGE_BITS_ALL +bitmask (i.e., 0xFF if the hardware page size is 4096). .Pp VM buffers also keep track of a byte-granular dirty range and valid range. This feature is normally only used by the NFS subsystem. I am not sure why it -is used at all, actually, since we have DEV_BSIZE valid/dirty granularity +is used at all, actually, since we have +.Dv DEV_BSIZE +valid/dirty granularity within the VM buffer. -If a buffer dirty operation creates a 'hole', +If a buffer dirty operation creates a +.Dq hole , the dirty range will extend to cover the hole. If a buffer validation -operation creates a 'hole' the byte-granular valid range is left alone and +operation creates a +.Dq hole +the byte-granular valid range is left alone and will not take into account the new extension. Thus the whole byte-granular abstraction is considered a bad hack and it would be nice if we could get rid @@ -81,16 +107,24 @@ of it completely. .Pp A VM buffer is capable of mapping the underlying VM cache pages into KVM in order to allow the kernel to directly manipulate the data associated with -the (vnode,b_offset,b_size). +the +.Pq Va vnode , b_offset , b_size . The kernel typically unmaps VM buffers the moment -they are no longer needed but often keeps the 'struct buf' structure -instantiated and even bp->b_pages array instantiated despite having unmapped +they are no longer needed but often keeps the +.Vt struct buf +structure +instantiated and even +.Va bp->b_pages +array instantiated despite having unmapped them from KVM. If a page making up a VM buffer is about to undergo I/O, the -system typically unmaps it from KVM and replaces the page in the b_pages[] +system typically unmaps it from KVM and replaces the page in the +.Va b_pages[] array with a place-marker called bogus_page. The place-marker forces any kernel -subsystems referencing the associated struct buf to re-lookup the associated +subsystems referencing the associated +.Vt struct buf +to re-lookup the associated page. I believe the place-marker hack is used to allow sophisticated devices such as file system devices to remap underlying pages in order to deal with, @@ -107,18 +141,29 @@ you wind up with pages marked clean that are actually still dirty. If not treated carefully, these pages could be thrown away! Indeed, a number of -serious bugs related to this hack were not fixed until the 2.2.8/3.0 release. -The kernel uses an instantiated VM buffer (i.e., struct buf) to place-mark pages +serious bugs related to this hack were not fixed until the +.Fx 2.2.8 / +.Fx 3.0 +release. +The kernel uses an instantiated VM buffer (i.e., +.Vt struct buf ) +to place-mark pages in this special state. -The buffer is typically flagged B_DELWRI. +The buffer is typically flagged +.Dv B_DELWRI . When a -device no longer needs a buffer it typically flags it as B_RELBUF. +device no longer needs a buffer it typically flags it as +.Dv B_RELBUF . Due to -the underlying pages being marked clean, the B_DELWRI|B_RELBUF combination must +the underlying pages being marked clean, the +.Ql B_DELWRI|B_RELBUF +combination must be interpreted to mean that the buffer is still actually dirty and must be written to its backing store before it can actually be released. In the case -where B_DELWRI is not set, the underlying dirty pages are still properly +where +.Dv B_DELWRI +is not set, the underlying dirty pages are still properly marked as dirty and the buffer can be completely freed without losing that clean/dirty state information. (XXX do we have to check other flags in @@ -128,7 +173,9 @@ The kernel reserves a portion of its KVM space to hold VM Buffer's data maps. Even though this is virtual space (since the buffers are mapped from the buffer cache), we cannot make it arbitrarily large because -instantiated VM Buffers (struct buf's) prevent their underlying pages in the +instantiated VM Buffers +.Pq Vt struct buf Ap s +prevent their underlying pages in the buffer cache from being freed. This can complicate the life of the paging system. diff --git a/share/man/man9/callout.9 b/share/man/man9/callout.9 index 0e59ef8ab2b1..637049ec1ef5 100644 --- a/share/man/man9/callout.9 +++ b/share/man/man9/callout.9 @@ -27,7 +27,7 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd January 22, 2024 +.Dd November 4, 2025 .Dt CALLOUT 9 .Os .Sh NAME @@ -789,6 +789,8 @@ and functions return a value of one if the callout was still pending when it was called, a zero if the callout could not be stopped and a negative one is it was either not running or has already completed. +.Sh SEE ALSO +.Xr dtrace_callout_execute 4 .Sh HISTORY .Fx initially used the long standing diff --git a/share/man/man9/make_dev.9 b/share/man/man9/make_dev.9 index de56f350faa5..9f2c36fb39a4 100644 --- a/share/man/man9/make_dev.9 +++ b/share/man/man9/make_dev.9 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd January 19, 2025 +.Dd November 4, 2025 .Dt MAKE_DEV 9 .Os .Sh NAME @@ -387,14 +387,18 @@ function is the same as: destroy_dev_sched_cb(cdev, NULL, NULL); .Ed .Pp -The +Neither the .Fn d_close -driver method cannot call +driver method, nor a +.Xr devfs_cdevpriv 9 +.Fa dtr +method can .Fn destroy_dev directly. Doing so causes deadlock when .Fn destroy_dev -waits for all threads to leave the driver methods. +waits for all threads to leave the driver methods and finish executing any +per-open destructors. Also, because .Fn destroy_dev sleeps, no non-sleepable locks may be held over the call. diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index ad67510fecf3..5cf1ae2d769c 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -122,33 +122,7 @@ enum x2apic_state { #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) -/* - * The VM name has to fit into the pathname length constraints of devfs, - * governed primarily by SPECNAMELEN. The length is the total number of - * characters in the full path, relative to the mount point and not - * including any leading '/' characters. - * A prefix and a suffix are added to the name specified by the user. - * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters - * longer for future use. - * The suffix is a string that identifies a bootrom image or some similar - * image that is attached to the VM. A separator character gets added to - * the suffix automatically when generating the full path, so it must be - * accounted for, reducing the effective length by 1. - * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 - * bytes for FreeBSD 12. A minimum length is set for safety and supports - * a SPECNAMELEN as small as 32 on old systems. - */ -#define VM_MAX_PREFIXLEN 10 -#define VM_MAX_SUFFIXLEN 15 -#define VM_MIN_NAMELEN 6 -#define VM_MAX_NAMELEN \ - (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) - #ifdef _KERNEL -#include <sys/kassert.h> - -CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); - struct vm; struct vm_exception; struct vm_mem; @@ -232,8 +206,6 @@ struct vmm_ops { extern const struct vmm_ops vmm_ops_intel; extern const struct vmm_ops vmm_ops_amd; -extern u_int vm_maxcpu; /* maximum virtual cpus */ - int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); @@ -383,7 +355,8 @@ vcpu_should_yield(struct vcpu *vcpu) #endif void *vcpu_stats(struct vcpu *vcpu); -void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); +void vcpu_notify_event(struct vcpu *vcpu); +void vcpu_notify_lapic(struct vcpu *vcpu); struct vm_mem *vm_mem(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 441330fd57b8..f1c07a983a4b 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -34,6 +34,8 @@ #include <machine/vmm.h> #include <machine/vmm_snapshot.h> +#include <dev/vmm/vmm_param.h> + struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 842281ab862e..4189c1214b40 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -27,7 +27,6 @@ * SUCH DAMAGE. */ -#include <sys/cdefs.h> #include "opt_bhyve_snapshot.h" #include <sys/param.h> @@ -58,6 +57,7 @@ #include <machine/vmm_instruction_emul.h> #include <machine/vmm_snapshot.h> +#include <dev/vmm/vmm_dev.h> #include <dev/vmm/vmm_ktr.h> #include <dev/vmm/vmm_mem.h> diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index 2cb459fb848f..6feac5dcbbed 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -336,13 +336,6 @@ ppt_teardown_msix(struct pptdev *ppt) } int -ppt_avail_devices(void) -{ - - return (num_pptdevs); -} - -int ppt_assigned_devices(struct vm *vm) { struct pptdev *ppt; diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h index f97c399564d7..9377f34d50e6 100644 --- a/sys/amd64/vmm/io/ppt.h +++ b/sys/amd64/vmm/io/ppt.h @@ -43,12 +43,6 @@ int ppt_assigned_devices(struct vm *vm); bool ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); /* - * Returns the number of devices sequestered by the ppt driver for assignment - * to virtual machines. - */ -int ppt_avail_devices(void); - -/* * The following functions should never be called directly. * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. */ diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 9879dfa164a4..afd5045de574 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -456,7 +456,7 @@ vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) - vcpu_notify_event(vlapic->vcpu, true); + vcpu_notify_lapic(vlapic->vcpu); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vcpu); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index f2bea0d82b5c..2890e990633d 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -31,7 +31,6 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/module.h> #include <sys/sysctl.h> #include <sys/malloc.h> #include <sys/pcpu.h> @@ -189,8 +188,6 @@ struct vm { #define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \ VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) -static int vmm_initialized; - static void vmmops_panic(void); static void @@ -270,11 +267,7 @@ static int trap_wbinvd; SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, "WBINVD triggers a VM-exit"); -u_int vm_maxcpu; -SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, - &vm_maxcpu, 0, "Maximum number of vCPUs"); - -static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); +static void vcpu_notify_event_locked(struct vcpu *vcpu); /* global statistics */ VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); @@ -299,14 +292,6 @@ VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); -/* - * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU - * counts as well as range of vpid values for VT-x and by the capacity - * of cpuset_t masks. The call to new_unrhdr() in vpid_init() in - * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below. - */ -#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) - #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) @@ -402,22 +387,12 @@ vm_exitinfo_cpuset(struct vcpu *vcpu) return (&vcpu->exitinfo_cpuset); } -static int -vmm_init(void) +int +vmm_modinit(void) { if (!vmm_is_hw_supported()) return (ENXIO); - vm_maxcpu = mp_ncpus; - TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); - - if (vm_maxcpu > VM_MAXCPU) { - printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); - vm_maxcpu = VM_MAXCPU; - } - if (vm_maxcpu == 0) - vm_maxcpu = 1; - vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : @@ -431,70 +406,17 @@ vmm_init(void) return (vmmops_modinit(vmm_ipinum)); } -static int -vmm_handler(module_t mod, int what, void *arg) +int +vmm_modcleanup(void) { - int error; - - switch (what) { - case MOD_LOAD: - if (vmm_is_hw_supported()) { - error = vmmdev_init(); - if (error != 0) - break; - error = vmm_init(); - if (error == 0) - vmm_initialized = 1; - else - (void)vmmdev_cleanup(); - } else { - error = ENXIO; - } - break; - case MOD_UNLOAD: - if (vmm_is_hw_supported()) { - error = vmmdev_cleanup(); - if (error == 0) { - vmm_suspend_p = NULL; - vmm_resume_p = NULL; - iommu_cleanup(); - if (vmm_ipinum != IPI_AST) - lapic_ipi_free(vmm_ipinum); - error = vmmops_modcleanup(); - /* - * Something bad happened - prevent new - * VMs from being created - */ - if (error) - vmm_initialized = 0; - } - } else { - error = 0; - } - break; - default: - error = 0; - break; - } - return (error); + vmm_suspend_p = NULL; + vmm_resume_p = NULL; + iommu_cleanup(); + if (vmm_ipinum != IPI_AST) + lapic_ipi_free(vmm_ipinum); + return (vmmops_modcleanup()); } -static moduledata_t vmm_kmod = { - "vmm", - vmm_handler, - NULL -}; - -/* - * vmm initialization has the following dependencies: - * - * - VT-x initialization requires smp_rendezvous() and therefore must happen - * after SMP is fully functional (after SI_SUB_SMP). - * - vmm device initialization requires an initialized devfs. - */ -DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); -MODULE_VERSION(vmm, 1); - static void vm_init(struct vm *vm, bool create) { @@ -573,29 +495,12 @@ vm_unlock_vcpus(struct vm *vm) sx_unlock(&vm->vcpus_init_lock); } -/* - * The default CPU topology is a single thread per package. - */ -u_int cores_per_package = 1; -u_int threads_per_core = 1; - int vm_create(const char *name, struct vm **retvm) { struct vm *vm; int error; - /* - * If vmm.ko could not be successfully initialized then don't attempt - * to create the virtual machine. - */ - if (!vmm_initialized) - return (ENXIO); - - if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == - VM_MAX_NAMELEN + 1) - return (EINVAL); - vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); error = vm_mem_init(&vm->mem, 0, VM_MAXUSER_ADDRESS_LA48); if (error != 0) { @@ -609,8 +514,8 @@ vm_create(const char *name, struct vm **retvm) M_ZERO); vm->sockets = 1; - vm->cores = cores_per_package; /* XXX backwards compatibility */ - vm->threads = threads_per_core; /* XXX backwards compatibility */ + vm->cores = 1; /* XXX backwards compatibility */ + vm->threads = 1; /* XXX backwards compatibility */ vm->maxcpus = vm_maxcpu; vm_init(vm, true); @@ -1028,7 +933,7 @@ vcpu_wait_idle(struct vcpu *vcpu) KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle")); vcpu->reqidle = 1; - vcpu_notify_event_locked(vcpu, false); + vcpu_notify_event_locked(vcpu); VMM_CTR1(vcpu, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); @@ -1509,7 +1414,7 @@ vm_handle_suspend(struct vcpu *vcpu, bool *retu) */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { - vcpu_notify_event(vm_vcpu(vm, i), false); + vcpu_notify_event(vm_vcpu(vm, i)); } } @@ -1583,7 +1488,7 @@ vm_suspend(struct vm *vm, enum vm_suspend_how how) */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) - vcpu_notify_event(vm_vcpu(vm, i), false); + vcpu_notify_event(vm_vcpu(vm, i)); } return (0); @@ -2063,7 +1968,7 @@ vm_inject_nmi(struct vcpu *vcpu) { vcpu->nmi_pending = 1; - vcpu_notify_event(vcpu, false); + vcpu_notify_event(vcpu); return (0); } @@ -2090,7 +1995,7 @@ vm_inject_extint(struct vcpu *vcpu) { vcpu->extint_pending = 1; - vcpu_notify_event(vcpu, false); + vcpu_notify_event(vcpu); return (0); } @@ -2261,14 +2166,14 @@ vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) - vcpu_notify_event(vm_vcpu(vm, i), false); + vcpu_notify_event(vm_vcpu(vm, i)); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); - vcpu_notify_event(vcpu, false); + vcpu_notify_event(vcpu); } return (0); } @@ -2376,7 +2281,7 @@ vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void -vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) +vcpu_notify_event_locked(struct vcpu *vcpu) { int hostcpu; @@ -2384,12 +2289,7 @@ vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { - if (lapic_intr) { - vlapic_post_intr(vcpu->vlapic, hostcpu, - vmm_ipinum); - } else { - ipi_cpu(hostcpu, vmm_ipinum); - } + ipi_cpu(hostcpu, vmm_ipinum); } else { /* * If the 'vcpu' is running on 'curcpu' then it must @@ -2407,10 +2307,21 @@ vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) } void -vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) +vcpu_notify_event(struct vcpu *vcpu) { vcpu_lock(vcpu); - vcpu_notify_event_locked(vcpu, lapic_intr); + vcpu_notify_event_locked(vcpu); + vcpu_unlock(vcpu); +} + +void +vcpu_notify_lapic(struct vcpu *vcpu) +{ + vcpu_lock(vcpu); + if (vcpu->state == VCPU_RUNNING && vcpu->hostcpu != curcpu) + vlapic_post_intr(vcpu->vlapic, vcpu->hostcpu, vmm_ipinum); + else + vcpu_notify_event_locked(vcpu); vcpu_unlock(vcpu); } @@ -2472,7 +2383,7 @@ restart: */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) - vcpu_notify_event(vm_vcpu(vm, i), false); + vcpu_notify_event(vm_vcpu(vm, i)); } return (vm_handle_rendezvous(vcpu)); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 0cae01f172ec..63bdee69bb59 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -61,7 +61,7 @@ lapic_set_intr(struct vcpu *vcpu, int vector, bool level) vlapic = vm_lapic(vcpu); if (vlapic_set_intr_ready(vlapic, vector, level)) - vcpu_notify_event(vcpu, true); + vcpu_notify_lapic(vcpu); return (0); } diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h index 696a69669a2a..e67540eac66d 100644 --- a/sys/arm64/include/vmm.h +++ b/sys/arm64/include/vmm.h @@ -106,27 +106,6 @@ enum vm_reg_name { #define VM_GUEST_BASE_IPA 0x80000000UL /* Guest kernel start ipa */ -/* - * The VM name has to fit into the pathname length constraints of devfs, - * governed primarily by SPECNAMELEN. The length is the total number of - * characters in the full path, relative to the mount point and not - * including any leading '/' characters. - * A prefix and a suffix are added to the name specified by the user. - * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters - * longer for future use. - * The suffix is a string that identifies a bootrom image or some similar - * image that is attached to the VM. A separator character gets added to - * the suffix automatically when generating the full path, so it must be - * accounted for, reducing the effective length by 1. - * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 - * bytes for FreeBSD 12. A minimum length is set for safety and supports - * a SPECNAMELEN as small as 32 on old systems. - */ -#define VM_MAX_PREFIXLEN 10 -#define VM_MAX_SUFFIXLEN 15 -#define VM_MAX_NAMELEN \ - (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) - #ifdef _KERNEL struct vm; struct vm_exception; diff --git a/sys/arm64/include/vmm_dev.h b/sys/arm64/include/vmm_dev.h index 219f1116c728..289ff0fe1fc9 100644 --- a/sys/arm64/include/vmm_dev.h +++ b/sys/arm64/include/vmm_dev.h @@ -31,6 +31,8 @@ #include <machine/vmm.h> +#include <dev/vmm/vmm_param.h> + struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index e7b2b5d8c360..31d2fb3f516b 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -33,7 +33,6 @@ #include <sys/linker.h> #include <sys/lock.h> #include <sys/malloc.h> -#include <sys/module.h> #include <sys/mutex.h> #include <sys/pcpu.h> #include <sys/proc.h> @@ -125,7 +124,7 @@ struct vm { volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ struct vm_mem mem; /* (i) guest memory */ - char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + char name[VM_MAX_NAMELEN + 1]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS]; /* (o) guest MMIO regions */ @@ -138,8 +137,6 @@ struct vm { struct sx vcpus_init_lock; /* (o) */ }; -static bool vmm_initialized = false; - static int vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu); @@ -208,10 +205,6 @@ static const struct vmm_regs vmm_arch_regs_masks = { /* Host registers masked by vmm_arch_regs_masks. */ static struct vmm_regs vmm_arch_regs; -u_int vm_maxcpu; -SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, - &vm_maxcpu, 0, "Maximum number of vCPUs"); - static void vcpu_notify_event_locked(struct vcpu *vcpu); /* global statistics */ @@ -231,12 +224,6 @@ VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception"); VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception"); VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception"); -/* - * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this - * is a safe value for now. - */ -#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) - static int vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks) { @@ -323,20 +310,14 @@ vmm_unsupported_quirk(void) return (0); } -static int -vmm_init(void) +int +vmm_modinit(void) { int error; - vm_maxcpu = mp_ncpus; - TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); - - if (vm_maxcpu > VM_MAXCPU) { - printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); - vm_maxcpu = VM_MAXCPU; - } - if (vm_maxcpu == 0) - vm_maxcpu = 1; + error = vmm_unsupported_quirk(); + if (error != 0) + return (error); error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks); if (error != 0) @@ -345,61 +326,12 @@ vmm_init(void) return (vmmops_modinit(0)); } -static int -vmm_handler(module_t mod, int what, void *arg) +int +vmm_modcleanup(void) { - int error; - - switch (what) { - case MOD_LOAD: - error = vmm_unsupported_quirk(); - if (error != 0) - break; - error = vmmdev_init(); - if (error != 0) - break; - error = vmm_init(); - if (error == 0) - vmm_initialized = true; - else - (void)vmmdev_cleanup(); - break; - case MOD_UNLOAD: - error = vmmdev_cleanup(); - if (error == 0 && vmm_initialized) { - error = vmmops_modcleanup(); - if (error) { - /* - * Something bad happened - prevent new - * VMs from being created - */ - vmm_initialized = false; - } - } - break; - default: - error = 0; - break; - } - return (error); + return (vmmops_modcleanup()); } -static moduledata_t vmm_kmod = { - "vmm", - vmm_handler, - NULL -}; - -/* - * vmm initialization has the following dependencies: - * - * - HYP initialization requires smp_rendezvous() and therefore must happen - * after SMP is fully functional (after SI_SUB_SMP). - * - vmm device initialization requires an initialized devfs. - */ -DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); -MODULE_VERSION(vmm, 1); - static void vm_init(struct vm *vm, bool create) { @@ -441,10 +373,6 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (NULL); - /* Some interrupt controllers may have a CPU limit */ - if (vcpuid >= vgic_max_cpu_count(vm->cookie)) - return (NULL); - vcpu = (struct vcpu *) atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]); if (__predict_true(vcpu != NULL)) @@ -453,6 +381,12 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) sx_xlock(&vm->vcpus_init_lock); vcpu = vm->vcpu[vcpuid]; if (vcpu == NULL && !vm->dying) { + /* Some interrupt controllers may have a CPU limit */ + if (vcpuid >= vgic_max_cpu_count(vm->cookie)) { + sx_xunlock(&vm->vcpus_init_lock); + return (NULL); + } + vcpu = vcpu_alloc(vm, vcpuid); vcpu_init(vcpu); @@ -485,16 +419,6 @@ vm_create(const char *name, struct vm **retvm) struct vm *vm; int error; - /* - * If vmm.ko could not be successfully initialized then don't attempt - * to create the virtual machine. - */ - if (!vmm_initialized) - return (ENXIO); - - if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) - return (EINVAL); - vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); error = vm_mem_init(&vm->mem, 0, 1ul << 39); if (error != 0) { diff --git a/sys/dev/mmc/mmc_fdt_helpers.c b/sys/dev/mmc/mmc_fdt_helpers.c index aed85dab55f4..980785464a00 100644 --- a/sys/dev/mmc/mmc_fdt_helpers.c +++ b/sys/dev/mmc/mmc_fdt_helpers.c @@ -160,6 +160,17 @@ cd_setup(struct mmc_helper *helper, phandle_t node) } /* + * If the device has no card-detection, treat it as non-removable. + * This could be improved by polling for detection. + */ + if (helper->props & MMC_PROP_BROKEN_CD) { + helper->cd_disabled = true; + if (bootverbose) + device_printf(dev, "Broken card-detect\n"); + return; + } + + /* * If there is no cd-gpios property, then presumably the hardware * PRESENT_STATE register and interrupts will reflect card state * properly, and there's nothing more for us to do. Our get_present() diff --git a/sys/dev/mmc/mmcsd.c b/sys/dev/mmc/mmcsd.c index 5b9cb93c7b31..f2965048b285 100644 --- a/sys/dev/mmc/mmcsd.c +++ b/sys/dev/mmc/mmcsd.c @@ -1422,7 +1422,7 @@ mmcsd_task(void *arg) struct mmcsd_softc *sc; struct bio *bp; device_t dev, mmcbus; - int bio_error, err, sz; + int abio_error, err, sz; part = arg; sc = part->sc; @@ -1430,7 +1430,7 @@ mmcsd_task(void *arg) mmcbus = sc->mmcbus; while (1) { - bio_error = 0; + abio_error = 0; MMCSD_DISK_LOCK(part); do { if (part->running == 0) @@ -1475,11 +1475,11 @@ mmcsd_task(void *arg) } else if (bp->bio_cmd == BIO_DELETE) block = mmcsd_delete(part, bp); else - bio_error = EOPNOTSUPP; + abio_error = EOPNOTSUPP; release: MMCBUS_RELEASE_BUS(mmcbus, dev); if (block < end) { - bp->bio_error = (bio_error == 0) ? EIO : bio_error; + bp->bio_error = (abio_error == 0) ? EIO : abio_error; bp->bio_resid = (end - block) * sz; bp->bio_flags |= BIO_ERROR; } else diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c index f4a588373c98..17684cc14ba2 100644 --- a/sys/dev/nvme/nvme_ns.c +++ b/sys/dev/nvme/nvme_ns.c @@ -45,7 +45,7 @@ #include "nvme_private.h" #include "nvme_linux.h" -static void nvme_bio_child_inbed(struct bio *parent, int bio_error); +static void nvme_bio_child_inbed(struct bio *parent, int abio_error); static void nvme_bio_child_done(void *arg, const struct nvme_completion *cpl); static uint32_t nvme_get_num_segments(uint64_t addr, uint64_t size, @@ -275,14 +275,14 @@ nvme_ns_bio_done(void *arg, const struct nvme_completion *status) } static void -nvme_bio_child_inbed(struct bio *parent, int bio_error) +nvme_bio_child_inbed(struct bio *parent, int abio_error) { struct nvme_completion parent_cpl; int children, inbed; - if (bio_error != 0) { + if (abio_error != 0) { parent->bio_flags |= BIO_ERROR; - parent->bio_error = bio_error; + parent->bio_error = abio_error; } /* @@ -309,12 +309,12 @@ nvme_bio_child_done(void *arg, const struct nvme_completion *cpl) { struct bio *child = arg; struct bio *parent; - int bio_error; + int abio_error; parent = child->bio_parent; g_destroy_bio(child); - bio_error = nvme_completion_is_error(cpl) ? EIO : 0; - nvme_bio_child_inbed(parent, bio_error); + abio_error = nvme_completion_is_error(cpl) ? EIO : 0; + nvme_bio_child_inbed(parent, abio_error); } static uint32_t diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c index cc7a233d60ee..41e01549c8b2 100644 --- a/sys/dev/virtio/virtqueue.c +++ b/sys/dev/virtio/virtqueue.c @@ -580,7 +580,8 @@ virtqueue_dequeue(struct virtqueue *vq, uint32_t *len) void *cookie; uint16_t used_idx, desc_idx; - if (vq->vq_used_cons_idx == vq_htog16(vq, vq->vq_ring.used->idx)) + if (vq->vq_used_cons_idx == + vq_htog16(vq, atomic_load_16(&vq->vq_ring.used->idx))) return (NULL); used_idx = vq->vq_used_cons_idx++ & (vq->vq_nentries - 1); diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index ebbceb25b69e..d6543bf6534e 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -14,9 +14,11 @@ #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mman.h> +#include <sys/module.h> #include <sys/priv.h> #include <sys/proc.h> #include <sys/queue.h> +#include <sys/smp.h> #include <sys/sx.h> #include <sys/sysctl.h> #include <sys/ucred.h> @@ -78,6 +80,8 @@ struct vmmdev_softc { int flags; }; +static bool vmm_initialized = false; + static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; @@ -88,6 +92,10 @@ static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); +u_int vm_maxcpu; +SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &vm_maxcpu, 0, "Maximum number of vCPUs"); + static void devmem_destroy(void *arg); static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem); @@ -619,20 +627,16 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, } error = domainset_populate(&domain, mask, mseg->ds_policy, mseg->ds_mask_size); - if (error) { - free(mask, M_VMMDEV); + free(mask, M_VMMDEV); + if (error) break; - } domainset = domainset_create(&domain); if (domainset == NULL) { error = EINVAL; - free(mask, M_VMMDEV); break; } - free(mask, M_VMMDEV); } error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset); - break; } case VM_GET_MEMSEG: @@ -985,6 +989,9 @@ vmmdev_create(const char *name, struct ucred *cred) struct vm *vm; int error; + if (name == NULL || strlen(name) > VM_MAX_NAMELEN) + return (EINVAL); + sx_xlock(&vmmdev_mtx); sc = vmmdev_lookup(name, cred); if (sc != NULL) { @@ -1025,6 +1032,9 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) char *buf; int error, buflen; + if (!vmm_initialized) + return (ENXIO); + error = vmm_priv_check(req->td->td_ucred); if (error != 0) return (error); @@ -1110,7 +1120,7 @@ static struct cdevsw vmmctlsw = { .d_ioctl = vmmctl_ioctl, }; -int +static int vmmdev_init(void) { int error; @@ -1126,7 +1136,7 @@ vmmdev_init(void) return (error); } -int +static int vmmdev_cleanup(void) { sx_xlock(&vmmdev_mtx); @@ -1144,6 +1154,71 @@ vmmdev_cleanup(void) } static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + error = vmmdev_init(); + if (error != 0) + break; + + vm_maxcpu = mp_ncpus; + TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); + if (vm_maxcpu > VM_MAXCPU) { + printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); + vm_maxcpu = VM_MAXCPU; + } + if (vm_maxcpu == 0) + vm_maxcpu = 1; + + error = vmm_modinit(); + if (error == 0) + vmm_initialized = true; + else { + error = vmmdev_cleanup(); + KASSERT(error == 0, + ("%s: vmmdev_cleanup failed: %d", __func__, error)); + } + break; + case MOD_UNLOAD: + error = vmmdev_cleanup(); + if (error == 0 && vmm_initialized) { + error = vmm_modcleanup(); + if (error) { + /* + * Something bad happened - prevent new + * VMs from being created + */ + vmm_initialized = false; + } + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - Initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). + * - vmm device initialization requires an initialized devfs. + */ +DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { diff --git a/sys/dev/vmm/vmm_dev.h b/sys/dev/vmm/vmm_dev.h index 2881a7063565..f14176c8afad 100644 --- a/sys/dev/vmm/vmm_dev.h +++ b/sys/dev/vmm/vmm_dev.h @@ -11,15 +11,19 @@ #include <sys/types.h> #include <sys/ioccom.h> + #include <machine/vmm_dev.h> +#include <dev/vmm/vmm_param.h> + #ifdef _KERNEL struct thread; struct vm; struct vcpu; -int vmmdev_init(void); -int vmmdev_cleanup(void); +int vmm_modinit(void); +int vmm_modcleanup(void); + int vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data, int fflag, struct thread *td); @@ -53,6 +57,17 @@ struct vmmdev_ioctl { extern const struct vmmdev_ioctl vmmdev_machdep_ioctls[]; extern const size_t vmmdev_machdep_ioctl_count; +/* + * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU counts as + * well as range of vpid values for VT-x on amd64 and by the capacity of + * cpuset_t masks. The call to new_unrhdr() in vpid_init() in vmx.c requires + * 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below. + */ +#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) + +/* Maximum number of vCPUs in a single VM. */ +extern u_int vm_maxcpu; + #endif /* _KERNEL */ struct vmmctl_vm_create { diff --git a/sys/dev/vmm/vmm_param.h b/sys/dev/vmm/vmm_param.h new file mode 100644 index 000000000000..a5040eb0f58c --- /dev/null +++ b/sys/dev/vmm/vmm_param.h @@ -0,0 +1,33 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + */ + +#ifndef _DEV_VMM_PARAM_H_ +#define _DEV_VMM_PARAM_H_ + +/* + * The VM name has to fit into the pathname length constraints of devfs, + * governed primarily by SPECNAMELEN. The length is the total number of + * characters in the full path, relative to the mount point and not + * including any leading '/' characters. + * A prefix and a suffix are added to the name specified by the user. + * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters + * longer for future use. + * The suffix is a string that identifies a bootrom image or some similar + * image that is attached to the VM. A separator character gets added to + * the suffix automatically when generating the full path, so it must be + * accounted for, reducing the effective length by 1. + * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 + * bytes for FreeBSD 12. A minimum length is set for safety and supports + * a SPECNAMELEN as small as 32 on old systems. + */ +#define VM_MAX_PREFIXLEN 10 +#define VM_MAX_SUFFIXLEN 15 +#define VM_MIN_NAMELEN 6 +#define VM_MAX_NAMELEN \ + (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) + +#endif /* !_DEV_VMM_PARAM_H_ */ diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 6c79e646d2f3..ef5aee5de34c 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -625,7 +625,7 @@ fuse_vnop_allocate(struct vop_allocate_args *ap) return (EROFS); if (fsess_not_impl(mp, FUSE_FALLOCATE)) - return (EXTERROR(EINVAL, "This server does not implement " + return (EXTERROR(EOPNOTSUPP, "This server does not implement " "FUSE_FALLOCATE")); io.uio_offset = *offset; @@ -656,14 +656,14 @@ fuse_vnop_allocate(struct vop_allocate_args *ap) if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FALLOCATE); - err = EXTERROR(EINVAL, "This server does not implement " + err = EXTERROR(EOPNOTSUPP, "This server does not implement " "FUSE_ALLOCATE"); } else if (err == EOPNOTSUPP) { /* * The file system server does not support FUSE_FALLOCATE with * the supplied mode for this particular file. */ - err = EXTERROR(EINVAL, "This file can't be pre-allocated"); + err = EXTERROR(EOPNOTSUPP, "This file can't be pre-allocated"); } else if (!err) { *offset += *len; *len = 0; diff --git a/sys/fs/nfsclient/nfs_clvnops.c b/sys/fs/nfsclient/nfs_clvnops.c index 795a8d106051..193d8b6cd5eb 100644 --- a/sys/fs/nfsclient/nfs_clvnops.c +++ b/sys/fs/nfsclient/nfs_clvnops.c @@ -3896,11 +3896,15 @@ nfs_allocate(struct vop_allocate_args *ap) mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOALLOCATE; mtx_unlock(&nmp->nm_mtx); - error = EINVAL; + error = EOPNOTSUPP; } } else { + /* + * Pre-v4.2 NFS server that doesn't support it, or a newer + * NFS server that has indicated that it doesn't support it. + */ mtx_unlock(&nmp->nm_mtx); - error = EINVAL; + error = EOPNOTSUPP; } if (attrflag != 0) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c index db0bc77a752f..a723d06334a0 100644 --- a/sys/geom/geom_dev.c +++ b/sys/geom/geom_dev.c @@ -734,6 +734,10 @@ g_dev_done(struct bio *bp2) g_trace(G_T_BIO, "g_dev_done(%p) had error %d", bp2, bp2->bio_error); bp->bio_flags |= BIO_ERROR; + if ((bp2->bio_flags & BIO_EXTERR) != 0) { + bp->bio_flags |= BIO_EXTERR; + bp->bio_exterr = bp2->bio_exterr; + } } else { if (bp->bio_cmd == BIO_READ) KNOTE_UNLOCKED(&sc->sc_selinfo.si_note, NOTE_READ); diff --git a/sys/geom/geom_disk.c b/sys/geom/geom_disk.c index 9dbf00371dba..b267130d1e0c 100644 --- a/sys/geom/geom_disk.c +++ b/sys/geom/geom_disk.c @@ -235,8 +235,14 @@ g_disk_done(struct bio *bp) bp2 = bp->bio_parent; binuptime(&now); mtx_lock(&sc->done_mtx); - if (bp2->bio_error == 0) - bp2->bio_error = bp->bio_error; + if (bp2->bio_error == 0) { + if ((bp->bio_flags & BIO_EXTERR) != 0) { + bp2->bio_flags |= BIO_EXTERR; + bp2->bio_exterr = bp->bio_exterr; + } else { + bp2->bio_error = bp->bio_error; + } + } bp2->bio_completed += bp->bio_length - bp->bio_resid; if (bp->bio_cmd == BIO_READ) diff --git a/sys/geom/geom_subr.c b/sys/geom/geom_subr.c index 2a6ce1ab6486..c5dce730da79 100644 --- a/sys/geom/geom_subr.c +++ b/sys/geom/geom_subr.c @@ -38,9 +38,11 @@ #include <sys/cdefs.h> #include "opt_ddb.h" +#define EXTERR_CATEGORY EXTERR_CAT_GEOM #include <sys/param.h> #include <sys/systm.h> #include <sys/devicestat.h> +#include <sys/exterrvar.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/bio.h> @@ -1162,8 +1164,14 @@ g_std_done(struct bio *bp) struct bio *bp2; bp2 = bp->bio_parent; - if (bp2->bio_error == 0) - bp2->bio_error = bp->bio_error; + if (bp2->bio_error == 0) { + if ((bp->bio_flags & BIO_EXTERR) != 0) { + bp2->bio_flags |= BIO_EXTERR; + bp2->bio_exterr = bp->bio_exterr; + } else { + bp2->bio_error = bp->bio_error; + } + } bp2->bio_completed += bp->bio_completed; g_destroy_bio(bp); bp2->bio_inbed++; @@ -1668,6 +1676,8 @@ DB_SHOW_COMMAND(bio, db_show_bio) db_printf(" caller2: %p\n", bp->bio_caller2); db_printf(" bio_from: %p\n", bp->bio_from); db_printf(" bio_to: %p\n", bp->bio_to); + if ((bp->bio_flags & BIO_EXTERR) != 0) + exterr_db_print(&bp->bio_exterr); #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) db_printf(" bio_track_bp: %p\n", bp->bio_track_bp); diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c index 9b5e5a84191f..122e2f6a02ec 100644 --- a/sys/geom/geom_vfs.c +++ b/sys/geom/geom_vfs.c @@ -26,9 +26,11 @@ * SUCH DAMAGE. */ +#define EXTERR_CATEGORY EXTERR_CAT_GEOMVFS #include <sys/param.h> #include <sys/systm.h> #include <sys/bio.h> +#include <sys/exterrvar.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -156,10 +158,13 @@ g_vfs_done(struct bio *bip) " suppressing further ENXIO"); } } - bp->b_error = bip->bio_error; bp->b_ioflags = bip->bio_flags; if (bip->bio_error) bp->b_ioflags |= BIO_ERROR; + if ((bp->b_ioflags & BIO_EXTERR) != 0) + bp->b_exterr = bip->bio_exterr; + else + bp->b_error = bip->bio_error; bp->b_resid = bp->b_bcount - bip->bio_completed; g_destroy_bio(bip); @@ -195,6 +200,8 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp) mtx_unlock(&sc->sc_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; + EXTERROR_KE(&bp->b_exterr, ENXIO, + "orphaned or enxio active"); bufdone(bp); return; } diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 7d666da9f88b..b84f675d1dcb 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -2345,3 +2345,35 @@ exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1, } return (eerror); } + +int +exterr_set_from(const struct kexterr *ke) +{ + struct thread *td; + + td = curthread; + if ((td->td_pflags2 & TDP2_UEXTERR) != 0) { + td->td_pflags2 |= TDP2_EXTERR; + td->td_kexterr = *ke; + } + return (td->td_kexterr.error); +} + +void +exterr_clear(struct kexterr *ke) +{ + memset(ke, 0, sizeof(*ke)); +} + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +void +exterr_db_print(struct kexterr *ke) +{ + db_printf("errno %d cat %d msg %s p1 %#jx p2 %#jx line %d\n", + ke->error, ke->cat, ke->msg == NULL ? "<none>" : ke->msg, + (uintmax_t)ke->p1, (uintmax_t)ke->p2, ke->src_line); +} +#endif diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 60916a9fbd32..02d4b8426757 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -2487,7 +2487,7 @@ aio_biowakeup(struct bio *bp) long bcount = bp->bio_bcount; long resid = bp->bio_resid; int opcode, nblks; - int bio_error = bp->bio_error; + int abio_error = bp->bio_error; uint16_t flags = bp->bio_flags; opcode = job->uaiocb.aio_lio_opcode; @@ -2503,16 +2503,16 @@ aio_biowakeup(struct bio *bp) * error of whichever failed bio completed last. */ if (flags & BIO_ERROR) - atomic_store_int(&job->error, bio_error); + atomic_store_int(&job->error, abio_error); if (opcode & LIO_WRITE) atomic_add_int(&job->outblock, nblks); else atomic_add_int(&job->inblock, nblks); if (refcount_release(&job->nbio)) { - bio_error = atomic_load_int(&job->error); - if (bio_error != 0) - aio_complete(job, -1, bio_error); + abio_error = atomic_load_int(&job->error); + if (abio_error != 0) + aio_complete(job, -1, abio_error); else aio_complete(job, atomic_load_long(&job->nbytes), 0); } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 19c39e42bafa..880cc6b99951 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -44,6 +44,7 @@ * see man buf(9) for more info. */ +#define EXTERR_CATEGORY EXTERR_CAT_VFSBIO #include <sys/param.h> #include <sys/systm.h> #include <sys/asan.h> @@ -55,6 +56,7 @@ #include <sys/counter.h> #include <sys/devicestat.h> #include <sys/eventhandler.h> +#include <sys/exterrvar.h> #include <sys/fail.h> #include <sys/ktr.h> #include <sys/limits.h> @@ -1775,7 +1777,6 @@ buf_alloc(struct bufdomain *bd) bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; - bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; @@ -1785,6 +1786,7 @@ buf_alloc(struct bufdomain *bd) bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; + exterr_clear(&bp->b_exterr); LIST_INIT(&bp->b_dep); return (bp); @@ -2276,7 +2278,7 @@ breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, } if ((flags & GB_CVTENXIO) != 0) bp->b_xflags |= BX_CVTENXIO; - bp->b_ioflags &= ~BIO_ERROR; + bp->b_ioflags &= ~(BIO_ERROR | BIO_EXTERR); if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); @@ -2353,7 +2355,7 @@ bufwrite(struct buf *bp) bundirty(bp); bp->b_flags &= ~B_DONE; - bp->b_ioflags &= ~BIO_ERROR; + bp->b_ioflags &= ~(BIO_ERROR | BIO_EXTERR); bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; @@ -4520,8 +4522,11 @@ biowait(struct bio *bp, const char *wmesg) while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wmesg, 0); mtx_unlock(mtxp); - if (bp->bio_error != 0) + if (bp->bio_error != 0) { + if ((bp->bio_flags & BIO_EXTERR) != 0) + return (exterr_set_from(&bp->bio_exterr)); return (bp->bio_error); + } if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); @@ -4568,6 +4573,8 @@ bufwait(struct buf *bp) return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { + if ((bp->b_ioflags & BIO_EXTERR) != 0) + exterr_set_from(&bp->b_exterr); return (bp->b_error ? bp->b_error : EIO); } else { return (0); @@ -5522,6 +5529,8 @@ DB_SHOW_COMMAND(buffer, db_show_buffer) db_printf("\n"); } BUF_LOCKPRINTINFO(bp); + if ((bp->b_ioflags & BIO_EXTERR) != 0) + exterr_db_print(&bp->b_exterr); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index 56bb90cce9bc..0dc3a58f6ae6 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -138,6 +138,7 @@ struct tuntap_softc { #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ + struct epoch_context tun_epoch_ctx; struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ @@ -630,6 +631,18 @@ out: CURVNET_RESTORE(); } +static void +tunfree(struct epoch_context *ctx) +{ + struct tuntap_softc *tp; + + tp = __containerof(ctx, struct tuntap_softc, tun_epoch_ctx); + + /* Any remaining resources that would be needed by a concurrent open. */ + mtx_destroy(&tp->tun_mtx); + free(tp, M_TUN); +} + static int tun_destroy(struct tuntap_softc *tp, bool may_intr) { @@ -649,7 +662,7 @@ tun_destroy(struct tuntap_softc *tp, bool may_intr) error = cv_wait_sig(&tp->tun_cv, &tp->tun_mtx); else cv_wait(&tp->tun_cv, &tp->tun_mtx); - if (error != 0) { + if (error != 0 && tp->tun_busy != 0) { tp->tun_flags &= ~TUN_DYING; TUN_UNLOCK(tp); return (error); @@ -663,8 +676,18 @@ tun_destroy(struct tuntap_softc *tp, bool may_intr) TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); - /* destroy_dev will take care of any alias. */ - destroy_dev(tp->tun_dev); + /* + * destroy_dev will take care of any alias. For transient tunnels, + * we're being called from close(2) so we can't destroy it ourselves + * without deadlocking, but we already know that we can cleanup + * everything else and just continue to prevent it from being reopened. + */ + if ((tp->tun_flags & TUN_TRANSIENT) != 0) { + atomic_store_ptr(&tp->tun_dev->si_drv1, tp->tun_dev); + destroy_dev_sched(tp->tun_dev); + } else { + destroy_dev(tp->tun_dev); + } seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); @@ -679,9 +702,8 @@ tun_destroy(struct tuntap_softc *tp, bool may_intr) sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); - mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); - free(tp, M_TUN); + NET_EPOCH_CALL(tunfree, &tp->tun_epoch_ctx); CURVNET_RESTORE(); return (0); @@ -742,9 +764,11 @@ tun_uninit(const void *unused __unused) mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; + destroy_dev_drain(&drv->cdevsw); delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } + NET_EPOCH_DRAIN_CALLBACKS(); mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); @@ -1104,19 +1128,43 @@ out: static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { + struct epoch_tracker et; struct ifnet *ifp; struct tuntap_softc *tp; + void *p; int error __diagused, tunflags; + /* + * Transient tunnels do deferred destroy of the tun device but want + * to immediately cleanup state, so they clobber si_drv1 to avoid a + * use-after-free in case someone does happen to open it in the interim. + * We avoid using NULL to be able to distinguish from an uninitialized + * cdev. + * + * We use the net epoch here to let a concurrent tun_destroy() schedule + * freeing our tuntap_softc, in case we entered here and loaded si_drv1 + * before it was swapped out. If we managed to load this while it was + * still a softc, then the concurrent tun_destroy() hasn't yet scheduled + * it to be free- that will take place sometime after the epoch we just + * entered, so we can safely use it. + */ + NET_EPOCH_ENTER(et); + p = atomic_load_ptr(&dev->si_drv1); + if (p == dev) { + NET_EPOCH_EXIT(et); + return (ENXIO); + } + tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); return (error); /* Shouldn't happen */ } - tp = dev->si_drv1; + tp = p; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); @@ -1124,14 +1172,17 @@ tunopen(struct cdev *dev, int flag, int mode, struct thread *td) if ((tp->tun_flags & TUN_INITED) == 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); return (ENXIO); } if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); return (EBUSY); } + NET_EPOCH_EXIT(et); error = tun_busy_locked(tp); KASSERT(error == 0, ("Must be able to busy an unopen tunnel")); ifp = TUN2IFP(tp); diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c index 59a107881676..3583fc50f51b 100644 --- a/sys/netipsec/ipsec_offload.c +++ b/sys/netipsec/ipsec_offload.c @@ -289,19 +289,18 @@ ipsec_accel_sa_newkey_cb(if_t ifp, void *arg) be32toh(tq->sav->spi), tq->sav->flags, tq->sav->seq); priv = NULL; drv_spi = alloc_unr(drv_spi_unr); - if (tq->sav->accel_ifname != NULL && - strcmp(tq->sav->accel_ifname, if_name(ifp)) != 0) { - error = ipsec_accel_handle_sav(tq->sav, - ifp, drv_spi, priv, IFP_HS_REJECTED, NULL); - goto out; - } if (drv_spi == -1) { - /* XXXKIB */ dprintf("ipsec_accel_sa_install_newkey: cannot alloc " "drv_spi if %s spi %#x\n", if_name(ifp), be32toh(tq->sav->spi)); return (0); } + if (tq->sav->accel_ifname != NULL && + strcmp(tq->sav->accel_ifname, if_name(ifp)) != 0) { + error = ipsec_accel_handle_sav(tq->sav, + ifp, drv_spi, priv, IFP_HS_REJECTED, NULL); + goto out; + } error = ifp->if_ipsec_accel_m->if_sa_newkey(ifp, tq->sav, drv_spi, &priv); if (error != 0) { diff --git a/sys/netpfil/ipfilter/netinet/ip_htable.c b/sys/netpfil/ipfilter/netinet/ip_htable.c index 3f765cfab947..5f5c04732d69 100644 --- a/sys/netpfil/ipfilter/netinet/ip_htable.c +++ b/sys/netpfil/ipfilter/netinet/ip_htable.c @@ -96,6 +96,8 @@ typedef struct ipf_htable_softc_s { u_long ipf_nhtnodes[LOOKUP_POOL_SZ]; iphtable_t *ipf_htables[LOOKUP_POOL_SZ]; iphtent_t *ipf_node_explist; + ipftuneable_t *ipf_htable_tune; + u_int ipf_htable_size_max; } ipf_htable_softc_t; ipf_lookup_t ipf_htable_backend = { @@ -122,6 +124,18 @@ ipf_lookup_t ipf_htable_backend = { }; +static ipftuneable_t ipf_htable_tuneables[] = { + { { (void *)offsetof(ipf_htable_softc_t, ipf_htable_size_max) }, + "htable_size_max", 1, 0x7fffffff, + stsizeof(ipf_htable_softc_t, ipf_htable_size_max), + 0, NULL, NULL }, + { { NULL }, + NULL, 0, 0, + 0, + 0, NULL, NULL } +}; + + /* ------------------------------------------------------------------------ */ /* Function: ipf_htable_soft_create */ /* Returns: void * - NULL = failure, else pointer to local context */ @@ -142,6 +156,18 @@ ipf_htable_soft_create(ipf_main_softc_t *softc) bzero((char *)softh, sizeof(*softh)); + softh->ipf_htable_tune = ipf_tune_array_copy(softh, + sizeof(ipf_htable_tuneables), + ipf_htable_tuneables); + if (softh->ipf_htable_tune == NULL) { + ipf_htable_soft_destroy(softc, softh); + return (NULL); + } + if (ipf_tune_array_link(softc, softh->ipf_htable_tune) == -1) { + ipf_htable_soft_destroy(softc, softh); + return (NULL); + } + return (softh); } @@ -160,6 +186,12 @@ ipf_htable_soft_destroy(ipf_main_softc_t *softc, void *arg) { ipf_htable_softc_t *softh = arg; + if (softh->ipf_htable_tune != NULL) { + ipf_tune_array_unlink(softc, softh->ipf_htable_tune); + KFREES(softh->ipf_htable_tune, sizeof(ipf_htable_tuneables)); + softh->ipf_htable_tune = NULL; + } + KFREE(softh); } @@ -179,6 +211,8 @@ ipf_htable_soft_init(ipf_main_softc_t *softc, void *arg) bzero((char *)softh, sizeof(*softh)); + softh->ipf_htable_size_max = IPHTABLE_MAX_SIZE; + return (0); } @@ -327,6 +361,15 @@ ipf_htable_create(ipf_main_softc_t *softc, void *arg, iplookupop_t *op) iph->iph_name[sizeof(iph->iph_name) - 1] = '\0'; } + if ((iph->iph_size == 0) || + (iph->iph_size > softh->ipf_htable_size_max)) { + IPFERROR(30027); + return (EINVAL); + } + if (iph->iph_size > ( SIZE_MAX / sizeof(*iph->iph_table))) { + IPFERROR(30028); + return (EINVAL); + } KMALLOCS(iph->iph_table, iphtent_t **, iph->iph_size * sizeof(*iph->iph_table)); if (iph->iph_table == NULL) { diff --git a/sys/netpfil/ipfilter/netinet/ip_htable.h b/sys/netpfil/ipfilter/netinet/ip_htable.h index 55c289e57ff6..3a8782ccd4b2 100644 --- a/sys/netpfil/ipfilter/netinet/ip_htable.h +++ b/sys/netpfil/ipfilter/netinet/ip_htable.h @@ -55,6 +55,8 @@ typedef struct iphtable_s { char iph_name[FR_GROUPLEN]; /* hash table number */ } iphtable_t; +#define IPHTABLE_MAX_SIZE 1024 + /* iph_type */ #define IPHASH_LOOKUP 0 #define IPHASH_GROUPMAP 1 diff --git a/sys/netpfil/pf/pf_nl.c b/sys/netpfil/pf/pf_nl.c index 21d4db1b8478..993981a9c0de 100644 --- a/sys/netpfil/pf/pf_nl.c +++ b/sys/netpfil/pf/pf_nl.c @@ -2246,6 +2246,87 @@ pf_handle_table_set_addrs(struct nlmsghdr *hdr, struct nl_pstate *npt) return (error); } +static int +nlattr_add_pfr_addr(struct nl_writer *nw, int attr, const struct pfr_addr *a) +{ + int off = nlattr_add_nested(nw, attr); + if (off == 0) + return (false); + + nlattr_add_u32(nw, PFR_A_AF, a->pfra_af); + nlattr_add_u8(nw, PFR_A_NET, a->pfra_net); + nlattr_add_bool(nw, PFR_A_NOT, a->pfra_not); + nlattr_add_in6_addr(nw, PFR_A_ADDR, &a->pfra_u._pfra_ip6addr); + + nlattr_set_len(nw, off); + + return (true); +} + +static int +pf_handle_table_get_addrs(struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + struct pfioc_table attrs = { 0 }; + struct pfr_addr *pfras; + struct nl_writer *nw = npt->nw; + struct genlmsghdr *ghdr_new; + int size = 0; + int error; + + PF_RULES_RLOCK_TRACKER; + + error = nl_parse_nlmsg(hdr, &table_addr_parser, npt, &attrs); + if (error != 0) + return (error); + + PF_RULES_RLOCK(); + /* Get required size. */ + error = pfr_get_addrs(&attrs.pfrio_table, NULL, + &size, attrs.pfrio_flags | PFR_FLAG_USERIOCTL); + if (error != 0) { + PF_RULES_RUNLOCK(); + return (error); + } + pfras = mallocarray(size, sizeof(struct pfr_addr), M_PF, + M_NOWAIT | M_ZERO); + if (pfras == NULL) { + PF_RULES_RUNLOCK(); + return (ENOMEM); + } + /* Now get the addresses. */ + error = pfr_get_addrs(&attrs.pfrio_table, pfras, + &size, attrs.pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error != 0) + goto out; + + for (int i = 0; i < size; i++) { + if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { + nlmsg_abort(nw); + error = ENOMEM; + goto out; + } + ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); + ghdr_new->cmd = PFNL_CMD_TABLE_GET_ADDR; + ghdr_new->version = 0; + ghdr_new->reserved = 0; + + if (i == 0) + nlattr_add_u32(nw, PF_TA_ADDR_COUNT, size); + + nlattr_add_pfr_addr(nw, PF_TA_ADDR, &pfras[i]); + if (!nlmsg_end(nw)) { + nlmsg_abort(nw); + error = ENOMEM; + goto out; + } + } + +out: + free(pfras, M_PF); + return (error); +} + static const struct nlhdr_parser *all_parsers[] = { &state_parser, &addrule_parser, @@ -2504,6 +2585,13 @@ static const struct genl_cmd pf_cmds[] = { .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, + { + .cmd_num = PFNL_CMD_TABLE_GET_ADDR, + .cmd_name = "TABLE_GET_ADDRS", + .cmd_cb = pf_handle_table_get_addrs, + .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, + .cmd_priv = PRIV_NETINET_PF, + }, }; void diff --git a/sys/netpfil/pf/pf_nl.h b/sys/netpfil/pf/pf_nl.h index d1538ab4ff5b..e1eb3e628df5 100644 --- a/sys/netpfil/pf/pf_nl.h +++ b/sys/netpfil/pf/pf_nl.h @@ -70,6 +70,7 @@ enum { PFNL_CMD_TABLE_ADD_ADDR = 32, PFNL_CMD_TABLE_DEL_ADDR = 33, PFNL_CMD_TABLE_SET_ADDR = 34, + PFNL_CMD_TABLE_GET_ADDR = 35, __PFNL_CMD_MAX, }; #define PFNL_CMD_MAX (__PFNL_CMD_MAX -1) @@ -485,6 +486,7 @@ enum pf_table_addrs_t { PF_TA_NBR_ADDED = 4, /* u32 */ PF_TA_NBR_DELETED = 5, /* u32 */ PF_TA_NBR_CHANGED = 6, /* u32 */ + PF_TA_ADDR_COUNT = 7, /* u32 */ }; #ifdef _KERNEL diff --git a/sys/riscv/include/vmm.h b/sys/riscv/include/vmm.h index e227dd825966..361140834805 100644 --- a/sys/riscv/include/vmm.h +++ b/sys/riscv/include/vmm.h @@ -103,9 +103,6 @@ enum vm_reg_name { #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) -#define VM_MAX_NAMELEN 32 -#define VM_MAX_SUFFIXLEN 15 - #ifdef _KERNEL struct vm; diff --git a/sys/riscv/include/vmm_dev.h b/sys/riscv/include/vmm_dev.h index 4d30d5a1c35b..a60e545b8f52 100644 --- a/sys/riscv/include/vmm_dev.h +++ b/sys/riscv/include/vmm_dev.h @@ -38,6 +38,8 @@ #include <machine/vmm.h> +#include <dev/vmm/vmm_param.h> + struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ diff --git a/sys/riscv/vmm/vmm.c b/sys/riscv/vmm/vmm.c index a9eb9d144336..23b57ad3b7aa 100644 --- a/sys/riscv/vmm/vmm.c +++ b/sys/riscv/vmm/vmm.c @@ -38,7 +38,6 @@ #include <sys/linker.h> #include <sys/lock.h> #include <sys/malloc.h> -#include <sys/module.h> #include <sys/mutex.h> #include <sys/pcpu.h> #include <sys/proc.h> @@ -121,7 +120,7 @@ struct vm { volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ struct vm_mem mem; /* (i) [m+v] guest memory */ - char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + char name[VM_MAX_NAMELEN + 1]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS]; /* (o) guest MMIO regions */ @@ -133,8 +132,6 @@ struct vm { struct sx vcpus_init_lock; /* (o) */ }; -static bool vmm_initialized = false; - static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); /* statistics */ @@ -146,10 +143,6 @@ static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); -u_int vm_maxcpu; -SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, - &vm_maxcpu, 0, "Maximum number of vCPUs"); - static void vcpu_notify_event_locked(struct vcpu *vcpu); /* global statistics */ @@ -157,12 +150,6 @@ VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq"); VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception"); -/* - * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this - * is a safe value for now. - */ -#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) - static void vcpu_cleanup(struct vcpu *vcpu, bool destroy) { @@ -210,75 +197,18 @@ vm_exitinfo(struct vcpu *vcpu) return (&vcpu->exitinfo); } -static int -vmm_init(void) +int +vmm_modinit(void) { - - vm_maxcpu = mp_ncpus; - - TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); - - if (vm_maxcpu > VM_MAXCPU) { - printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); - vm_maxcpu = VM_MAXCPU; - } - - if (vm_maxcpu == 0) - vm_maxcpu = 1; - return (vmmops_modinit()); } -static int -vmm_handler(module_t mod, int what, void *arg) +int +vmm_modcleanup(void) { - int error; - - switch (what) { - case MOD_LOAD: - error = vmmdev_init(); - if (error != 0) - break; - error = vmm_init(); - if (error == 0) - vmm_initialized = true; - else - (void)vmmdev_cleanup(); - break; - case MOD_UNLOAD: - error = vmmdev_cleanup(); - if (error == 0 && vmm_initialized) { - error = vmmops_modcleanup(); - if (error) { - /* - * Something bad happened - prevent new - * VMs from being created - */ - vmm_initialized = false; - } - } - break; - default: - error = 0; - break; - } - return (error); + return (vmmops_modcleanup()); } -static moduledata_t vmm_kmod = { - "vmm", - vmm_handler, - NULL -}; - -/* - * vmm initialization has the following dependencies: - * - * - vmm device initialization requires an initialized devfs. - */ -DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_DEVFS + 1, SI_ORDER_ANY); -MODULE_VERSION(vmm, 1); - static void vm_init(struct vm *vm, bool create) { @@ -359,16 +289,6 @@ vm_create(const char *name, struct vm **retvm) struct vm *vm; int error; - /* - * If vmm.ko could not be successfully initialized then don't attempt - * to create the virtual machine. - */ - if (!vmm_initialized) - return (ENXIO); - - if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) - return (EINVAL); - vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); error = vm_mem_init(&vm->mem, 0, 1ul << 39); if (error != 0) { diff --git a/sys/sys/bio.h b/sys/sys/bio.h index 74d2b03bd180..5c12c858f3e5 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -37,6 +37,7 @@ #ifndef _SYS_BIO_H_ #define _SYS_BIO_H_ +#include <sys/_exterr.h> #include <sys/queue.h> #include <sys/disk_zone.h> @@ -65,10 +66,12 @@ #define BIO_TRANSIENT_MAPPING 0x20 #define BIO_VLIST 0x40 #define BIO_SWAP 0x200 /* Swap-related I/O */ +#define BIO_EXTERR 0x2000 #define BIO_SPEEDUP_WRITE 0x4000 /* Resource shortage at upper layers */ #define BIO_SPEEDUP_TRIM 0x8000 /* Resource shortage at upper layers */ -#define PRINT_BIO_FLAGS "\20\20speedup_trim\17speedup_write\12swap\7vlist\6transient_mapping\5unmapped" \ +#define PRINT_BIO_FLAGS "\20\20speedup_trim\17speedup_write\16exterr" \ + "\12swap\7vlist\6transient_mapping\5unmapped" \ "\4ordered\3onqueue\2done\1error" @@ -94,7 +97,6 @@ struct bio { struct vm_page **bio_ma; /* Or unmapped. */ int bio_ma_offset; /* Offset in the first page of bio_ma. */ int bio_ma_n; /* Number of pages in bio_ma. */ - int bio_error; /* Errno for BIO_ERROR. */ long bio_resid; /* Remaining I/O in bytes. */ void (*bio_done)(struct bio *); void *bio_driver1; /* Private use by the provider. */ @@ -130,8 +132,12 @@ struct bio { /* XXX: these go away when bio chaining is introduced */ daddr_t bio_pblkno; /* physical block number */ + struct kexterr bio_exterr; }; +/* Errno for BIO_ERROR. */ +#define bio_error bio_exterr.error + struct uio; struct devstat; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 064d5cb05214..f08f05e6d50f 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -37,6 +37,7 @@ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ +#include <sys/_exterr.h> #include <sys/bufobj.h> #include <sys/queue.h> #include <sys/lock.h> @@ -98,7 +99,6 @@ struct buf { long b_bcount; void *b_caller1; caddr_t b_data; - int b_error; uint16_t b_iocmd; /* BIO_* bio_cmd from bio.h */ uint16_t b_ioflags; /* BIO_* bio_flags from bio.h */ off_t b_iooffset; @@ -153,10 +153,12 @@ struct buf { #elif defined(BUF_TRACKING) const char *b_io_tracking; #endif + struct kexterr b_exterr; struct vm_page *b_pages[]; }; #define b_object b_bufobj->bo_object +#define b_error b_exterr.error /* * These flags are kept in b_flags. @@ -390,6 +392,12 @@ struct buf { _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) #endif +#define BUF_EXTERR_FROM_CURTHR(bp) \ + bp->b_exterr = curthread->td_kexterr + +#define BUF_EXTERR_TO_CURTHR(bp) \ + curthread->td_kexterr = bp->b_exterr + #endif /* _KERNEL */ struct buf_queue_head { diff --git a/sys/sys/exterr_cat.h b/sys/sys/exterr_cat.h index 43f31e1d5dd6..318e774542ca 100644 --- a/sys/sys/exterr_cat.h +++ b/sys/sys/exterr_cat.h @@ -21,6 +21,9 @@ #define EXTERR_CAT_BRIDGE 7 #define EXTERR_CAT_SWAP 8 #define EXTERR_CAT_VFSSYSCALL 9 +#define EXTERR_CAT_VFSBIO 10 +#define EXTERR_CAT_GEOMVFS 11 +#define EXTERR_CAT_GEOM 12 #endif diff --git a/sys/sys/exterrvar.h b/sys/sys/exterrvar.h index 6783a0d2d84f..8e2961356a1e 100644 --- a/sys/sys/exterrvar.h +++ b/sys/sys/exterrvar.h @@ -37,6 +37,26 @@ #define SET_ERROR_MSG(mmsg) NULL #endif +#define _SET_ERROR2_KE(kep, eerror, mmsg, pp1, pp2) ({ \ + (kep)->error = (eerror); \ + (kep)->cat = EXTERR_CATEGORY; \ + (kep)->msg = SET_ERROR_MSG(mmsg); \ + (kep)->p1 = (pp1); \ + (kep)->p2 = (pp2); \ + (kep)->src_line = __LINE__; \ + (kep)->error; \ +}) +#define _SET_ERROR0_KE(kep, eerror, mmsg) \ + _SET_ERROR2_KE(kep, eerror, mmsg, 0, 0) +#define _SET_ERROR1_KE(kep, eerror, mmsg, pp1) \ + _SET_ERROR2_KE(kep, eerror, mmsg, pp1, 0) + +#define _EXTERROR_MACRO_KE(kep, eerror, mmsg, _1, _2, NAME, ...) \ + NAME +#define EXTERROR_KE(...) \ + _EXTERROR_MACRO_KE(__VA_ARGS__, _SET_ERROR2_KE, _SET_ERROR1_KE, \ + _SET_ERROR0_KE)(__VA_ARGS__) + #define _SET_ERROR2(eerror, mmsg, pp1, pp2) \ exterr_set(eerror, EXTERR_CATEGORY, SET_ERROR_MSG(mmsg), \ (uintptr_t)(pp1), (uintptr_t)(pp2), __LINE__) @@ -49,6 +69,9 @@ _EXTERROR_MACRO(__VA_ARGS__, _SET_ERROR2, _SET_ERROR1, \ _SET_ERROR0)(__VA_ARGS__) +void exterr_clear(struct kexterr *ke); +void exterr_db_print(struct kexterr *ke); +int exterr_set_from(const struct kexterr *ke); int exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1, uintptr_t pp2, int line); int exterr_to_ue(struct thread *td, struct uexterror *ue); diff --git a/tests/sys/fs/fusefs/fallocate.cc b/tests/sys/fs/fusefs/fallocate.cc index 4e5b047b78b7..1a3a0af36236 100644 --- a/tests/sys/fs/fusefs/fallocate.cc +++ b/tests/sys/fs/fusefs/fallocate.cc @@ -205,7 +205,7 @@ TEST_F(Fspacectl, enosys) EXPECT_EQ(0, fspacectl(fd, SPACECTL_DEALLOC, &rqsr, 0, NULL)); /* Neither should posix_fallocate query the daemon */ - EXPECT_EQ(EINVAL, posix_fallocate(fd, off1, len1)); + EXPECT_EQ(EOPNOTSUPP, posix_fallocate(fd, off1, len1)); leak(fd); } @@ -548,7 +548,7 @@ INSTANTIATE_TEST_SUITE_P(FspacectlCache, FspacectlCache, /* * If the server returns ENOSYS, it indicates that the server does not support - * FUSE_FALLOCATE. This and future calls should return EINVAL. + * FUSE_FALLOCATE. This and future calls should return EOPNOTSUPP. */ TEST_F(PosixFallocate, enosys) { @@ -570,10 +570,10 @@ TEST_F(PosixFallocate, enosys) fd = open(FULLPATH, O_RDWR); ASSERT_LE(0, fd) << strerror(errno); - EXPECT_EQ(EINVAL, posix_fallocate(fd, off0, len0)); + EXPECT_EQ(EOPNOTSUPP, posix_fallocate(fd, off0, len0)); /* Subsequent calls shouldn't query the daemon*/ - EXPECT_EQ(EINVAL, posix_fallocate(fd, off0, len0)); + EXPECT_EQ(EOPNOTSUPP, posix_fallocate(fd, off0, len0)); /* Neither should VOP_DEALLOCATE query the daemon */ EXPECT_EQ(0, fspacectl(fd, SPACECTL_DEALLOC, &rqsr, 0, NULL)); @@ -607,10 +607,10 @@ TEST_F(PosixFallocate, eopnotsupp) fd = open(FULLPATH, O_RDWR); ASSERT_LE(0, fd) << strerror(errno); - EXPECT_EQ(EINVAL, posix_fallocate(fd, fsize, length)); + EXPECT_EQ(EOPNOTSUPP, posix_fallocate(fd, fsize, length)); /* Subsequent calls should still query the daemon*/ - EXPECT_EQ(EINVAL, posix_fallocate(fd, offset, length)); + EXPECT_EQ(EOPNOTSUPP, posix_fallocate(fd, offset, length)); /* And subsequent VOP_DEALLOCATE calls should also query the daemon */ rqsr.r_len = length; @@ -759,7 +759,7 @@ TEST_F(PosixFallocate, rlimit_fsize) } /* With older servers, no FUSE_FALLOCATE should be attempted */ -TEST_F(PosixFallocate_7_18, einval) +TEST_F(PosixFallocate_7_18, eopnotsupp) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; @@ -773,7 +773,7 @@ TEST_F(PosixFallocate_7_18, einval) fd = open(FULLPATH, O_RDWR); ASSERT_LE(0, fd) << strerror(errno); - EXPECT_EQ(EINVAL, posix_fallocate(fd, offset, length)); + EXPECT_EQ(EOPNOTSUPP, posix_fallocate(fd, offset, length)); leak(fd); } |
