9 files changed, 623 insertions, 6 deletions
diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile
index 547a134fc440..c4ab767c8b2f 100644
--- a/lib/libmd/Makefile
+++ b/lib/libmd/Makefile
@@ -108,7 +108,7 @@ CFLAGS+= -DWEAK_REFS
 CFLAGS.skein_block.c+= -DSKEIN_LOOP=995
 .PATH: ${.CURDIR}/${MACHINE_ARCH} ${SRCTOP}/sys/crypto/sha2
 .PATH: ${SRCTOP}/sys/crypto/skein ${SRCTOP}/sys/crypto/skein/${MACHINE_ARCH}
-.PATH: ${SRCTOP}/sys/kern
+.PATH: ${SRCTOP}/sys/crypto
 
 USE_ASM_SOURCES?=1
 .if defined(BOOTSTRAPPING) || ${MK_MACHDEP_OPTIMIZATIONS} == no
@@ -117,6 +117,13 @@ USE_ASM_SOURCES:=0
 .endif
 
 .if ${USE_ASM_SOURCES} != 0
+.if exists(${MACHINE_ARCH}/md5block.S)
+SRCS+=	md5block.S
+CFLAGS+= -DMD5_ASM
+.if exists(${MACHINE_ARCH}/md5dispatch.c)
+SRCS+=	md5dispatch.c
+.endif
+.endif
 .if exists(${MACHINE_ARCH}/sha1block.S)
 SRCS+=	sha1block.S
 CFLAGS+= -DSHA1_ASM
diff --git a/lib/libmd/aarch64/md5block.S b/lib/libmd/aarch64/md5block.S
new file mode 100644
index 000000000000..b928c8dd795a
--- /dev/null
+++ b/lib/libmd/aarch64/md5block.S
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <sys/elf_common.h>
+#include <machine/asm.h>
+
+	/* optimal instruction sequence for k = \key + \m; -key is 2^32 - key */
+.macro	addkm	key, m
+.if 0x100000000 - \key > 0x00ffffff	// -key needs more than 24 bits
+	movz	k, #\key & 0xffff		// low half of key
+	movk	k, #\key >> 16, lsl #16	// high half of key
+	add	k, k, \m			// k = key + m
+.elseif 0x100000000 - \key > 0x0000ffff	// -key needs 17 to 24 bits
+	sub	k, \m, #(0x100000000 - \key) & 0xfff000	// k = m - upper 12 bits
+	sub	k, k, #(0x100000000 - \key) & 0xfff	// k = k - lower 12 bits
+.else					// -key fits in 16 bits
+	movz	k, #0x100000000 - \key		// k = -key
+	sub	k, \m, k			// k = m - (-key)
+.endif
+.endm
+
+.macro	round	a, b, c, d, f, key, m, s	// a = b + rol(a + f(b,c,d) + key + m, s)
+	\f	f, \b, \c, \d		// f = round function applied to b, c, d
+	addkm	\key, \m		// k[i] + m[g]
+	add	\a, \a, k		// k[i] + m[g] + a
+	add	\a, \a, f		// k[i] + m[g] + a + f
+	ror	\a, \a, #32-\s		// rotate left by s bits
+	add	\a, \a, \b		// add b to the rotated sum
+.endm
+
+	/* f = b ? c : d, computed as ((c ^ d) & b) ^ d */
+.macro	f0	f, b, c, d
+	eor	\f, \c, \d		// c ^ d
+	and	\f, \f, \b		// (c ^ d) & b
+	eor	\f, \f, \d		// ((c ^ d) & b) ^ d
+.endm
+
+	/*
+	 * special cased round 1 function; it does not go through the round macro.
+	 * f1 = d ? b : c = (d & b) + (~d & c), both terms are added to a directly.
+	 */
+.macro	round1	a, b, c, d, key, m, s
+	bic	tmp, \c, \d		// ~d & c
+	addkm	\key, \m		// k[i] + m[g]
+	add	\a, \a, k		// k[i] + m[g] + a
+	and	f, \b, \d		// d & b
+	add	\a, \a, tmp		// k[i] + m[g] + a + (~d & c)
+	add	\a, \a, f		// k[i] + m[g] + a + (~d & c) + (d & b)
+	ror	\a, \a, #32-\s		// rotate left by s bits
+	add	\a, \a, \b		// add b to the rotated sum
+.endm
+
+	/* f = b ^ c ^ d */
+.macro	f2	f, b, c, d
+	eor	\f, \c, \d		// c ^ d
+	eor	\f, \f, \b		// c ^ d ^ b
+.endm
+
+	/* f = c ^ (b | ~d) */
+.macro	f3	f, b, c, d
+	orn	\f, \b, \d		// b | ~d
+	eor	\f, \f, \c		// (b | ~d) ^ c
+.endm
+
+	/* do 4 rounds with function \f, cycling the roles of a, b, c, d */
+.macro	rounds	f, m0, m1, m2, m3, s0, s1, s2, s3, k0, k1, k2, k3
+	round	a, b, c, d, \f, \k0, \m0, \s0	// updates a
+	round	d, a, b, c, \f, \k1, \m1, \s1	// updates d
+	round	c, d, a, b, \f, \k2, \m2, \s2	// updates c
+	round	b, c, d, a, \f, \k3, \m3, \s3	// updates b
+.endm
+
+	/* do 4 rounds with f0, f1, f2, f3 (the shift amounts are fixed per macro) */
+.macro	rounds0	m0, m1, m2, m3, k0, k1, k2, k3
+	rounds	f0, \m0, \m1, \m2, \m3, 7, 12, 17, 22, \k0, \k1, \k2, \k3
+.endm
+
+.macro	rounds1	m0, m1, m2, m3, k0, k1, k2, k3	// 4 rounds with f1, via round1
+	round1	a, b, c, d, \k0, \m0,  5	// updates a
+	round1	d, a, b, c, \k1, \m1,  9	// updates d
+	round1	c, d, a, b, \k2, \m2, 14	// updates c
+	round1	b, c, d, a, \k3, \m3, 20	// updates b
+.endm
+
+.macro	rounds2	m0, m1, m2, m3, k0, k1, k2, k3	// 4 rounds with f2
+	rounds	f2, \m0, \m1, \m2, \m3, 4, 11, 16, 23, \k0, \k1, \k2, \k3
+.endm
+
+.macro	rounds3	m0, m1, m2, m3, k0, k1, k2, k3	// 4 rounds with f3
+	rounds	f3, \m0, \m1, \m2, \m3, 6, 10, 15, 21, \k0, \k1, \k2, \k3
+.endm
+
+	/* md5block(MD5_CTX, buf, len): process the whole 64-byte blocks of buf */
+ENTRY(_libmd_md5block)
+ctx	.req	x0			// MD5 state a, b, c, d at offsets 0, 4, 8, 12
+buf	.req	x1			// input pointer, advanced by 64 per block
+len	.req	x2			// input length in bytes
+end	.req	x2			// aliases len
+a	.req	w3			// working state a, b, c, d
+b	.req	w4
+c	.req	w5
+d	.req	w6
+f	.req	w7			// round function result
+tmp	.req	w8			// scratch register of round1
+k	.req	w9			// k[i] + m[g], set up by addkm
+m0	.req	w10			// m0..m15: the 16 message words of a block
+m1	.req	w11
+m2	.req	w12
+m3	.req	w13
+m4	.req	w14
+m5	.req	w15
+m6	.req	w16
+m7	.req	w17
+					// x18 is the platform register
+m8	.req	w19			// x19..x26 are saved on the stack below
+m9	.req	w20
+m10	.req	w21
+m11	.req	w22
+m12	.req	w23
+m13	.req	w24
+m14	.req	w25
+m15	.req	w26
+
+a_	.req	m0			// old state, reloaded from ctx after the
+b_	.req	m7			// rounds when the message registers are
+c_	.req	m14			// no longer needed
+d_	.req	m5
+
+	stp	x19, x20, [sp, #-0x40]!	// allocate 64 bytes, save x19..x26
+	stp	x21, x22, [sp, #0x10]
+	stp	x23, x24, [sp, #0x20]
+	stp	x25, x26, [sp, #0x30]
+
+	bics	len, len, #63		// round len down to whole blocks
+	add	end, buf, len		// end pointer
+
+	beq	.Lend			// was len == 0 after BICS?
+
+	ldp	a, b, [ctx, #0]		// load the current state
+	ldp	c, d, [ctx, #8]
+
+	/* first eight rounds interleaved with data loads */
+.Lloop:	ldp	m0, m1, [buf, #0]
+	round	a, b, c, d, f0, 0xd76aa478, m0,  7
+	ldp	m2, m3, [buf, #8]
+	round	d, a, b, c, f0, 0xe8c7b756, m1, 12
+	ldp	m4, m5, [buf, #16]
+	round	c, d, a, b, f0, 0x242070db, m2, 17
+	ldp	m6, m7, [buf, #24]
+	round	b, c, d, a, f0, 0xc1bdceee, m3, 22
+
+	ldp	m8, m9, [buf, #32]
+	round	a, b, c, d, f0, 0xf57c0faf, m4,  7
+	ldp	m10, m11, [buf, #40]
+	round	d, a, b, c, f0, 0x4787c62a, m5, 12
+	ldp	m12, m13, [buf, #48]
+	round	c, d, a, b, f0, 0xa8304613, m6, 17
+	ldp	m14, m15, [buf, #56]
+	round	b, c, d, a, f0, 0xfd469501, m7, 22
+
+	/* remaining rounds use the roundsX macros */
+	rounds0	 m8,  m9, m10, m11, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+	rounds0	m12, m13, m14, m15, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+
+	rounds1	 m1,  m6, m11,  m0, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+	rounds1	 m5, m10, m15,  m4, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+	rounds1	 m9, m14,  m3,  m8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+	rounds1	m13,  m2,  m7, m12, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+
+	rounds2	 m5,  m8, m11, m14, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+	rounds2	 m1,  m4,  m7, m10, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+	rounds2	m13,  m0,  m3,  m6, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+	rounds2	 m9, m12, m15,  m2, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+
+	rounds3	 m0,  m7, m14,  m5, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+	rounds3	m12,  m3, m10,  m1, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+	rounds3	 m8, m15,  m6, m13, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+	rounds3	 m4, m11,  m2,  m9, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+
+	ldp	a_, b_, [ctx, #0]	// reload the state from before this block
+	ldp	c_, d_, [ctx, #8]
+	add	a, a, a_		// add it to the working state
+	add	b, b, b_
+	add	c, c, c_
+	add	d, d, d_
+	stp	a, b, [ctx, #0]		// store the updated state
+	stp	c, d, [ctx, #8]
+
+	add	buf, buf, #64		// advance to the next block
+	cmp	buf, end
+	bne	.Lloop			// loop until the end pointer is reached
+
+.Lend:	ldp	x25, x26, [sp, #0x30]	// restore x19..x26 and free the frame
+	ldp	x23, x24, [sp, #0x20]
+	ldp	x21, x22, [sp, #0x10]
+	ldp	x19, x20, [sp], #0x40
+
+	ret
+END(_libmd_md5block)
+
+GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL)
+
+	.section .note.GNU-stack,"",%progbits
diff --git a/lib/libmd/aarch64/sha1block.S b/lib/libmd/aarch64/sha1block.S
index 56a0297efadd..e16fb36342fd 100644
--- a/lib/libmd/aarch64/sha1block.S
+++ b/lib/libmd/aarch64/sha1block.S
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  *
diff --git a/lib/libmd/aarch64/sha1dispatch.c b/lib/libmd/aarch64/sha1dispatch.c
index e34bf0a1a344..045527044320 100644
--- a/lib/libmd/aarch64/sha1dispatch.c
+++ b/lib/libmd/aarch64/sha1dispatch.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
diff --git a/lib/libmd/amd64/md5block.S b/lib/libmd/amd64/md5block.S
new file mode 100644
index 000000000000..0dd594dd5dc2
--- /dev/null
+++ b/lib/libmd/amd64/md5block.S
@@ -0,0 +1,363 @@
+/*-
+ * Copyright (c) 2024, 2025 Robert Clausecker <fuz@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/asm.h>
+
+/* apply the round keys to the four round functions: \rfnN i, k[i], ..., k[i+3] */
+.macro	allrounds	rfn0, rfn1, rfn2, rfn3
+	\rfn0	 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
+	\rfn0	 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
+	\rfn0	 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+	\rfn0	12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+
+	\rfn1	16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+	\rfn1	20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+	\rfn1	24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+	\rfn1	28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+
+	\rfn2	32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+	\rfn2	36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+	\rfn2	40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+	\rfn2	44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+
+	\rfn3	48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+	\rfn3	52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+	\rfn3	56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+	\rfn3	60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+.endm
+
+	// md5block(MD5_CTX, buf, len): ctx in %rdi, buf in %rsi, len in %rdx
+ENTRY(_libmd_md5block_baseline)
+.macro	round	a, b, c, d, f, k, m, s
+	\f	%ebp, \b, \c, \d	// %ebp = round function of b, c, d
+	add	$\k, \a			// a + k[i]
+	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
+	add	%ebp, \a		// a + k[i] + m[g] + f
+	rol	$\s, \a			// rotate left by s bits
+	add	\b, \a			// add b to the rotated sum
+.endm
+
+	// f = b ? c : d, computed as ((c ^ d) & b) ^ d
+.macro	f0	f, b, c, d
+	mov	\c, \f
+	xor	\d, \f
+	and	\b, \f
+	xor	\d, \f
+.endm
+
+	// f = d ? b : c, computed as ((c ^ b) & d) ^ c
+.macro	f1	f, b, c, d
+	mov	\c, \f
+	xor	\b, \f
+	and	\d, \f
+	xor	\c, \f
+.endm
+
+	// f = b ^ c ^ d
+.macro	f2	f, b, c, d
+	mov	\c, \f
+	xor	\d, \f
+	xor	\b, \f
+.endm
+
+	// f = c ^ (b | ~d)
+.macro	f3	f, b, c, d
+	mov	$-1, \f
+	xor	\d, \f			// -1 ^ d = ~d
+	or	\b, \f			// b | ~d
+	xor	\c, \f			// c ^ (b | ~d)
+.endm
+
+	// do 4 rounds on message words q, p+q, 2p+q, 3p+q (taken mod 16)
+.macro	rounds	f, p, q, s0, s1, s2, s3, k0, k1, k2, k3
+	round	%eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0
+	round	%edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1
+	round	%ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2
+	round	%ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3
+.endm
+
+	// do 4 rounds with f0, f1, f2, f3; i is the index of the first round
+.macro	rounds0	i, k0, k1, k2, k3
+	rounds	f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3
+.endm
+
+.macro	rounds1	i, k0, k1, k2, k3
+	rounds	f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3
+.endm
+
+.macro	rounds2	i, k0, k1, k2, k3
+	rounds	f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3
+.endm
+
+.macro	rounds3	i, k0, k1, k2, k3
+	rounds	f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3
+.endm
+
+	push	%rbx			// save the registers restored at .Lend
+	push	%rbp
+	push	%r12
+
+	and	$~63, %rdx		// length in blocks
+	lea	(%rsi, %rdx, 1), %r12	// end pointer
+
+	mov	(%rdi), %eax		// a
+	mov	4(%rdi), %ebx		// b
+	mov	8(%rdi), %ecx		// c
+	mov	12(%rdi), %edx		// d
+
+	cmp	%rsi, %r12		// any data to process?
+	je	.Lend
+
+	.balign	16
+.Lloop:	mov	%eax, %r8d		// stash the old state in %r8d..%r11d
+	mov	%ebx, %r9d
+	mov	%ecx, %r10d
+	mov	%edx, %r11d
+
+	allrounds	rounds0, rounds1, rounds2, rounds3
+
+	add	%r8d, %eax		// add the old state to the new one
+	add	%r9d, %ebx
+	add	%r10d, %ecx
+	add	%r11d, %edx
+
+	add	$64, %rsi		// advance to the next block
+	cmp	%rsi, %r12
+	jne	.Lloop
+
+	mov	%eax, (%rdi)		// write the state back after the last block
+	mov	%ebx, 4(%rdi)
+	mov	%ecx, 8(%rdi)
+	mov	%edx, 12(%rdi)
+
+.Lend:	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+END(_libmd_md5block_baseline)
+
+	/*
+	 * An implementation leveraging the ANDN instruction
+	 * from BMI1 to shorten some dependency chains.
+	 */
+ENTRY(_libmd_md5block_bmi1)
+	// special-cased round 1
+	// f1 = d ? b : c = (d & b) + (~d & c)
+.macro	round1	a, b, c, d, k, m, s
+	andn	\c, \d, %edi		// ~d & c
+	add	$\k, \a			// a + k[i]
+	mov	\d, %ebp
+	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
+	and	\b, %ebp		// d & b
+	add	%edi, \a		// a + k[i] + m[g] + (~d & c)
+	add	%ebp, \a		// a + k[i] + m[g] + (~d & c) + (d & b)
+	rol	$\s, \a			// rotate left by s bits
+	add	\b, \a			// add b to the rotated sum
+.endm
+
+	// special-cased round 3
+	// f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d)
+.macro	round3	a, b, c, d, k, m, s
+	andn	\d, \b, %ebp		// ~b & d
+	add	$\k - 1, \a		// a + k[i] - 1
+	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
+	xor	\c, %ebp		// c ^ ~b & d
+	sub	%ebp, \a		// a + k[i] + m[g] + f
+	rol	$\s, \a			// rotate left by s bits
+	add	\b, \a			// add b to the rotated sum
+.endm
+
+	.purgem	rounds1			// redefine rounds1 on top of round1
+.macro	rounds1	i, k0, k1, k2, k3
+	round1	%eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1,  5
+	round1	%edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6,  9
+	round1	%ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14
+	round1	%ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20
+.endm
+
+	.purgem	rounds3			// redefine rounds3 on top of round3
+.macro	rounds3	i, k0, k1, k2, k3
+	round3	%eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0,  6
+	round3	%edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10
+	round3	%ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15
+	round3	%ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21
+.endm
+
+	push	%rbx			// save the registers restored at label 0
+	push	%rbp
+	push	%r12
+
+	and	$~63, %rdx		// length in blocks
+	lea	(%rsi, %rdx, 1), %r12	// end pointer
+
+	mov	(%rdi), %eax		// a
+	mov	4(%rdi), %ebx		// b
+	mov	8(%rdi), %ecx		// c
+	mov	12(%rdi), %edx		// d
+
+	cmp	%rsi, %r12		// any data to process?
+	je	0f
+
+	push	%rdi			// round1 overwrites %edi, keep ctx on the stack
+
+	.balign	16
+1:	mov	%eax, %r8d		// stash the old state in %r8d..%r11d
+	mov	%ebx, %r9d
+	mov	%ecx, %r10d
+	mov	%edx, %r11d
+
+	allrounds	rounds0, rounds1, rounds2, rounds3
+
+	add	%r8d, %eax		// add the old state to the new one
+	add	%r9d, %ebx
+	add	%r10d, %ecx
+	add	%r11d, %edx
+
+	add	$64, %rsi		// advance to the next block
+	cmp	%rsi, %r12
+	jne	1b
+
+	pop	%rdi			// get ctx back for the final store
+	mov	%eax, (%rdi)
+	mov	%ebx, 4(%rdi)
+	mov	%ecx, 8(%rdi)
+	mov	%edx, 12(%rdi)
+
+0:	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+END(_libmd_md5block_bmi1)
+
+#ifndef _KERNEL
+	/*
+	 * An implementation leveraging AVX-512 for its VPTERNLOGD
+	 * instruction.  We're using only XMM registers here,
+	 * avoiding costly thermal licensing.
+	 */
+ENTRY(_libmd_md5block_avx512)
+.macro	vround		a, b, c, d, f, i, m, mi, s
+	vmovdqa		\b, %xmm4		// xmm4 = b, first VPTERNLOGD input
+	vpternlogd	$\f, \d, \c, %xmm4	// xmm4 = f(b, c, d) by truth table f
+	vpaddd		4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i]
+.if	\mi != 0
+	vpshufd		$0x55 * \mi, %xmm5, %xmm5	// broadcast to each dword
+.endif
+	vpaddd		%xmm5, \a, \a		// a + k[i] + m[g]
+	vpaddd		%xmm4, \a, \a		// a + k[i] + m[g] + f
+	vprold		$\s, \a, \a		// rotate left by s bits
+	vpaddd		\b, \a, \a		// add b to the rotated sum
+.endm
+
+.macro	vrounds		f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3
+	vround		%xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0
+	vround		%xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1
+	vround		%xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2
+	vround		%xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3
+.endm
+
+/*
+ * d c b f0 f1 f2 f3
+ * 0 0 0  0  0  0  1
+ * 1 0 0  1  0  1  0
+ * 0 1 0  0  1  1  0
+ * 1 1 0  1  0  0  1
+ * 0 0 1  0  0  1  1
+ * 1 0 1  0  1  0  1
+ * 0 1 1  1  1  0  0
+ * 1 1 1  1  1  1  0
+ */
+
+.macro	vrounds0	i, m
+	vrounds		0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22
+.endm
+
+.macro	vrounds1	i, m0, i0, m1, i1, m2, i2, m3, i3
+	vrounds		0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20
+.endm
+
+.macro	vrounds2	i, m0, i0, m1, i1, m2, i2, m3, i3
+	vrounds		0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23
+.endm
+
+.macro	vrounds3	i, m0, i0, m1, i1, m2, i2, m3, i3
+	vrounds		0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21
+.endm
+
+	and		$~63, %rdx		// length in blocks
+	add		%rsi, %rdx		// end pointer
+
+	vmovd		(%rdi), %xmm0		// a
+	vmovd		4(%rdi), %xmm1		// b
+	vmovd		8(%rdi), %xmm2		// c
+	vmovd		12(%rdi), %xmm3		// d
+
+	lea		keys(%rip), %rax	// round key table indexed by vround
+
+	cmp		%rsi, %rdx		// any data to process?
+	je		0f
+
+	.balign		16
+1:	vmovdqu		0*4(%rsi), %xmm8	// message words
+	vmovdqu		4*4(%rsi), %xmm9
+	vmovdqu		8*4(%rsi), %xmm10
+	vmovdqu		12*4(%rsi), %xmm11
+
+	vmovdqa		%xmm0, %xmm12		// stash old state variables
+	vmovdqa		%xmm1, %xmm13
+	vmovdqa		%xmm2, %xmm14
+	vmovdqa		%xmm3, %xmm15
+
+	vrounds0	 0, %xmm8			// m0, m1, m2, m3
+	vrounds0	 4, %xmm9			// m4, m5, m6, m7
+	vrounds0	 8, %xmm10			// m8, m9, m10, m11
+	vrounds0	12, %xmm11			// m12, m13, m14, m15
+
+	vrounds1	16,  %xmm8, 1,  %xmm9, 2, %xmm10, 3,  %xmm8, 0	// m1, m6, m11, m0
+	vrounds1	20,  %xmm9, 1, %xmm10, 2, %xmm11, 3,  %xmm9, 0	// m5, m10, m15, m4
+	vrounds1	24, %xmm10, 1, %xmm11, 2,  %xmm8, 3, %xmm10, 0	// m9, m14, m3, m8
+	vrounds1	28, %xmm11, 1,  %xmm8, 2,  %xmm9, 3, %xmm11, 0	// m13, m2, m7, m12
+
+	vrounds2	32,  %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2	// m5, m8, m11, m14
+	vrounds2	36,  %xmm8, 1,  %xmm9, 0,  %xmm9, 3, %xmm10, 2	// m1, m4, m7, m10
+	vrounds2	40, %xmm11, 1,  %xmm8, 0,  %xmm8, 3,  %xmm9, 2	// m13, m0, m3, m6
+	vrounds2	44, %xmm10, 1, %xmm11, 0, %xmm11, 3,  %xmm8, 2	// m9, m12, m15, m2
+
+	vrounds3	48,  %xmm8, 0,  %xmm9, 3, %xmm11, 2,  %xmm9, 1	// m0, m7, m14, m5
+	vrounds3	52, %xmm11, 0,  %xmm8, 3, %xmm10, 2,  %xmm8, 1	// m12, m3, m10, m1
+	vrounds3	56, %xmm10, 0, %xmm11, 3,  %xmm9, 2, %xmm11, 1	// m8, m15, m6, m13
+	vrounds3	60,  %xmm9, 0, %xmm10, 3,  %xmm8, 2, %xmm10, 1	// m4, m11, m2, m9
+
+	vpaddd		%xmm12, %xmm0, %xmm0	// add the old state to the new one
+	vpaddd		%xmm13, %xmm1, %xmm1
+	vpaddd		%xmm14, %xmm2, %xmm2
+	vpaddd		%xmm15, %xmm3, %xmm3
+
+	add		$64, %rsi		// advance to the next block
+	cmp		%rsi, %rdx
+	jne		1b
+
+	vmovd		%xmm0, (%rdi)		// write the state back to ctx
+	vmovd		%xmm1, 4(%rdi)
+	vmovd		%xmm2, 8(%rdi)
+	vmovd		%xmm3, 12(%rdi)
+
+0:	ret
+END(_libmd_md5block_avx512)
+
+	// round keys, for use in md5block_avx512 (4 bytes each, in allrounds order)
+	.section	.rodata
+	.balign		16
+
+.macro	putkeys		i, a, b, c, d
+	.4byte		\a, \b, \c, \d	// emit the four keys, the index i is unused
+.endm
+
+keys:	allrounds	putkeys, putkeys, putkeys, putkeys
+	.size		keys, .-keys
+#endif /* !defined(_KERNEL) */
+
+	.section .note.GNU-stack,"",%progbits
diff --git a/lib/libmd/amd64/md5dispatch.c b/lib/libmd/amd64/md5dispatch.c
new file mode 100644
index 000000000000..dd2131c5a57c
--- /dev/null
+++ b/lib/libmd/amd64/md5dispatch.c
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <sys/types.h>
+#include <sys/md5.h>
+
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+#include <stdint.h>
+#include <string.h>
+#include <x86/ifunc.h>
+
+extern void _libmd_md5block_baseline(MD5_CTX *, const void *, size_t);
+extern void _libmd_md5block_bmi1(MD5_CTX *, const void *, size_t);
+extern void _libmd_md5block_avx512(MD5_CTX *, const void *, size_t);
+
+DEFINE_UIFUNC(, void, _libmd_md5block, (MD5_CTX *, const void *, size_t))
+{	/* resolver: returns the md5block kernel to use on this CPU */
+	if ((cpu_stdext_feature & (CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL))
+	    == (CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL)) {
+		u_int regs[4];
+		char cpu_vendor[12];
+
+		do_cpuid(0, regs);	/* vendor string: regs[1], regs[3], regs[2] */
+		memcpy(cpu_vendor + 0, &regs[1], sizeof(regs[1]));
+		memcpy(cpu_vendor + 4, &regs[3], sizeof(regs[3]));
+		memcpy(cpu_vendor + 8, &regs[2], sizeof(regs[2]));
+
+		/* the AVX-512 kernel performs poorly on AMD */
+		if (memcmp(cpu_vendor, AMD_VENDOR_ID, sizeof(cpu_vendor)) != 0)
+			return (_libmd_md5block_avx512);
+	}
+
+	if (cpu_stdext_feature & CPUID_STDEXT_BMI1)
+		return (_libmd_md5block_bmi1);	/* ANDN is available */
+	else
+		return (_libmd_md5block_baseline);
+}
diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S
index f1291ef2647a..6ef083178abc 100644
--- a/lib/libmd/amd64/sha1block.S
+++ b/lib/libmd/amd64/sha1block.S
@@ -1,6 +1,6 @@
 /*-
  * Copyright (c) 2013 The Go Authors. All rights reserved.
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
  *
  * Adapted from Go's crypto/sha1/sha1block_amd64.s.
  *
diff --git a/lib/libmd/amd64/sha1dispatch.c b/lib/libmd/amd64/sha1dispatch.c
index 86509195d56e..c82a60334739 100644
--- a/lib/libmd/amd64/sha1dispatch.c
+++ b/lib/libmd/amd64/sha1dispatch.c
@@ -1,6 +1,6 @@
 /*-
  * Copyright (c) 2016 The Go Authors. All rights reserved.
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
  *
  * Adapted from Go's crypto/sha1/sha1block_amd64.go.
  *
diff --git a/lib/libmd/sha1c.c b/lib/libmd/sha1c.c
index 128e0b991742..02132d720dac 100644
--- a/lib/libmd/sha1c.c
+++ b/lib/libmd/sha1c.c
@@ -1,6 +1,6 @@
 /*-
  * Copyright (c) 2009 The Go Authors. All rights reserved.
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
  *
  * Adapted from Go's crypto/sha1/sha1.go.
  *