about summary refs log tree commit diff
path: root/lib/libmd
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libmd')
-rw-r--r-- lib/libmd/Makefile 9
-rw-r--r-- lib/libmd/aarch64/md5block.S 206
-rw-r--r-- lib/libmd/aarch64/sha1block.S 2
-rw-r--r-- lib/libmd/aarch64/sha1dispatch.c 2
-rw-r--r-- lib/libmd/amd64/md5block.S 363
-rw-r--r-- lib/libmd/amd64/md5dispatch.c 41
-rw-r--r-- lib/libmd/amd64/sha1block.S 2
-rw-r--r-- lib/libmd/amd64/sha1dispatch.c 2
-rw-r--r-- lib/libmd/sha1c.c 2
9 files changed, 623 insertions, 6 deletions
diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile
index 547a134fc440..c4ab767c8b2f 100644
--- a/lib/libmd/Makefile
+++ b/lib/libmd/Makefile
@@ -108,7 +108,7 @@ CFLAGS+= -DWEAK_REFS
CFLAGS.skein_block.c+= -DSKEIN_LOOP=995
.PATH: ${.CURDIR}/${MACHINE_ARCH} ${SRCTOP}/sys/crypto/sha2
.PATH: ${SRCTOP}/sys/crypto/skein ${SRCTOP}/sys/crypto/skein/${MACHINE_ARCH}
-.PATH: ${SRCTOP}/sys/kern
+.PATH: ${SRCTOP}/sys/crypto
USE_ASM_SOURCES?=1
.if defined(BOOTSTRAPPING) || ${MK_MACHDEP_OPTIMIZATIONS} == no
@@ -117,6 +117,13 @@ USE_ASM_SOURCES:=0
.endif
.if ${USE_ASM_SOURCES} != 0
+.if exists(${MACHINE_ARCH}/md5block.S)
+SRCS+= md5block.S
+CFLAGS+= -DMD5_ASM
+.if exists(${MACHINE_ARCH}/md5dispatch.c)
+SRCS+= md5dispatch.c
+.endif
+.endif
.if exists(${MACHINE_ARCH}/sha1block.S)
SRCS+= sha1block.S
CFLAGS+= -DSHA1_ASM
diff --git a/lib/libmd/aarch64/md5block.S b/lib/libmd/aarch64/md5block.S
new file mode 100644
index 000000000000..b928c8dd795a
--- /dev/null
+++ b/lib/libmd/aarch64/md5block.S
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <sys/elf_common.h>
+#include <machine/asm.h>
+
+# optimal instruction sequence for k = \key + \m (mod 2^32); only k is written
+.macro addkm key, m
+.if 0x100000000 - \key > 0x00ffffff // -key needs more than 24 bits
+ movz k, #\key & 0xffff // k = low 16 bits of key
+ movk k, #\key >> 16, lsl #16 // k = key
+ add k, k, \m // k = key + m
+.elseif 0x100000000 - \key > 0x0000ffff // -key needs 17 to 24 bits
+ sub k, \m, #(0x100000000 - \key) & 0xfff000 // k = m - (bits 12..23 of -key)
+ sub k, k, #(0x100000000 - \key) & 0xfff // k = m - (-key) = key + m
+.else // -key fits into 16 bits
+ movz k, #0x100000000 - \key // k = -key
+ sub k, \m, k // k = m - (-key) = key + m
+.endif
+.endm
+
+.macro round a, b, c, d, f, key, m, s
+ \f f, \b, \c, \d // f = round function applied to b, c, d
+ addkm \key, \m // k[i] + m[g]
+ add \a, \a, k // k[i] + m[g] + a
+ add \a, \a, f // k[i] + m[g] + a + f
+ ror \a, \a, #32-\s // rotate left by s
+ add \a, \a, \b // a = b + rol(k[i] + m[g] + a + f, s)
+.endm
+
+ /* f = b ? c : d, computed as ((c ^ d) & b) ^ d */
+.macro f0 f, b, c, d
+ eor \f, \c, \d // c ^ d
+ and \f, \f, \b // (c ^ d) & b
+ eor \f, \f, \d // ((c ^ d) & b) ^ d
+.endm
+
+ /*
+ * special cased round 1 function; the two terms of f1 share no bits,
+ * f1 = d ? b : c = (d & b) + (~d & c), so each is added to a on its own
+ */
+.macro round1 a, b, c, d, key, m, s
+ bic tmp, \c, \d // ~d & c
+ addkm \key, \m // k[i] + m[g]
+ add \a, \a, k // k[i] + m[g] + a
+ and f, \b, \d // d & b
+ add \a, \a, tmp // k[i] + m[g] + a + (~d & c)
+ add \a, \a, f // k[i] + m[g] + a + (~d & c) + (d & b)
+ ror \a, \a, #32-\s // rotate left by s
+ add \a, \a, \b // a = b + rol(k[i] + m[g] + a + f1, s)
+.endm
+
+ /* f = b ^ c ^ d */
+.macro f2 f, b, c, d
+ eor \f, \c, \d // c ^ d
+ eor \f, \f, \b // b ^ c ^ d
+.endm
+
+ /* f = c ^ (b | ~d) */
+.macro f3 f, b, c, d
+ orn \f, \b, \d // b | ~d
+ eor \f, \f, \c // c ^ (b | ~d)
+.endm
+
+ /* do 4 rounds with round function f, rotating the roles of a, b, c, d */
+.macro rounds f, m0, m1, m2, m3, s0, s1, s2, s3, k0, k1, k2, k3
+ round a, b, c, d, \f, \k0, \m0, \s0 // updates a
+ round d, a, b, c, \f, \k1, \m1, \s1 // updates d
+ round c, d, a, b, \f, \k2, \m2, \s2 // updates c
+ round b, c, d, a, \f, \k3, \m3, \s3 // updates b
+.endm
+
+ /* do 4 rounds with f0, f1, f2, f3; each macro supplies the rotate amounts */
+.macro rounds0 m0, m1, m2, m3, k0, k1, k2, k3
+ rounds f0, \m0, \m1, \m2, \m3, 7, 12, 17, 22, \k0, \k1, \k2, \k3
+.endm
+
+.macro rounds1 m0, m1, m2, m3, k0, k1, k2, k3
+ round1 a, b, c, d, \k0, \m0, 5 // round1 replaces round with a generic f1
+ round1 d, a, b, c, \k1, \m1, 9
+ round1 c, d, a, b, \k2, \m2, 14
+ round1 b, c, d, a, \k3, \m3, 20
+.endm
+
+.macro rounds2 m0, m1, m2, m3, k0, k1, k2, k3
+ rounds f2, \m0, \m1, \m2, \m3, 4, 11, 16, 23, \k0, \k1, \k2, \k3
+.endm
+
+.macro rounds3 m0, m1, m2, m3, k0, k1, k2, k3
+ rounds f3, \m0, \m1, \m2, \m3, 6, 10, 15, 21, \k0, \k1, \k2, \k3
+.endm
+
+ /* md5block(MD5_CTX, buf, len): hash the whole 64 byte blocks of buf into ctx */
+ENTRY(_libmd_md5block)
+ctx .req x0 // state words a, b, c, d are read from offsets 0..15
+buf .req x1 // advanced by 64 after each block
+len .req x2
+end .req x2 // aliases len
+a .req w3
+b .req w4
+c .req w5
+d .req w6
+f .req w7 // round function result
+tmp .req w8 // second scratch of round1
+k .req w9 // written by addkm
+m0 .req w10 // m0..m15: message words of the current block
+m1 .req w11
+m2 .req w12
+m3 .req w13
+m4 .req w14
+m5 .req w15
+m6 .req w16
+m7 .req w17
+ // x18 is the platform register
+m8 .req w19 // x19..x26 are saved and restored around the body
+m9 .req w20
+m10 .req w21
+m11 .req w22
+m12 .req w23
+m13 .req w24
+m14 .req w25
+m15 .req w26
+
+a_ .req m0 // old state for the final addition, held in message
+b_ .req m7 // registers that are reloaded at the top of the loop
+c_ .req m14
+d_ .req m5
+
+ stp x19, x20, [sp, #-0x40]! // allocate 64 bytes and save x19..x26
+ stp x21, x22, [sp, #0x10]
+ stp x23, x24, [sp, #0x20]
+ stp x25, x26, [sp, #0x30]
+
+ bics len, len, #63 // byte count of the whole blocks, Z set if zero
+ add end, buf, len // end pointer
+
+ beq .Lend // was len == 0 after BICS?
+
+ ldp a, b, [ctx, #0] // load the current state
+ ldp c, d, [ctx, #8]
+
+ /* first eight rounds interleaved with data loads */
+.Lloop: ldp m0, m1, [buf, #0]
+ round a, b, c, d, f0, 0xd76aa478, m0, 7
+ ldp m2, m3, [buf, #8]
+ round d, a, b, c, f0, 0xe8c7b756, m1, 12
+ ldp m4, m5, [buf, #16]
+ round c, d, a, b, f0, 0x242070db, m2, 17
+ ldp m6, m7, [buf, #24]
+ round b, c, d, a, f0, 0xc1bdceee, m3, 22
+
+ ldp m8, m9, [buf, #32]
+ round a, b, c, d, f0, 0xf57c0faf, m4, 7
+ ldp m10, m11, [buf, #40]
+ round d, a, b, c, f0, 0x4787c62a, m5, 12
+ ldp m12, m13, [buf, #48]
+ round c, d, a, b, f0, 0xa8304613, m6, 17
+ ldp m14, m15, [buf, #56]
+ round b, c, d, a, f0, 0xfd469501, m7, 22
+
+ /* remaining rounds use the roundsX macros */
+ rounds0 m8, m9, m10, m11, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+ rounds0 m12, m13, m14, m15, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+
+ rounds1 m1, m6, m11, m0, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+ rounds1 m5, m10, m15, m4, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+ rounds1 m9, m14, m3, m8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+ rounds1 m13, m2, m7, m12, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+
+ rounds2 m5, m8, m11, m14, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+ rounds2 m1, m4, m7, m10, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+ rounds2 m13, m0, m3, m6, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+ rounds2 m9, m12, m15, m2, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+
+ rounds3 m0, m7, m14, m5, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+ rounds3 m12, m3, m10, m1, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+ rounds3 m8, m15, m6, m13, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+ rounds3 m4, m11, m2, m9, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+
+ ldp a_, b_, [ctx, #0] // reload the state from before this block
+ ldp c_, d_, [ctx, #8]
+ add a, a, a_ // add it to the result of the rounds
+ add b, b, b_
+ add c, c, c_
+ add d, d, d_
+ stp a, b, [ctx, #0] // write back the new state
+ stp c, d, [ctx, #8]
+
+ add buf, buf, #64 // advance to the next block
+ cmp buf, end
+ bne .Lloop // loop until buf reaches the end pointer
+
+.Lend: ldp x25, x26, [sp, #0x30] // restore x19..x26 and release the frame
+ ldp x23, x24, [sp, #0x20]
+ ldp x21, x22, [sp, #0x10]
+ ldp x19, x20, [sp], #0x40
+
+ ret
+END(_libmd_md5block)
+
+GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL)
+
+ .section .note.GNU-stack,"",%progbits
diff --git a/lib/libmd/aarch64/sha1block.S b/lib/libmd/aarch64/sha1block.S
index 56a0297efadd..e16fb36342fd 100644
--- a/lib/libmd/aarch64/sha1block.S
+++ b/lib/libmd/aarch64/sha1block.S
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*
diff --git a/lib/libmd/aarch64/sha1dispatch.c b/lib/libmd/aarch64/sha1dispatch.c
index e34bf0a1a344..045527044320 100644
--- a/lib/libmd/aarch64/sha1dispatch.c
+++ b/lib/libmd/aarch64/sha1dispatch.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
diff --git a/lib/libmd/amd64/md5block.S b/lib/libmd/amd64/md5block.S
new file mode 100644
index 000000000000..0dd594dd5dc2
--- /dev/null
+++ b/lib/libmd/amd64/md5block.S
@@ -0,0 +1,363 @@
+/*-
+ * Copyright (c) 2024, 2025 Robert Clausecker <fuz@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/asm.h>
+
+/* apply the round keys to the four round functions; each call gets a round index and 4 keys */
+.macro allrounds rfn0, rfn1, rfn2, rfn3
+ \rfn0 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee // rounds 0..15
+ \rfn0 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
+ \rfn0 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+ \rfn0 12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+
+ \rfn1 16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa // rounds 16..31
+ \rfn1 20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+ \rfn1 24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+ \rfn1 28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+
+ \rfn2 32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c // rounds 32..47
+ \rfn2 36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+ \rfn2 40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+ \rfn2 44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+
+ \rfn3 48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 // rounds 48..63
+ \rfn3 52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+ \rfn3 56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+ \rfn3 60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+.endm
+
+ // md5block(MD5_CTX, buf, len); a, b, c, d live in %eax, %ebx, %ecx, %edx
+ENTRY(_libmd_md5block_baseline)
+.macro round a, b, c, d, f, k, m, s
+ \f %ebp, \b, \c, \d // f = round function applied to b, c, d
+ add $\k, \a // a + k[i]
+ add ((\m)%16*4)(%rsi), \a // a + k[i] + m[g]
+ add %ebp, \a // a + k[i] + m[g] + f
+ rol $\s, \a // rotate left by s
+ add \b, \a // a = b + rol(a + k[i] + m[g] + f, s)
+.endm
+
+ // f = b ? c : d, computed as ((c ^ d) & b) ^ d
+.macro f0 f, b, c, d
+ mov \c, \f
+ xor \d, \f
+ and \b, \f
+ xor \d, \f
+.endm
+
+ // f = d ? b : c, computed as ((c ^ b) & d) ^ c
+.macro f1 f, b, c, d
+ mov \c, \f
+ xor \b, \f
+ and \d, \f
+ xor \c, \f
+.endm
+
+ // f = b ^ c ^ d
+.macro f2 f, b, c, d
+ mov \c, \f
+ xor \d, \f
+ xor \b, \f
+.endm
+
+ // f = c ^ (b | ~d)
+.macro f3 f, b, c, d
+ mov $-1, \f
+ xor \d, \f // ~d
+ or \b, \f // b | ~d
+ xor \c, \f // c ^ (b | ~d)
+.endm
+
+ // do 4 rounds; round j (0..3) reads message word (p*j + q) mod 16
+.macro rounds f, p, q, s0, s1, s2, s3, k0, k1, k2, k3
+ round %eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0
+ round %edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1
+ round %ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2
+ round %ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3
+.endm
+
+ // do 4 rounds with f0, f1, f2, f3; i is the index of the first round
+.macro rounds0 i, k0, k1, k2, k3
+ rounds f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3 // g = i
+.endm
+
+.macro rounds1 i, k0, k1, k2, k3
+ rounds f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3 // g = 5i + 1
+.endm
+
+.macro rounds2 i, k0, k1, k2, k3
+ rounds f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3 // g = 3i + 5
+.endm
+
+.macro rounds3 i, k0, k1, k2, k3
+ rounds f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3 // g = 7i
+.endm
+
+ push %rbx // holds b
+ push %rbp // holds the round function result
+ push %r12 // holds the end pointer
+
+ and $~63, %rdx // byte count of the whole blocks
+ lea (%rsi, %rdx, 1), %r12 // end pointer
+
+ mov (%rdi), %eax // a
+ mov 4(%rdi), %ebx // b
+ mov 8(%rdi), %ecx // c
+ mov 12(%rdi), %edx // d
+
+ cmp %rsi, %r12 // any data to process?
+ je .Lend
+
+ .balign 16
+.Lloop: mov %eax, %r8d // stash the state from before this block
+ mov %ebx, %r9d
+ mov %ecx, %r10d
+ mov %edx, %r11d
+
+ allrounds rounds0, rounds1, rounds2, rounds3
+
+ add %r8d, %eax // add the stashed state to the result
+ add %r9d, %ebx
+ add %r10d, %ecx
+ add %r11d, %edx
+
+ add $64, %rsi // advance to the next block
+ cmp %rsi, %r12
+ jne .Lloop
+
+ mov %eax, (%rdi) // write the state back once, after the last block
+ mov %ebx, 4(%rdi)
+ mov %ecx, 8(%rdi)
+ mov %edx, 12(%rdi)
+
+.Lend: pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+END(_libmd_md5block_baseline)
+
+ /*
+ * An implementation leveraging the ANDN instruction
+ * from BMI1 to shorten some dependency chains.
+ */
+ENTRY(_libmd_md5block_bmi1)
+ // special-cased round 1; uses %edi and %ebp as scratch
+ // f1 = d ? b : c = (d & b) + (~d & c)
+.macro round1 a, b, c, d, k, m, s
+ andn \c, \d, %edi // ~d & c
+ add $\k, \a // a + k[i]
+ mov \d, %ebp
+ add ((\m)%16*4)(%rsi), \a // a + k[i] + m[g]
+ and \b, %ebp // d & b
+ add %edi, \a // a + k[i] + m[g] + (~d & c)
+ add %ebp, \a // a + k[i] + m[g] + (~d & c) + (d & b)
+ rol $\s, \a
+ add \b, \a
+.endm
+
+ // special-cased round 3; the -1 is folded into the key
+ // f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d)
+.macro round3 a, b, c, d, k, m, s
+ andn \d, \b, %ebp // ~b & d
+ add $\k - 1, \a // a + k[i] - 1
+ add ((\m)%16*4)(%rsi), \a // a + k[i] - 1 + m[g]
+ xor \c, %ebp // c ^ ~b & d = -1 - f
+ sub %ebp, \a // a + k[i] + m[g] + f
+ rol $\s, \a
+ add \b, \a
+.endm
+
+ .purgem rounds1 // replace the generic version defined earlier
+.macro rounds1 i, k0, k1, k2, k3
+ round1 %eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1, 5
+ round1 %edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6, 9
+ round1 %ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14
+ round1 %ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20
+.endm
+
+ .purgem rounds3 // replace the generic version defined earlier
+.macro rounds3 i, k0, k1, k2, k3
+ round3 %eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0, 6
+ round3 %edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10
+ round3 %ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15
+ round3 %ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21
+.endm
+
+ push %rbx // holds b
+ push %rbp // scratch of the round macros
+ push %r12 // holds the end pointer
+
+ and $~63, %rdx // byte count of the whole blocks
+ lea (%rsi, %rdx, 1), %r12 // end pointer
+
+ mov (%rdi), %eax // a
+ mov 4(%rdi), %ebx // b
+ mov 8(%rdi), %ecx // c
+ mov 12(%rdi), %edx // d
+
+ cmp %rsi, %r12 // any data to process?
+ je 0f
+
+ push %rdi // round1 overwrites %edi, keep ctx on the stack
+
+ .balign 16
+1: mov %eax, %r8d // stash the state from before this block
+ mov %ebx, %r9d
+ mov %ecx, %r10d
+ mov %edx, %r11d
+
+ allrounds rounds0, rounds1, rounds2, rounds3
+
+ add %r8d, %eax // add the stashed state to the result
+ add %r9d, %ebx
+ add %r10d, %ecx
+ add %r11d, %edx
+
+ add $64, %rsi // advance to the next block
+ cmp %rsi, %r12
+ jne 1b
+
+ pop %rdi // ctx again
+ mov %eax, (%rdi)
+ mov %ebx, 4(%rdi)
+ mov %ecx, 8(%rdi)
+ mov %edx, 12(%rdi)
+
+0: pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+END(_libmd_md5block_bmi1)
+
+#ifndef _KERNEL
+ /*
+ * An implementation leveraging AVX-512 for its VPTERNLOGD
+ * instruction. We're using only XMM registers here,
+ * avoiding costly thermal licensing. State is lane 0 of %xmm0-%xmm3.
+ */
+ENTRY(_libmd_md5block_avx512)
+.macro vround a, b, c, d, f, i, m, mi, s
+ vmovdqa \b, %xmm4 // the destination doubles as first input
+ vpternlogd $\f, \d, \c, %xmm4 // round function given as truth table f
+ vpaddd 4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i]
+.if \mi != 0 // word 0 already sits in lane 0
+ vpshufd $0x55 * \mi, %xmm5, %xmm5 // broadcast to each dword
+.endif
+ vpaddd %xmm5, \a, \a // a + k[i] + m[g]
+ vpaddd %xmm4, \a, \a // a + k[i] + m[g] + f
+ vprold $\s, \a, \a // rotate left by s
+ vpaddd \b, \a, \a // a = b + rol(a + k[i] + m[g] + f, s)
+.endm
+
+.macro vrounds f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3
+ vround %xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0
+ vround %xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1
+ vround %xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2
+ vround %xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3
+.endm
+
+/* truth tables of the round functions; row n (from 0) is bit n of the immediate
+ * d c b f0 f1 f2 f3
+ * 0 0 0 0 0 0 1
+ * 1 0 0 1 0 1 0
+ * 0 1 0 0 1 1 0
+ * 1 1 0 1 0 0 1
+ * 0 0 1 0 0 1 1
+ * 1 0 1 0 1 0 1
+ * 0 1 1 1 1 0 0
+ * 1 1 1 1 1 1 0
+ */
+
+.macro vrounds0 i, m
+ vrounds 0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22
+.endm
+
+.macro vrounds1 i, m0, i0, m1, i1, m2, i2, m3, i3
+ vrounds 0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20
+.endm
+
+.macro vrounds2 i, m0, i0, m1, i1, m2, i2, m3, i3
+ vrounds 0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23
+.endm
+
+.macro vrounds3 i, m0, i0, m1, i1, m2, i2, m3, i3
+ vrounds 0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21
+.endm
+
+ and $~63, %rdx // byte count of the whole blocks
+ add %rsi, %rdx // end pointer
+
+ vmovd (%rdi), %xmm0 // a
+ vmovd 4(%rdi), %xmm1 // b
+ vmovd 8(%rdi), %xmm2 // c
+ vmovd 12(%rdi), %xmm3 // d
+
+ lea keys(%rip), %rax // base of the round key table
+
+ cmp %rsi, %rdx // any data to process?
+ je 0f
+
+ .balign 16
+1: vmovdqu 0*4(%rsi), %xmm8 // message words
+ vmovdqu 4*4(%rsi), %xmm9
+ vmovdqu 8*4(%rsi), %xmm10
+ vmovdqu 12*4(%rsi), %xmm11
+
+ vmovdqa %xmm0, %xmm12 // stash old state variables
+ vmovdqa %xmm1, %xmm13
+ vmovdqa %xmm2, %xmm14
+ vmovdqa %xmm3, %xmm15
+
+ vrounds0 0, %xmm8 // arguments: round index, then (register, lane) per round
+ vrounds0 4, %xmm9
+ vrounds0 8, %xmm10
+ vrounds0 12, %xmm11
+
+ vrounds1 16, %xmm8, 1, %xmm9, 2, %xmm10, 3, %xmm8, 0
+ vrounds1 20, %xmm9, 1, %xmm10, 2, %xmm11, 3, %xmm9, 0
+ vrounds1 24, %xmm10, 1, %xmm11, 2, %xmm8, 3, %xmm10, 0
+ vrounds1 28, %xmm11, 1, %xmm8, 2, %xmm9, 3, %xmm11, 0
+
+ vrounds2 32, %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2
+ vrounds2 36, %xmm8, 1, %xmm9, 0, %xmm9, 3, %xmm10, 2
+ vrounds2 40, %xmm11, 1, %xmm8, 0, %xmm8, 3, %xmm9, 2
+ vrounds2 44, %xmm10, 1, %xmm11, 0, %xmm11, 3, %xmm8, 2
+
+ vrounds3 48, %xmm8, 0, %xmm9, 3, %xmm11, 2, %xmm9, 1
+ vrounds3 52, %xmm11, 0, %xmm8, 3, %xmm10, 2, %xmm8, 1
+ vrounds3 56, %xmm10, 0, %xmm11, 3, %xmm9, 2, %xmm11, 1
+ vrounds3 60, %xmm9, 0, %xmm10, 3, %xmm8, 2, %xmm10, 1
+
+ vpaddd %xmm12, %xmm0, %xmm0 // add the stashed state to the result
+ vpaddd %xmm13, %xmm1, %xmm1
+ vpaddd %xmm14, %xmm2, %xmm2
+ vpaddd %xmm15, %xmm3, %xmm3
+
+ add $64, %rsi // advance to the next block
+ cmp %rsi, %rdx
+ jne 1b
+
+ vmovd %xmm0, (%rdi) // write the state back once, after the last block
+ vmovd %xmm1, 4(%rdi)
+ vmovd %xmm2, 8(%rdi)
+ vmovd %xmm3, 12(%rdi)
+
+0: ret
+END(_libmd_md5block_avx512)
+
+ // round keys, for use in md5block_avx512; 64 dwords in round order
+ .section .rodata
+ .balign 16
+
+.macro putkeys i, a, b, c, d
+ .4byte \a, \b, \c, \d // the round index i is not emitted
+.endm
+
+keys: allrounds putkeys, putkeys, putkeys, putkeys
+ .size keys, .-keys
+#endif /* !defined(_KERNEL) */
+
+ .section .note.GNU-stack,"",%progbits
diff --git a/lib/libmd/amd64/md5dispatch.c b/lib/libmd/amd64/md5dispatch.c
new file mode 100644
index 000000000000..dd2131c5a57c
--- /dev/null
+++ b/lib/libmd/amd64/md5dispatch.c
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <sys/types.h>
+#include <sys/md5.h>
+
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+#include <stdint.h>
+#include <string.h>
+#include <x86/ifunc.h>
+
+extern void _libmd_md5block_baseline(MD5_CTX *, const void *, size_t);
+extern void _libmd_md5block_bmi1(MD5_CTX *, const void *, size_t);
+extern void _libmd_md5block_avx512(MD5_CTX *, const void *, size_t);
+
+DEFINE_UIFUNC(, void, _libmd_md5block, (MD5_CTX *, const void *, size_t))
+{
+	const u_int avx512 = CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL;
+	u_int regs[4];
+	char cpu_vendor[12];
+
+	if ((cpu_stdext_feature & avx512) == avx512) {
+		/* assemble the vendor string from CPUID leaf 0 */
+		do_cpuid(0, regs);
+		((u_int *)&cpu_vendor)[0] = regs[1];
+		((u_int *)&cpu_vendor)[1] = regs[3];
+		((u_int *)&cpu_vendor)[2] = regs[2];
+
+		/* the AVX-512 kernel performs poorly on AMD */
+		if (memcmp(cpu_vendor, AMD_VENDOR_ID, sizeof(cpu_vendor)) != 0)
+			return (_libmd_md5block_avx512);
+	}
+
+	/* otherwise take the BMI1 kernel whenever the CPU reports BMI1 */
+	return ((cpu_stdext_feature & CPUID_STDEXT_BMI1) != 0 ?
+	    _libmd_md5block_bmi1 : _libmd_md5block_baseline);
+}
diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S
index f1291ef2647a..6ef083178abc 100644
--- a/lib/libmd/amd64/sha1block.S
+++ b/lib/libmd/amd64/sha1block.S
@@ -1,6 +1,6 @@
/*-
* Copyright (c) 2013 The Go Authors. All rights reserved.
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
*
* Adapted from Go's crypto/sha1/sha1block_amd64.s.
*
diff --git a/lib/libmd/amd64/sha1dispatch.c b/lib/libmd/amd64/sha1dispatch.c
index 86509195d56e..c82a60334739 100644
--- a/lib/libmd/amd64/sha1dispatch.c
+++ b/lib/libmd/amd64/sha1dispatch.c
@@ -1,6 +1,6 @@
/*-
* Copyright (c) 2016 The Go Authors. All rights reserved.
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
*
* Adapted from Go's crypto/sha1/sha1block_amd64.go.
*
diff --git a/lib/libmd/sha1c.c b/lib/libmd/sha1c.c
index 128e0b991742..02132d720dac 100644
--- a/lib/libmd/sha1c.c
+++ b/lib/libmd/sha1c.c
@@ -1,6 +1,6 @@
/*-
* Copyright (c) 2009 The Go Authors. All rights reserved.
- * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
*
* Adapted from Go's crypto/sha1/sha1.go.
*