diff options
Diffstat (limited to 'lib/libmd')
-rw-r--r--  lib/libmd/Makefile                 |   9
-rw-r--r--  lib/libmd/aarch64/md5block.S       | 206
-rw-r--r--  lib/libmd/aarch64/sha1block.S      |   2
-rw-r--r--  lib/libmd/aarch64/sha1dispatch.c   |   2
-rw-r--r--  lib/libmd/amd64/md5block.S         | 363
-rw-r--r--  lib/libmd/amd64/md5dispatch.c      |  41
-rw-r--r--  lib/libmd/amd64/sha1block.S        |   2
-rw-r--r--  lib/libmd/amd64/sha1dispatch.c     |   2
-rw-r--r--  lib/libmd/sha1c.c                  |   2
9 files changed, 623 insertions, 6 deletions
| diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile index 547a134fc440..c4ab767c8b2f 100644 --- a/lib/libmd/Makefile +++ b/lib/libmd/Makefile @@ -108,7 +108,7 @@ CFLAGS+= -DWEAK_REFS  CFLAGS.skein_block.c+= -DSKEIN_LOOP=995  .PATH: ${.CURDIR}/${MACHINE_ARCH} ${SRCTOP}/sys/crypto/sha2  .PATH: ${SRCTOP}/sys/crypto/skein ${SRCTOP}/sys/crypto/skein/${MACHINE_ARCH} -.PATH: ${SRCTOP}/sys/kern +.PATH: ${SRCTOP}/sys/crypto  USE_ASM_SOURCES?=1  .if defined(BOOTSTRAPPING) || ${MK_MACHDEP_OPTIMIZATIONS} == no @@ -117,6 +117,13 @@ USE_ASM_SOURCES:=0  .endif  .if ${USE_ASM_SOURCES} != 0 +.if exists(${MACHINE_ARCH}/md5block.S) +SRCS+=	md5block.S +CFLAGS+= -DMD5_ASM +.if exists(${MACHINE_ARCH}/md5dispatch.c) +SRCS+=  md5dispatch.c +.endif +.endif  .if exists(${MACHINE_ARCH}/sha1block.S)  SRCS+=	sha1block.S  CFLAGS+= -DSHA1_ASM diff --git a/lib/libmd/aarch64/md5block.S b/lib/libmd/aarch64/md5block.S new file mode 100644 index 000000000000..b928c8dd795a --- /dev/null +++ b/lib/libmd/aarch64/md5block.S @@ -0,0 +1,206 @@ +/*- + * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <sys/elf_common.h> +#include <machine/asm.h> + +# optimal instruction sequence for k = \key + \m +.macro	addkm	key, m +.if 0x100000000 - \key > 0x00ffffff +	movz	k, #\key & 0xffff +	movk	k, #\key >> 16, lsl #16 +	add	k, k, \m +.elseif 0x100000000 - \key > 0x0000ffff +	sub	k, \m, #(0x100000000 - \key) & 0xfff000 +	sub	k, k, #(0x100000000 - \key) & 0xfff +.else +	movz	k, #0x100000000 - \key +	sub	k, \m, k +.endif +.endm + +.macro	round	a, b, c, d, f, key, m, s +	\f	f, \b, \c, \d +	addkm	\key, \m		// k[i] + m[g] +	add	\a, \a, k		// k[i] + m[g] + a +	add	\a, \a, f		// k[i] + m[g] + a + f +	ror	\a, \a, #32-\s +	add	\a, \a, \b +.endm + +	/* f = b ? c : d */ +.macro	f0	f, b, c, d +	eor	\f, \c, \d +	and	\f, \f, \b +	eor	\f, \f, \d +.endm + +	/* +	 * special cased round 1 function +	 * f1 = d ? 
b : c = (d & b) + (~d & c) +	 */ +.macro	round1	a, b, c, d, key, m, s +	bic	tmp, \c, \d		// ~d & c +	addkm	\key, \m		// k[i] + m[g] +	add	\a, \a, k		// k[i] + m[g] + a +	and	f, \b, \d		// d & b +	add	\a, \a, tmp		// k[i] + m[g] + a + (~d & c) +	add	\a, \a, f		// k[i] + m[g] + a + (~d & c) + (d & b) +	ror	\a, \a, #32-\s +	add	\a, \a, \b +.endm + +	/* f = b ^ c ^ d */ +.macro	f2	f, b, c, d +	eor	\f, \c, \d +	eor	\f, \f, \b +.endm + +	/* f = c ^ (b | ~d) */ +.macro	f3	f, b, c, d +	orn	\f, \b, \d +	eor	\f, \f, \c +.endm + +	/* do 4 rounds */ +.macro	rounds	f, m0, m1, m2, m3, s0, s1, s2, s3, k0, k1, k2, k3 +	round	a, b, c, d, \f, \k0, \m0, \s0 +	round	d, a, b, c, \f, \k1, \m1, \s1 +	round	c, d, a, b, \f, \k2, \m2, \s2 +	round	b, c, d, a, \f, \k3, \m3, \s3 +.endm + +	/* do 4 rounds with f0, f1, f2, f3 */ +.macro	rounds0	m0, m1, m2, m3, k0, k1, k2, k3 +	rounds	f0, \m0, \m1, \m2, \m3, 7, 12, 17, 22, \k0, \k1, \k2, \k3 +.endm + +.macro	rounds1	m0, m1, m2, m3, k0, k1, k2, k3 +	round1	a, b, c, d, \k0, \m0,  5 +	round1	d, a, b, c, \k1, \m1,  9 +	round1	c, d, a, b, \k2, \m2, 14 +	round1	b, c, d, a, \k3, \m3, 20 +.endm + +.macro	rounds2	m0, m1, m2, m3, k0, k1, k2, k3 +	rounds	f2, \m0, \m1, \m2, \m3, 4, 11, 16, 23, \k0, \k1, \k2, \k3 +.endm + +.macro	rounds3	m0, m1, m2, m3, k0, k1, k2, k3 +	rounds	f3, \m0, \m1, \m2, \m3, 6, 10, 15, 21, \k0, \k1, \k2, \k3 +.endm + +	/* md5block(MD5_CTX, buf, len) */ +ENTRY(_libmd_md5block) +ctx	.req	x0 +buf	.req	x1 +len	.req	x2 +end	.req	x2			// aliases len +a	.req	w3 +b	.req	w4 +c	.req	w5 +d	.req	w6 +f	.req	w7 +tmp	.req	w8 +k	.req	w9 +m0	.req	w10 +m1	.req	w11 +m2	.req	w12 +m3	.req	w13 +m4	.req	w14 +m5	.req	w15 +m6	.req	w16 +m7	.req	w17 +					// x18 is the platform register +m8	.req	w19 +m9	.req	w20 +m10	.req	w21 +m11	.req	w22 +m12	.req	w23 +m13	.req	w24 +m14	.req	w25 +m15	.req	w26 + +a_	.req	m0 +b_	.req	m7 +c_	.req	m14 +d_	.req	m5 + +	stp	x19, x20, [sp, #-0x40]! 
+	stp	x21, x22, [sp, #0x10] +	stp	x23, x24, [sp, #0x20] +	stp	x25, x26, [sp, #0x30] + +	bics	len, len, #63		// length in blocks +	add	end, buf, len		// end pointer + +	beq	.Lend			// was len == 0 after BICS? + +	ldp	a, b, [ctx, #0] +	ldp	c, d, [ctx, #8] + +	/* first eight rounds interleaved with data loads */ +.Lloop:	ldp	m0, m1, [buf, #0] +	round	a, b, c, d, f0, 0xd76aa478, m0,  7 +	ldp	m2, m3, [buf, #8] +	round	d, a, b, c, f0, 0xe8c7b756, m1, 12 +	ldp	m4, m5, [buf, #16] +	round	c, d, a, b, f0, 0x242070db, m2, 17 +	ldp	m6, m7, [buf, #24] +	round	b, c, d, a, f0, 0xc1bdceee, m3, 22 + +	ldp	m8, m9, [buf, #32] +	round	a, b, c, d, f0, 0xf57c0faf, m4,  7 +	ldp	m10, m11, [buf, #40] +	round	d, a, b, c, f0, 0x4787c62a, m5, 12 +	ldp	m12, m13, [buf, #48] +	round	c, d, a, b, f0, 0xa8304613, m6, 17 +	ldp	m14, m15, [buf, #56] +	round	b, c, d, a, f0, 0xfd469501, m7, 22 + +	/* remaining rounds use the roundsX macros */ +	rounds0	 m8,  m9, m10, m11, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be +	rounds0	m12, m13, m14, m15, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 + +	rounds1	 m1,  m6, m11,  m0, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa +	rounds1	 m5, m10, m15,  m4, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 +	rounds1	 m9, m14,  m3,  m8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed +	rounds1	m13,  m2,  m7, m12, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a + +	rounds2	 m5,  m8, m11, m14, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c +	rounds2	 m1,  m4,  m7, m10, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 +	rounds2	m13,  m0,  m3,  m6, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 +	rounds2	 m9, m12, m15,  m2, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 + +	rounds3	 m0,  m7, m14,  m5, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 +	rounds3	m12,  m3, m10,  m1, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 +	rounds3	 m8, m15,  m6, m13, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 +	rounds3	 m4, m11,  m2,  m9, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 
0xeb86d391 + +	ldp	a_, b_, [ctx, #0] +	ldp	c_, d_, [ctx, #8] +	add	a, a, a_ +	add	b, b, b_ +	add	c, c, c_ +	add	d, d, d_ +	stp	a, b, [ctx, #0] +	stp	c, d, [ctx, #8] + +	add	buf, buf, #64 +	cmp	buf, end +	bne	.Lloop + +.Lend:	ldp	x25, x26, [sp, #0x30] +	ldp	x23, x24, [sp, #0x20] +	ldp	x21, x22, [sp, #0x10] +	ldp	x19, x20, [sp], #0x40 + +	ret +END(_libmd_md5block) + +GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL) + +	.section .note.GNU-stack,"",%progbits diff --git a/lib/libmd/aarch64/sha1block.S b/lib/libmd/aarch64/sha1block.S index 56a0297efadd..e16fb36342fd 100644 --- a/lib/libmd/aarch64/sha1block.S +++ b/lib/libmd/aarch64/sha1block.S @@ -1,5 +1,5 @@  /*- - * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org> + * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>   *   * SPDX-License-Identifier: BSD-2-Clause   * diff --git a/lib/libmd/aarch64/sha1dispatch.c b/lib/libmd/aarch64/sha1dispatch.c index e34bf0a1a344..045527044320 100644 --- a/lib/libmd/aarch64/sha1dispatch.c +++ b/lib/libmd/aarch64/sha1dispatch.c @@ -1,5 +1,5 @@  /*- - * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org> + * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>   *   * SPDX-License-Identifier: BSD-2-Clause   */ diff --git a/lib/libmd/amd64/md5block.S b/lib/libmd/amd64/md5block.S new file mode 100644 index 000000000000..0dd594dd5dc2 --- /dev/null +++ b/lib/libmd/amd64/md5block.S @@ -0,0 +1,363 @@ +/*- + * Copyright (c) 2024, 2025 Robert Clausecker <fuz@FreeBSD.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <machine/asm.h> + +/* apply the round keys to the four round functions */ +.macro	allrounds	rfn0, rfn1, rfn2, rfn3 +	\rfn0	 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee +	\rfn0	 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 +	\rfn0	 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be +	\rfn0	12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 + +	\rfn1	16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa +	\rfn1	20, 
0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 +	\rfn1	24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed +	\rfn1	28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a + +	\rfn2	32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c +	\rfn2	36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 +	\rfn2	40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 +	\rfn2	44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 + +	\rfn3	48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 +	\rfn3	52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 +	\rfn3	56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 +	\rfn3	60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 +.endm + +	// md5block(MD5_CTX, buf, len) +ENTRY(_libmd_md5block_baseline) +.macro	round	a, b, c, d, f, k, m, s +	\f	%ebp, \b, \c, \d +	add	$\k, \a			// a + k[i] +	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g] +	add	%ebp, \a		// a + k[i] + m[g] + f +	rol	$\s, \a +	add	\b, \a +.endm + +	// f = b ? c : d +.macro	f0	f, b, c, d +	mov	\c, \f +	xor	\d, \f +	and	\b, \f +	xor	\d, \f +.endm + +	// f = d ? 
b : c +.macro	f1	f, b, c, d +	mov	\c, \f +	xor	\b, \f +	and	\d, \f +	xor	\c, \f +.endm + +	// f = b ^ c ^ d +.macro	f2	f, b, c, d +	mov	\c, \f +	xor	\d, \f +	xor	\b, \f +.endm + +	// f = c ^ (b | ~d) +.macro	f3	f, b, c, d +	mov	$-1, \f +	xor	\d, \f +	or	\b, \f +	xor	\c, \f +.endm + +	// do 4 rounds +.macro	rounds	f, p, q, s0, s1, s2, s3, k0, k1, k2, k3 +	round	%eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0 +	round	%edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1 +	round	%ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2 +	round	%ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3 +.endm + +	// do 4 rounds with f0, f1, f2, f3 +.macro	rounds0	i, k0, k1, k2, k3 +	rounds	f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3 +.endm + +.macro	rounds1	i, k0, k1, k2, k3 +	rounds	f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3 +.endm + +.macro	rounds2	i, k0, k1, k2, k3 +	rounds	f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3 +.endm + +.macro	rounds3	i, k0, k1, k2, k3 +	rounds	f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3 +.endm + +	push	%rbx +	push	%rbp +	push	%r12 + +	and	$~63, %rdx		// length in blocks +	lea	(%rsi, %rdx, 1), %r12	// end pointer + +	mov	(%rdi), %eax		// a +	mov	4(%rdi), %ebx		// b +	mov	8(%rdi), %ecx		// c +	mov	12(%rdi), %edx		// d + +	cmp	%rsi, %r12		// any data to process? +	je	.Lend + +	.balign	16 +.Lloop:	mov	%eax, %r8d +	mov	%ebx, %r9d +	mov	%ecx, %r10d +	mov	%edx, %r11d + +	allrounds	rounds0, rounds1, rounds2, rounds3 + +	add	%r8d, %eax +	add	%r9d, %ebx +	add	%r10d, %ecx +	add	%r11d, %edx + +	add	$64, %rsi +	cmp	%rsi, %r12 +	jne	.Lloop + +	mov	%eax, (%rdi) +	mov	%ebx, 4(%rdi) +	mov	%ecx, 8(%rdi) +	mov	%edx, 12(%rdi) + +.Lend:	pop	%r12 +	pop	%rbp +	pop	%rbx +	ret +END(_libmd_md5block_baseline) + +	/* +	 * An implementation leveraging the ANDN instruction +	 * from BMI1 to shorten some dependency chains. +	 */ +ENTRY(_libmd_md5block_bmi1) +	// special-cased round 1 +	// f1 = d ? 
b : c = (d & b) + (~d & c) +.macro	round1	a, b, c, d, k, m, s +	andn	\c, \d, %edi		// ~d & c +	add	$\k, \a			// a + k[i] +	mov	\d, %ebp +	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g] +	and	\b, %ebp		// d & b +	add	%edi, \a		// a + k[i] + m[g] + (~d & c) +	add	%ebp, \a		// a + k[i] + m[g] + (~d & c) + (d & b) +	rol	$\s, \a +	add	\b, \a +.endm + +	// special-cased round 3 +	// f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d) +.macro	round3	a, b, c, d, k, m, s +	andn	\d, \b, %ebp +	add	$\k - 1, \a		// a + k[i] - 1 +	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g] +	xor	\c, %ebp +	sub	%ebp, \a		// a + k[i] + m[g] + f +	rol	$\s, \a +	add	\b, \a +.endm + +	.purgem	rounds1 +.macro	rounds1	i, k0, k1, k2, k3 +	round1	%eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1,  5 +	round1	%edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6,  9 +	round1	%ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14 +	round1	%ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20 +.endm + +	.purgem	rounds3 +.macro	rounds3	i, k0, k1, k2, k3 +	round3	%eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0,  6 +	round3	%edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10 +	round3	%ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15 +	round3	%ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21 +.endm + +	push	%rbx +	push	%rbp +	push	%r12 + +	and	$~63, %rdx		// length in blocks +	lea	(%rsi, %rdx, 1), %r12	// end pointer + +	mov	(%rdi), %eax		// a +	mov	4(%rdi), %ebx		// b +	mov	8(%rdi), %ecx		// c +	mov	12(%rdi), %edx		// d + +	cmp	%rsi, %r12		// any data to process? +	je	0f + +	push	%rdi + +	.balign	16 +1:	mov	%eax, %r8d +	mov	%ebx, %r9d +	mov	%ecx, %r10d +	mov	%edx, %r11d + +	allrounds	rounds0, rounds1, rounds2, rounds3 + +	add	%r8d, %eax +	add	%r9d, %ebx +	add	%r10d, %ecx +	add	%r11d, %edx + +	add	$64, %rsi +	cmp	%rsi, %r12 +	jne	1b + +	pop	%rdi +	mov	%eax, (%rdi) +	mov	%ebx, 4(%rdi) +	mov	%ecx, 8(%rdi) +	mov	%edx, 12(%rdi) + +0:	pop	%r12 +	pop	%rbp +	pop	%rbx +	ret +END(_libmd_md5block_bmi1) + +#ifndef _KERNEL +	/* +	 * An implementation leveraging AVX-512 for its VPTERNLOGD +	 * instruction.  
We're using only XMM registers here, +	 * avoiding costly thermal licensing. +	 */ +ENTRY(_libmd_md5block_avx512) +.macro	vround		a, b, c, d, f, i, m, mi, s +	vmovdqa		\b, %xmm4 +	vpternlogd	$\f, \d, \c, %xmm4 +	vpaddd		4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i] +.if	\mi != 0 +	vpshufd		$0x55 * \mi, %xmm5, %xmm5	// broadcast to each dword +.endif +	vpaddd		%xmm5, \a, \a		// a + k[i] + m[g] +	vpaddd		%xmm4, \a, \a		// a + k[i] + m[g] + f +	vprold		$\s, \a, \a +	vpaddd		\b, \a, \a +.endm + +.macro	vrounds		f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3 +	vround		%xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0 +	vround		%xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1 +	vround		%xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2 +	vround		%xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3 +.endm + +/* + * d c b f0 f1 f2 f3 + * 0 0 0  0  0  0  1 + * 1 0 0  1  0  1  0 + * 0 1 0  0  1  1  0 + * 1 1 0  1  0  0  1 + * 0 0 1  0  0  1  1 + * 1 0 1  0  1  0  1 + * 0 1 1  1  1  0  0 + * 1 1 1  1  1  1  0 + */ + +.macro	vrounds0	i, m +	vrounds		0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22 +.endm + +.macro	vrounds1	i, m0, i0, m1, i1, m2, i2, m3, i3 +	vrounds		0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20 +.endm + +.macro	vrounds2	i, m0, i0, m1, i1, m2, i2, m3, i3 +	vrounds		0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23 +.endm + +.macro	vrounds3	i, m0, i0, m1, i1, m2, i2, m3, i3 +	vrounds		0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21 +.endm + +	and		$~63, %rdx		// length in blocks +	add		%rsi, %rdx		// end pointer + +	vmovd		(%rdi), %xmm0		// a +	vmovd		4(%rdi), %xmm1		// b +	vmovd		8(%rdi), %xmm2		// c +	vmovd		12(%rdi), %xmm3		// d + +	lea		keys(%rip), %rax + +	cmp		%rsi, %rdx		// any data to process? 
+	je		0f + +	.balign		16 +1:	vmovdqu		0*4(%rsi), %xmm8	// message words +	vmovdqu		4*4(%rsi), %xmm9 +	vmovdqu		8*4(%rsi), %xmm10 +	vmovdqu		12*4(%rsi), %xmm11 + +	vmovdqa		%xmm0, %xmm12		// stash old state variables +	vmovdqa		%xmm1, %xmm13 +	vmovdqa		%xmm2, %xmm14 +	vmovdqa		%xmm3, %xmm15 + +	vrounds0	 0, %xmm8 +	vrounds0	 4, %xmm9 +	vrounds0	 8, %xmm10 +	vrounds0	12, %xmm11 + +	vrounds1	16,  %xmm8, 1,  %xmm9, 2, %xmm10, 3,  %xmm8, 0 +	vrounds1	20,  %xmm9, 1, %xmm10, 2, %xmm11, 3,  %xmm9, 0 +	vrounds1	24, %xmm10, 1, %xmm11, 2,  %xmm8, 3, %xmm10, 0 +	vrounds1	28, %xmm11, 1,  %xmm8, 2,  %xmm9, 3, %xmm11, 0 + +	vrounds2	32,  %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2 +	vrounds2	36,  %xmm8, 1,  %xmm9, 0,  %xmm9, 3, %xmm10, 2 +	vrounds2	40, %xmm11, 1,  %xmm8, 0,  %xmm8, 3,  %xmm9, 2 +	vrounds2	44, %xmm10, 1, %xmm11, 0, %xmm11, 3,  %xmm8, 2 + +	vrounds3	48,  %xmm8, 0,  %xmm9, 3, %xmm11, 2,  %xmm9, 1 +	vrounds3	52, %xmm11, 0,  %xmm8, 3, %xmm10, 2,  %xmm8, 1 +	vrounds3	56, %xmm10, 0, %xmm11, 3,  %xmm9, 2, %xmm11, 1 +	vrounds3	60,  %xmm9, 0, %xmm10, 3,  %xmm8, 2, %xmm10, 1 + +	vpaddd		%xmm12, %xmm0, %xmm0 +	vpaddd		%xmm13, %xmm1, %xmm1 +	vpaddd		%xmm14, %xmm2, %xmm2 +	vpaddd		%xmm15, %xmm3, %xmm3 + +	add		$64, %rsi +	cmp		%rsi, %rdx +	jne		1b + +	vmovd		%xmm0, (%rdi) +	vmovd		%xmm1, 4(%rdi) +	vmovd		%xmm2, 8(%rdi) +	vmovd		%xmm3, 12(%rdi) + +0:	ret +END(_libmd_md5block_avx512) + +	// round keys, for use in md5block_avx512 +	.section	.rodata +	.balign		16 + +.macro	putkeys		i, a, b, c, d +	.4byte		\a, \b, \c, \d +.endm + +keys:	allrounds	putkeys, putkeys, putkeys, putkeys +	.size		keys, .-keys +#endif /* !defined(_KERNEL) */ + +	.section .note.GNU-stack,"",%progbits diff --git a/lib/libmd/amd64/md5dispatch.c b/lib/libmd/amd64/md5dispatch.c new file mode 100644 index 000000000000..dd2131c5a57c --- /dev/null +++ b/lib/libmd/amd64/md5dispatch.c @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + 
+#include <sys/types.h> +#include <sys/md5.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> +#include <stdint.h> +#include <string.h> +#include <x86/ifunc.h> + +extern void _libmd_md5block_baseline(MD5_CTX *, const void *, size_t); +extern void _libmd_md5block_bmi1(MD5_CTX *, const void *, size_t); +extern void _libmd_md5block_avx512(MD5_CTX *, const void *, size_t); + +DEFINE_UIFUNC(, void, _libmd_md5block, (MD5_CTX *, const void *, size_t)) +{ +	if ((cpu_stdext_feature & (CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL)) +	    == (CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL)) { +		u_int regs[4]; +		char cpu_vendor[12]; + +		do_cpuid(0, regs); +		((u_int *)&cpu_vendor)[0] = regs[1]; +		((u_int *)&cpu_vendor)[1] = regs[3]; +		((u_int *)&cpu_vendor)[2] = regs[2]; + +		/* the AVX-512 kernel performs poorly on AMD */ +		if (memcmp(cpu_vendor, AMD_VENDOR_ID, sizeof(cpu_vendor)) != 0) +			return (_libmd_md5block_avx512); +	} + +	if (cpu_stdext_feature & CPUID_STDEXT_BMI1) +		return (_libmd_md5block_bmi1); +	else +		return (_libmd_md5block_baseline); +} diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S index f1291ef2647a..6ef083178abc 100644 --- a/lib/libmd/amd64/sha1block.S +++ b/lib/libmd/amd64/sha1block.S @@ -1,6 +1,6 @@  /*-   * Copyright (c) 2013 The Go Authors. All rights reserved. - * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org> + * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>   *   * Adapted from Go's crypto/sha1/sha1block_amd64.s.   * diff --git a/lib/libmd/amd64/sha1dispatch.c b/lib/libmd/amd64/sha1dispatch.c index 86509195d56e..c82a60334739 100644 --- a/lib/libmd/amd64/sha1dispatch.c +++ b/lib/libmd/amd64/sha1dispatch.c @@ -1,6 +1,6 @@  /*-   * Copyright (c) 2016 The Go Authors. All rights reserved. - * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org> + * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>   *   * Adapted from Go's crypto/sha1/sha1block_amd64.go.   
* diff --git a/lib/libmd/sha1c.c b/lib/libmd/sha1c.c index 128e0b991742..02132d720dac 100644 --- a/lib/libmd/sha1c.c +++ b/lib/libmd/sha1c.c @@ -1,6 +1,6 @@  /*-   * Copyright (c) 2009 The Go Authors. All rights reserved. - * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org> + * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>   *   * Adapted from Go's crypto/sha1/sha1.go.   * | 
