/*-
 * Copyright (c) 2024, 2025 Robert Clausecker
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <machine/asm.h>

/* apply the round keys to the four round functions */
.macro	allrounds rfn0, rfn1, rfn2, rfn3
	\rfn0	 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
	\rfn0	 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
	\rfn0	 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
	\rfn0	12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
	\rfn1	16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
	\rfn1	20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
	\rfn1	24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
	\rfn1	28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
	\rfn2	32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
	\rfn2	36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
	\rfn2	40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
	\rfn2	44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
	\rfn3	48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
	\rfn3	52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
	\rfn3	56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
	\rfn3	60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
.endm

// md5block(MD5_CTX, buf, len)
ENTRY(_libmd_md5block_baseline)
	.macro	round a, b, c, d, f, k, m, s
	\f	%ebp, \b, \c, \d
	add	$\k, \a			// a + k[i]
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	add	%ebp, \a		// a + k[i] + m[g] + f
	rol	$\s, \a
	add	\b, \a
	.endm

	// f = b ? c : d
	.macro	f0 f, b, c, d
	mov	\c, \f
	xor	\d, \f
	and	\b, \f
	xor	\d, \f
	.endm

	// f = d ? b : c
	.macro	f1 f, b, c, d
	mov	\c, \f
	xor	\b, \f
	and	\d, \f
	xor	\c, \f
	.endm

	// f = b ^ c ^ d
	.macro	f2 f, b, c, d
	mov	\c, \f
	xor	\d, \f
	xor	\b, \f
	.endm

	// f = c ^ (b | ~d)
	.macro	f3 f, b, c, d
	mov	$-1, \f
	xor	\d, \f
	or	\b, \f
	xor	\c, \f
	.endm

	// do 4 rounds
	.macro	rounds f, p, q, s0, s1, s2, s3, k0, k1, k2, k3
	round	%eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0
	round	%edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1
	round	%ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2
	round	%ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3
	.endm

	// do 4 rounds with f0, f1, f2, f3
	.macro	rounds0 i, k0, k1, k2, k3
	rounds	f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3
	.endm

	.macro	rounds1 i, k0, k1, k2, k3
	rounds	f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3
	.endm

	.macro	rounds2 i, k0, k1, k2, k3
	rounds	f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3
	.endm

	.macro	rounds3 i, k0, k1, k2, k3
	rounds	f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3
	.endm

	push	%rbx
	push	%rbp
	push	%r12

	and	$~63, %rdx		// length in blocks
	lea	(%rsi, %rdx, 1), %r12	// end pointer

	mov	(%rdi), %eax		// a
	mov	4(%rdi), %ebx		// b
	mov	8(%rdi), %ecx		// c
	mov	12(%rdi), %edx		// d

	cmp	%rsi, %r12		// any data to process?
	je	.Lend

	.balign	16
.Lloop:
	mov	%eax, %r8d
	mov	%ebx, %r9d
	mov	%ecx, %r10d
	mov	%edx, %r11d

	allrounds rounds0, rounds1, rounds2, rounds3

	add	%r8d, %eax
	add	%r9d, %ebx
	add	%r10d, %ecx
	add	%r11d, %edx

	add	$64, %rsi
	cmp	%rsi, %r12
	jne	.Lloop

	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)

.Lend:	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
END(_libmd_md5block_baseline)

/*
 * An implementation leveraging the ANDN instruction
 * from BMI1 to shorten some dependency chains.
 */
ENTRY(_libmd_md5block_bmi1)
	// special-cased round 1
	// f1 = d ? b : c = (d & b) + (~d & c)
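	//
	// In C terms, each round1 step below computes
	//
	//	a += (d & b) + (~d & c) + k[i] + m[g];
	//	a = a << s | a >> (32 - s);
	//	a += b;
	//
	// ANDN yields ~d & c in a single instruction, so the two halves
	// of f1 are computed independently instead of through the serial
	// mov/xor/and/xor chain the baseline f1 macro uses.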
	.macro	round1 a, b, c, d, k, m, s
	andn	\c, \d, %edi		// ~d & c
	add	$\k, \a			// a + k[i]
	mov	\d, %ebp
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	and	\b, %ebp		// d & b
	add	%edi, \a		// a + k[i] + m[g] + (~d & c)
	add	%ebp, \a		// a + k[i] + m[g] + (~d & c) + (d & b)
	rol	$\s, \a
	add	\b, \a
	.endm

	// special-cased round 3
	// f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d)
	.macro	round3 a, b, c, d, k, m, s
	andn	\d, \b, %ebp
	add	$\k - 1, \a		// a + k[i] - 1
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	xor	\c, %ebp
	sub	%ebp, \a		// a + k[i] + m[g] + f
	rol	$\s, \a
	add	\b, \a
	.endm

	.purgem	rounds1
	.macro	rounds1 i, k0, k1, k2, k3
	round1	%eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1, 5
	round1	%edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6, 9
	round1	%ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14
	round1	%ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20
	.endm

	.purgem	rounds3
	.macro	rounds3 i, k0, k1, k2, k3
	round3	%eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0, 6
	round3	%edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10
	round3	%ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15
	round3	%ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21
	.endm

	push	%rbx
	push	%rbp
	push	%r12

	and	$~63, %rdx		// length in blocks
	lea	(%rsi, %rdx, 1), %r12	// end pointer

	mov	(%rdi), %eax		// a
	mov	4(%rdi), %ebx		// b
	mov	8(%rdi), %ecx		// c
	mov	12(%rdi), %edx		// d

	cmp	%rsi, %r12		// any data to process?
	je	0f

	push	%rdi

	.balign	16
1:	mov	%eax, %r8d
	mov	%ebx, %r9d
	mov	%ecx, %r10d
	mov	%edx, %r11d

	allrounds rounds0, rounds1, rounds2, rounds3

	add	%r8d, %eax
	add	%r9d, %ebx
	add	%r10d, %ecx
	add	%r11d, %edx

	add	$64, %rsi
	cmp	%rsi, %r12
	jne	1b

	pop	%rdi
	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)

0:	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
END(_libmd_md5block_bmi1)

#ifndef _KERNEL
/*
 * An implementation leveraging AVX-512 for its VPTERNLOGD
 * instruction.  We're using only XMM registers here,
 * avoiding costly thermal licensing.
 */
ENTRY(_libmd_md5block_avx512)
	.macro	vround a, b, c, d, f, i, m, mi, s
	vmovdqa	\b, %xmm4
	vpternlogd $\f, \d, \c, %xmm4
	vpaddd	4*(\i)(%rax){1to4}, \m, %xmm5	// m[g] + k[i]
	.if	\mi != 0
	vpshufd	$0x55 * \mi, %xmm5, %xmm5	// broadcast to each dword
	.endif
	vpaddd	%xmm5, \a, \a			// a + k[i] + m[g]
	vpaddd	%xmm4, \a, \a			// a + k[i] + m[g] + f
	vprold	$\s, \a, \a
	vpaddd	\b, \a, \a
	.endm

	.macro	vrounds f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3
	vround	%xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0
	vround	%xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1
	vround	%xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2
	vround	%xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3
	.endm

/*
 * d c b	f0 f1 f2 f3
 * 0 0 0	 0  0  0  1
 * 1 0 0	 1  0  1  0
 * 0 1 0	 0  1  1  0
 * 1 1 0	 1  0  0  1
 * 0 0 1	 0  0  1  1
 * 1 0 1	 0  1  0  1
 * 0 1 1	 1  1  0  0
 * 1 1 1	 1  1  1  0
 */
	.macro	vrounds0 i, m
	vrounds	0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22
	.endm

	.macro	vrounds1 i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds	0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20
	.endm

	.macro	vrounds2 i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds	0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23
	.endm

	.macro	vrounds3 i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds	0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21
	.endm
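
	// The ternary-logic immediates above come straight from the truth
	// table: VPTERNLOGD indexes its immediate with the destination bit
	// (here b) as bit 2, the first source (c) as bit 1, and the second
	// source (d) as bit 0, so reading each column from the bottom row
	// up yields f0 = 0xca, f1 = 0xe4, f2 = 0x96, and f3 = 0x39.  A
	// single VPTERNLOGD thus replaces the three or four scalar
	// instructions each round function costs in the baseline code.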

	and	$~63, %rdx		// length in blocks
	add	%rsi, %rdx		// end pointer

	vmovd	(%rdi), %xmm0		// a
	vmovd	4(%rdi), %xmm1		// b
	vmovd	8(%rdi), %xmm2		// c
	vmovd	12(%rdi), %xmm3		// d

	lea	keys(%rip), %rax

	cmp	%rsi, %rdx		// any data to process?
	je	0f

	.balign	16
1:	vmovdqu	0*4(%rsi), %xmm8	// message words
	vmovdqu	4*4(%rsi), %xmm9
	vmovdqu	8*4(%rsi), %xmm10
	vmovdqu	12*4(%rsi), %xmm11

	vmovdqa	%xmm0, %xmm12		// stash old state variables
	vmovdqa	%xmm1, %xmm13
	vmovdqa	%xmm2, %xmm14
	vmovdqa	%xmm3, %xmm15

	vrounds0 0, %xmm8
	vrounds0 4, %xmm9
	vrounds0 8, %xmm10
	vrounds0 12, %xmm11

	vrounds1 16, %xmm8, 1, %xmm9, 2, %xmm10, 3, %xmm8, 0
	vrounds1 20, %xmm9, 1, %xmm10, 2, %xmm11, 3, %xmm9, 0
	vrounds1 24, %xmm10, 1, %xmm11, 2, %xmm8, 3, %xmm10, 0
	vrounds1 28, %xmm11, 1, %xmm8, 2, %xmm9, 3, %xmm11, 0

	vrounds2 32, %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2
	vrounds2 36, %xmm8, 1, %xmm9, 0, %xmm9, 3, %xmm10, 2
	vrounds2 40, %xmm11, 1, %xmm8, 0, %xmm8, 3, %xmm9, 2
	vrounds2 44, %xmm10, 1, %xmm11, 0, %xmm11, 3, %xmm8, 2

	vrounds3 48, %xmm8, 0, %xmm9, 3, %xmm11, 2, %xmm9, 1
	vrounds3 52, %xmm11, 0, %xmm8, 3, %xmm10, 2, %xmm8, 1
	vrounds3 56, %xmm10, 0, %xmm11, 3, %xmm9, 2, %xmm11, 1
	vrounds3 60, %xmm9, 0, %xmm10, 3, %xmm8, 2, %xmm10, 1

	vpaddd	%xmm12, %xmm0, %xmm0
	vpaddd	%xmm13, %xmm1, %xmm1
	vpaddd	%xmm14, %xmm2, %xmm2
	vpaddd	%xmm15, %xmm3, %xmm3

	add	$64, %rsi
	cmp	%rsi, %rdx
	jne	1b

	vmovd	%xmm0, (%rdi)
	vmovd	%xmm1, 4(%rdi)
	vmovd	%xmm2, 8(%rdi)
	vmovd	%xmm3, 12(%rdi)

0:	ret
END(_libmd_md5block_avx512)

// round keys, for use in md5block_avx512
	.section .rodata
	.balign	16

	.macro	putkeys i, a, b, c, d
	.4byte	\a, \b, \c, \d
	.endm

keys:	allrounds putkeys, putkeys, putkeys, putkeys
	.size	keys, .-keys
#endif /* !defined(_KERNEL) */

	.section .note.GNU-stack,"",%progbits