/*-
 * Copyright (c) 2024, 2025 Robert Clausecker <fuz@FreeBSD.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <machine/asm.h>

/* apply the round keys to the four round functions */
.macro	allrounds	rfn0, rfn1, rfn2, rfn3
	\rfn0	 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
	\rfn0	 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
	\rfn0	 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
	\rfn0	12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821

	\rfn1	16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
	\rfn1	20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
	\rfn1	24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
	\rfn1	28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a

	\rfn2	32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
	\rfn2	36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
	\rfn2	40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
	\rfn2	44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665

	\rfn3	48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
	\rfn3	52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
	\rfn3	56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
	\rfn3	60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
.endm

	// md5block(MD5_CTX, buf, len)
ENTRY(_libmd_md5block_baseline)
.macro	round	a, b, c, d, f, k, m, s
	\f	%ebp, \b, \c, \d
	add	$\k, \a			// a + k[i]
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	add	%ebp, \a		// a + k[i] + m[g] + f
	rol	$\s, \a
	add	\b, \a
.endm

	// f = b ? c : d
.macro	f0	f, b, c, d
	mov	\c, \f
	xor	\d, \f
	and	\b, \f
	xor	\d, \f
.endm

	// f = d ? b : c
.macro	f1	f, b, c, d
	mov	\c, \f
	xor	\b, \f
	and	\d, \f
	xor	\c, \f
.endm

	// f = b ^ c ^ d
.macro	f2	f, b, c, d
	mov	\c, \f
	xor	\d, \f
	xor	\b, \f
.endm

	// f = c ^ (b | ~d)
.macro	f3	f, b, c, d
	mov	$-1, \f
	xor	\d, \f
	or	\b, \f
	xor	\c, \f
.endm

	// do 4 rounds
.macro	rounds	f, p, q, s0, s1, s2, s3, k0, k1, k2, k3
	round	%eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0
	round	%edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1
	round	%ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2
	round	%ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3
.endm

	// do 4 rounds with f0, f1, f2, f3
.macro	rounds0	i, k0, k1, k2, k3
	rounds	f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3
.endm

.macro	rounds1	i, k0, k1, k2, k3
	rounds	f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3
.endm

.macro	rounds2	i, k0, k1, k2, k3
	rounds	f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3
.endm

.macro	rounds3	i, k0, k1, k2, k3
	rounds	f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3
.endm

	push	%rbx
	push	%rbp
	push	%r12

	and	$~63, %rdx		// length in blocks
	lea	(%rsi, %rdx, 1), %r12	// end pointer

	mov	(%rdi), %eax		// a
	mov	4(%rdi), %ebx		// b
	mov	8(%rdi), %ecx		// c
	mov	12(%rdi), %edx		// d

	cmp	%rsi, %r12		// any data to process?
	je	.Lend

	.balign	16
.Lloop:	mov	%eax, %r8d
	mov	%ebx, %r9d
	mov	%ecx, %r10d
	mov	%edx, %r11d

	allrounds	rounds0, rounds1, rounds2, rounds3

	add	%r8d, %eax
	add	%r9d, %ebx
	add	%r10d, %ecx
	add	%r11d, %edx

	add	$64, %rsi
	cmp	%rsi, %r12
	jne	.Lloop

	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)

.Lend:	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
END(_libmd_md5block_baseline)

	/*
	 * An implementation leveraging the ANDN instruction
	 * from BMI1 to shorten some dependency chains.
	 */
ENTRY(_libmd_md5block_bmi1)
	// special-cased round 1
	// f1 = d ? b : c = (d & b) + (~d & c)
.macro	round1	a, b, c, d, k, m, s
	andn	\c, \d, %edi		// ~d & c
	add	$\k, \a			// a + k[i]
	mov	\d, %ebp
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	and	\b, %ebp		// d & b
	add	%edi, \a		// a + k[i] + m[g] + (~d & c)
	add	%ebp, \a		// a + k[i] + m[g] + (~d & c) + (d & b)
	rol	$\s, \a
	add	\b, \a
.endm

	// special-cased round 3
	// f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d)
.macro	round3	a, b, c, d, k, m, s
	andn	\d, \b, %ebp
	add	$\k - 1, \a		// a + k[i] - 1
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	xor	\c, %ebp
	sub	%ebp, \a		// a + k[i] + m[g] + f
	rol	$\s, \a
	add	\b, \a
.endm

	.purgem	rounds1
.macro	rounds1	i, k0, k1, k2, k3
	round1	%eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1,  5
	round1	%edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6,  9
	round1	%ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14
	round1	%ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20
.endm

	.purgem	rounds3
.macro	rounds3	i, k0, k1, k2, k3
	round3	%eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0,  6
	round3	%edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10
	round3	%ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15
	round3	%ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21
.endm

	push	%rbx
	push	%rbp
	push	%r12

	and	$~63, %rdx		// length in blocks
	lea	(%rsi, %rdx, 1), %r12	// end pointer

	mov	(%rdi), %eax		// a
	mov	4(%rdi), %ebx		// b
	mov	8(%rdi), %ecx		// c
	mov	12(%rdi), %edx		// d

	cmp	%rsi, %r12		// any data to process?
	je	0f

	push	%rdi

	.balign	16
1:	mov	%eax, %r8d
	mov	%ebx, %r9d
	mov	%ecx, %r10d
	mov	%edx, %r11d

	allrounds	rounds0, rounds1, rounds2, rounds3

	add	%r8d, %eax
	add	%r9d, %ebx
	add	%r10d, %ecx
	add	%r11d, %edx

	add	$64, %rsi
	cmp	%rsi, %r12
	jne	1b

	pop	%rdi
	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)

0:	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
END(_libmd_md5block_bmi1)

#ifndef _KERNEL
	/*
	 * An implementation leveraging AVX-512 for its VPTERNLOGD
	 * instruction.  We're using only XMM registers here,
	 * avoiding costly thermal licensing.
	 */
ENTRY(_libmd_md5block_avx512)
.macro	vround		a, b, c, d, f, i, m, mi, s
	vmovdqa		\b, %xmm4
	vpternlogd	$\f, \d, \c, %xmm4
	vpaddd		4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i]
.if	\mi != 0
	vpshufd		$0x55 * \mi, %xmm5, %xmm5	// broadcast to each dword
.endif
	vpaddd		%xmm5, \a, \a		// a + k[i] + m[g]
	vpaddd		%xmm4, \a, \a		// a + k[i] + m[g] + f
	vprold		$\s, \a, \a
	vpaddd		\b, \a, \a
.endm

.macro	vrounds		f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3
	vround		%xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0
	vround		%xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1
	vround		%xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2
	vround		%xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3
.endm

/*
 * d c b f0 f1 f2 f3
 * 0 0 0  0  0  0  1
 * 1 0 0  1  0  1  0
 * 0 1 0  0  1  1  0
 * 1 1 0  1  0  0  1
 * 0 0 1  0  0  1  1
 * 1 0 1  0  1  0  1
 * 0 1 1  1  1  0  0
 * 1 1 1  1  1  1  0
 */

.macro	vrounds0	i, m
	vrounds		0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22
.endm

.macro	vrounds1	i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds		0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20
.endm

.macro	vrounds2	i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds		0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23
.endm

.macro	vrounds3	i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds		0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21
.endm

	and		$~63, %rdx		// length in blocks
	add		%rsi, %rdx		// end pointer

	vmovd		(%rdi), %xmm0		// a
	vmovd		4(%rdi), %xmm1		// b
	vmovd		8(%rdi), %xmm2		// c
	vmovd		12(%rdi), %xmm3		// d

	lea		keys(%rip), %rax

	cmp		%rsi, %rdx		// any data to process?
	je		0f

	.balign		16
1:	vmovdqu		0*4(%rsi), %xmm8	// message words
	vmovdqu		4*4(%rsi), %xmm9
	vmovdqu		8*4(%rsi), %xmm10
	vmovdqu		12*4(%rsi), %xmm11

	vmovdqa		%xmm0, %xmm12		// stash old state variables
	vmovdqa		%xmm1, %xmm13
	vmovdqa		%xmm2, %xmm14
	vmovdqa		%xmm3, %xmm15

	vrounds0	 0, %xmm8
	vrounds0	 4, %xmm9
	vrounds0	 8, %xmm10
	vrounds0	12, %xmm11

	vrounds1	16,  %xmm8, 1,  %xmm9, 2, %xmm10, 3,  %xmm8, 0
	vrounds1	20,  %xmm9, 1, %xmm10, 2, %xmm11, 3,  %xmm9, 0
	vrounds1	24, %xmm10, 1, %xmm11, 2,  %xmm8, 3, %xmm10, 0
	vrounds1	28, %xmm11, 1,  %xmm8, 2,  %xmm9, 3, %xmm11, 0

	vrounds2	32,  %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2
	vrounds2	36,  %xmm8, 1,  %xmm9, 0,  %xmm9, 3, %xmm10, 2
	vrounds2	40, %xmm11, 1,  %xmm8, 0,  %xmm8, 3,  %xmm9, 2
	vrounds2	44  %xmm10, 1, %xmm11, 0, %xmm11, 3,  %xmm8, 2

	vrounds3	48,  %xmm8, 0,  %xmm9, 3, %xmm11, 2,  %xmm9, 1
	vrounds3	52, %xmm11, 0,  %xmm8, 3, %xmm10, 2,  %xmm8, 1
	vrounds3	56, %xmm10, 0, %xmm11, 3,  %xmm9, 2, %xmm11, 1
	vrounds3	60,  %xmm9, 0, %xmm10, 3,  %xmm8, 2, %xmm10, 1

	vpaddd		%xmm12, %xmm0, %xmm0
	vpaddd		%xmm13, %xmm1, %xmm1
	vpaddd		%xmm14, %xmm2, %xmm2
	vpaddd		%xmm15, %xmm3, %xmm3

	add		$64, %rsi
	cmp		%rsi, %rdx
	jne		1b

	vmovd		%xmm0, (%rdi)
	vmovd		%xmm1, 4(%rdi)
	vmovd		%xmm2, 8(%rdi)
	vmovd		%xmm3, 12(%rdi)

0:	ret
END(_libmd_md5block_avx512)

	// round keys, for use in md5block_avx512
	.section	.rodata
	.balign		16

.macro	putkeys		i, a, b, c, d
	.4byte		\a, \b, \c, \d
.endm

keys:	allrounds	putkeys, putkeys, putkeys, putkeys
	.size		keys, .-keys
#endif /* !defined(_KERNEL) */

	.section .note.GNU-stack,"",%progbits