/*-
 * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <sys/elf_common.h>
#include <machine/asm.h>

# optimal instruction sequence for k = \key + \m
/*
 * addkm key, m
 *
 * Set k = key + m (mod 2^32) with a short instruction sequence chosen at
 * assembly time.  key is a 32 bit constant, m is a register.  Adding key
 * is the same as subtracting 2^32 - key, so the sequence is selected by
 * the magnitude of that difference:
 *
 *   above 0xffffff	materialize key with movz/movk, then add m
 *   above 0xffff	subtract the difference from m as two 12 bit
 *			immediates (bits 12-23 first, then bits 0-11)
 *   otherwise		load the difference with one movz and subtract
 *			it from m
 *
 * k is not a parameter: it must be a register alias that is defined at
 * the place where the macro is expanded.
 */
.macro	addkm	key, m
.if 0x100000000 - \key > 0x00ffffff
	movz	k, #\key & 0xffff		// k = low 16 bits of key
	movk	k, #\key >> 16, lsl #16		// fill in the high 16 bits
	add	k, k, \m			// k = key + m
.elseif 0x100000000 - \key > 0x0000ffff
	sub	k, \m, #(0x100000000 - \key) & 0xfff000	// bits 12-23
	sub	k, k, #(0x100000000 - \key) & 0xfff	// bits 0-11
.else
	movz	k, #0x100000000 - \key		// k = 2^32 - key, fits 16 bits
	sub	k, \m, k			// k = m - (2^32 - key)
.endif
.endm

/*
 * round a, b, c, d, f, key, m, s
 *
 * One step of the hash:  a = b + rol(a + F(b, c, d) + key + m, s)
 *
 *   a, b, c, d	state registers in the role they play in this step
 *   f		name of the macro that computes F; it is invoked as
 *		"\f f, b, c, d", where the bare f is the scratch
 *		register alias of the same name
 *   key	additive constant of this step
 *   m		register holding the message word of this step
 *   s		left-rotate amount
 *
 * Overwrites the scratch registers f and k.
 */
.macro	round	a, b, c, d, f, key, m, s
	\f	f, \b, \c, \d		// f = F(b, c, d)
	addkm	\key, \m		// k[i] + m[g]
	add	\a, \a, k		// k[i] + m[g] + a
	add	\a, \a, f		// k[i] + m[g] + a + f
	ror	\a, \a, #32-\s		// rotating right by 32 - s rotates left by s
	add	\a, \a, \b		// add b to the rotated sum
.endm

	/*
	 * f = b ? c : d
	 *
	 * Bitwise select: every bit of f is taken from c where the matching
	 * bit of b is set and from d where it is clear.  Evaluated as
	 * ((c ^ d) & b) ^ d.  The output register f is written before b and d
	 * are read, so it has to be distinct from both.
	 */
.macro	f0	f, b, c, d
	eor	\f, \c, \d		// c ^ d
	and	\f, \f, \b		// (c ^ d) & b
	eor	\f, \f, \d		// ((c ^ d) & b) ^ d
.endm

	/*
	 * special cased round 1 function
	 * f1 = d ? b : c = (d & b) + (~d & c)
	 *
	 * The two terms can never have a set bit in common (one is masked
	 * with d, the other with ~d), so adding them gives the same value
	 * as or-ing them.  This lets each term be added into a on its own
	 * instead of first combining them into f1.
	 *
	 * Computes  a = b + rol(a + f1(b, c, d) + key + m, s)  with the same
	 * parameters as the generic round macro, minus the function name.
	 * Overwrites the scratch registers tmp, f and k.
	 */
.macro	round1	a, b, c, d, key, m, s
	bic	tmp, \c, \d		// ~d & c
	addkm	\key, \m		// k[i] + m[g]
	add	\a, \a, k		// k[i] + m[g] + a
	and	f, \b, \d		// d & b
	add	\a, \a, tmp		// k[i] + m[g] + a + (~d & c)
	add	\a, \a, f		// k[i] + m[g] + a + (~d & c) + (d & b)
	ror	\a, \a, #32-\s		// rotating right by 32 - s rotates left by s
	add	\a, \a, \b		// add b to the rotated sum
.endm

	/*
	 * f = b ^ c ^ d
	 *
	 * The output register f is written before b is read, so it has to be
	 * distinct from b.
	 */
.macro	f2	f, b, c, d
	eor	\f, \c, \d		// c ^ d
	eor	\f, \f, \b		// c ^ d ^ b
.endm

	/*
	 * f = c ^ (b | ~d)
	 *
	 * The output register f is written before c is read, so it has to be
	 * distinct from c.
	 */
.macro	f3	f, b, c, d
	orn	\f, \b, \d		// b | ~d
	eor	\f, \f, \c		// (b | ~d) ^ c
.endm

	/*
	 * do 4 rounds
	 *
	 *   f		name of the round function macro passed on to round
	 *   m0..m3	registers holding the message words of the four steps
	 *   s0..s3	left-rotate amounts of the four steps
	 *   k0..k3	additive constants of the four steps
	 *
	 * The state registers are handed to round rotated by one position per
	 * step (a,b,c,d / d,a,b,c / c,d,a,b / b,c,d,a), so after four steps
	 * every register is back in its original role.
	 */
.macro	rounds	f, m0, m1, m2, m3, s0, s1, s2, s3, k0, k1, k2, k3
	round	a, b, c, d, \f, \k0, \m0, \s0
	round	d, a, b, c, \f, \k1, \m1, \s1
	round	c, d, a, b, \f, \k2, \m2, \s2
	round	b, c, d, a, \f, \k3, \m3, \s3
.endm

	/*
	 * roundsN: do 4 rounds with f0, f1, f2, f3 respectively.
	 *
	 * rounds0 performs four steps with round function f0 and the
	 * left-rotate amounts 7, 12, 17, 22.  m0..m3 name the registers
	 * holding the message words, k0..k3 are the additive constants.
	 * The state registers change roles by one position per step.
	 */
.macro	rounds0	m0, m1, m2, m3, k0, k1, k2, k3
	round	a, b, c, d, f0, \k0, \m0,  7
	round	d, a, b, c, f0, \k1, \m1, 12
	round	c, d, a, b, f0, \k2, \m2, 17
	round	b, c, d, a, f0, \k3, \m3, 22
.endm

/*
 * rounds1 m0, m1, m2, m3, k0, k1, k2, k3
 *
 * Four steps built on the special cased round1 macro, with the
 * left-rotate amounts 5, 9, 14, 20.  m0..m3 name the registers holding
 * the message words, k0..k3 are the additive constants.  The state
 * registers change roles by one position per step.
 */
.macro	rounds1	m0, m1, m2, m3, k0, k1, k2, k3
	round1	a, b, c, d, \k0, \m0,  5
	round1	d, a, b, c, \k1, \m1,  9
	round1	c, d, a, b, \k2, \m2, 14
	round1	b, c, d, a, \k3, \m3, 20
.endm

/*
 * rounds2 m0, m1, m2, m3, k0, k1, k2, k3
 *
 * Four steps with round function f2 and the left-rotate amounts
 * 4, 11, 16, 23.  m0..m3 name the registers holding the message words,
 * k0..k3 are the additive constants.  The state registers change roles
 * by one position per step.
 */
.macro	rounds2	m0, m1, m2, m3, k0, k1, k2, k3
	round	a, b, c, d, f2, \k0, \m0,  4
	round	d, a, b, c, f2, \k1, \m1, 11
	round	c, d, a, b, f2, \k2, \m2, 16
	round	b, c, d, a, f2, \k3, \m3, 23
.endm

/*
 * rounds3 m0, m1, m2, m3, k0, k1, k2, k3
 *
 * Four steps with round function f3 and the left-rotate amounts
 * 6, 10, 15, 21.  m0..m3 name the registers holding the message words,
 * k0..k3 are the additive constants.  The state registers change roles
 * by one position per step.
 */
.macro	rounds3	m0, m1, m2, m3, k0, k1, k2, k3
	round	a, b, c, d, f3, \k0, \m0,  6
	round	d, a, b, c, f3, \k1, \m1, 10
	round	c, d, a, b, f3, \k2, \m2, 15
	round	b, c, d, a, f3, \k3, \m3, 21
.endm

	/*
	 * md5block(MD5_CTX, buf, len)
	 *
	 * Process every complete 64 byte block of buf and accumulate the
	 * result into the four 32 bit state words held in the first 16 bytes
	 * of the context.
	 *
	 *   x0 (ctx)	context; the words at offsets 0, 4, 8 and 12 are read
	 *		and written as a, b, c, d
	 *   x1 (buf)	input data; advanced by 64 bytes per block
	 *   x2 (len)	input length in bytes; rounded down to a multiple of
	 *		64, a trailing partial block is not processed
	 *
	 * If len rounds down to zero the context is not accessed at all.
	 *
	 * The sixteen message words of a block are kept in registers for
	 * the whole block.  Eight of them live in w19-w26, so x19-x26 are
	 * saved in a 64 byte stack frame on entry and restored on exit.
	 * The message words are fetched with plain 32 bit loads.
	 * NOTE(review): this presumably relies on little-endian data
	 * accesses -- confirm.
	 */
ENTRY(_libmd_md5block)
	/* arguments */
ctx	.req	x0
buf	.req	x1
len	.req	x2
end	.req	x2			// aliases len
	/* hash state */
a	.req	w3
b	.req	w4
c	.req	w5
d	.req	w6
	/* scratch registers used by the round macros */
f	.req	w7
tmp	.req	w8
k	.req	w9
	/* message words of the current block */
m0	.req	w10
m1	.req	w11
m2	.req	w12
m3	.req	w13
m4	.req	w14
m5	.req	w15
m6	.req	w16
m7	.req	w17
					// x18 is the platform register
m8	.req	w19
m9	.req	w20
m10	.req	w21
m11	.req	w22
m12	.req	w23
m13	.req	w24
m14	.req	w25
m15	.req	w26

	/*
	 * Registers for the state reloaded from ctx at the end of a block.
	 * They share registers with message words, which are no longer
	 * needed once all rounds of the block are done.
	 */
a_	.req	m0
b_	.req	m7
c_	.req	m14
d_	.req	m5

	/* save the registers backing m8-m15 */
	stp	x19, x20, [sp, #-0x40]!
	stp	x21, x22, [sp, #0x10]
	stp	x23, x24, [sp, #0x20]
	stp	x25, x26, [sp, #0x30]

	ands	len, len, #~63		// len in bytes, rounded down to whole blocks
	add	end, buf, len		// end pointer

	/* ADD leaves the flags alone, so this still tests the ANDS result */
	beq	.Lend			// was len == 0 after ANDS?

	ldp	a, b, [ctx, #0]
	ldp	c, d, [ctx, #8]

	/* first eight rounds interleaved with data loads */
.Lloop:	ldp	m0, m1, [buf, #0]
	round	a, b, c, d, f0, 0xd76aa478, m0,  7
	ldp	m2, m3, [buf, #8]
	round	d, a, b, c, f0, 0xe8c7b756, m1, 12
	ldp	m4, m5, [buf, #16]
	round	c, d, a, b, f0, 0x242070db, m2, 17
	ldp	m6, m7, [buf, #24]
	round	b, c, d, a, f0, 0xc1bdceee, m3, 22

	ldp	m8, m9, [buf, #32]
	round	a, b, c, d, f0, 0xf57c0faf, m4,  7
	ldp	m10, m11, [buf, #40]
	round	d, a, b, c, f0, 0x4787c62a, m5, 12
	ldp	m12, m13, [buf, #48]
	round	c, d, a, b, f0, 0xa8304613, m6, 17
	ldp	m14, m15, [buf, #56]
	round	b, c, d, a, f0, 0xfd469501, m7, 22

	/* remaining rounds use the roundsX macros */
	rounds0	 m8,  m9, m10, m11, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
	rounds0	m12, m13, m14, m15, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821

	rounds1	 m1,  m6, m11,  m0, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
	rounds1	 m5, m10, m15,  m4, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
	rounds1	 m9, m14,  m3,  m8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
	rounds1	m13,  m2,  m7, m12, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a

	rounds2	 m5,  m8, m11, m14, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
	rounds2	 m1,  m4,  m7, m10, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
	rounds2	m13,  m0,  m3,  m6, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
	rounds2	 m9, m12, m15,  m2, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665

	rounds3	 m0,  m7, m14,  m5, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
	rounds3	m12,  m3, m10,  m1, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
	rounds3	 m8, m15,  m6, m13, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
	rounds3	 m4, m11,  m2,  m9, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391

	/*
	 * The context still holds the state from before this block.  Add it
	 * to the working state and store the sum back; a-d then carry the
	 * new state into the next iteration without a reload.
	 */
	ldp	a_, b_, [ctx, #0]
	ldp	c_, d_, [ctx, #8]
	add	a, a, a_
	add	b, b, b_
	add	c, c, c_
	add	d, d, d_
	stp	a, b, [ctx, #0]
	stp	c, d, [ctx, #8]

	/* end is a whole number of blocks past the initial buf */
	add	buf, buf, #64
	cmp	buf, end
	bne	.Lloop

	/* restore the saved registers and release the stack frame */
.Lend:	ldp	x25, x26, [sp, #0x30]
	ldp	x23, x24, [sp, #0x20]
	ldp	x21, x22, [sp, #0x10]
	ldp	x19, x20, [sp], #0x40

	ret
END(_libmd_md5block)

GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL)

	.section .note.GNU-stack,"",%progbits
