arch/x86/memmove: Add 64bit version

The 64bit version handles 64bit input variables (pointers and sizes)
properly.

TESTED: LZ4 decompression, which uses this code, works properly both in
qemu and on real hardware.
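
As a reference for the semantics memmove must preserve, a minimal
overlap test along these lines (illustrative, not part of this change)
exercises both copy directions:

  #include <assert.h>
  #include <string.h>

  int main(void)
  {
      char buf[16] = "0123456789abcde";
      /* dest > src with overlap: requires a backward copy */
      memmove(buf + 2, buf, 8);
      assert(buf[2] == '0' && buf[9] == '7');
      /* dest < src: forward copy */
      memmove(buf, buf + 2, 8);
      assert(buf[0] == '0' && buf[7] == '7');
      return 0;
  }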

Change-Id: Ib43ec19df97194d6b1c18bfacb5fe8211ba0ffe5
Signed-off-by: Arthur Heymans <arthur@aheymans.xyz>
Reviewed-on: https://review.coreboot.org/c/coreboot/+/69231
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
Reviewed-by: Angel Pons <th3fanbus@gmail.com>
diff --git a/src/arch/x86/Makefile.inc b/src/arch/x86/Makefile.inc
index 50c344c..d281037 100644
--- a/src/arch/x86/Makefile.inc
+++ b/src/arch/x86/Makefile.inc
@@ -85,7 +85,8 @@
 bootblock-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S
 bootblock-y += memcpy.c
 bootblock-y += memset.c
-bootblock-y += memmove.c
+bootblock-$(CONFIG_ARCH_BOOTBLOCK_X86_32) += memmove_32.c
+bootblock-$(CONFIG_ARCH_BOOTBLOCK_X86_64) += memmove_64.S
 bootblock-$(CONFIG_COLLECT_TIMESTAMPS_TSC) += timestamp.c
 bootblock-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
 bootblock-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
@@ -134,7 +135,8 @@
 verstage-y += cpu_common.c
 verstage-y += memset.c
 verstage-y += memcpy.c
-verstage-y += memmove.c
+verstage-$(CONFIG_ARCH_VERSTAGE_X86_32) += memmove_32.c
+verstage-$(CONFIG_ARCH_VERSTAGE_X86_64) += memmove_64.S
 verstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
 verstage-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
 # If verstage is a separate stage it means there's no need
@@ -172,7 +174,8 @@
 romstage-$(CONFIG_IDT_IN_EVERY_STAGE) += exception.c
 romstage-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S
 romstage-y += memcpy.c
-romstage-y += memmove.c
+romstage-$(CONFIG_ARCH_ROMSTAGE_X86_32) += memmove_32.c
+romstage-$(CONFIG_ARCH_ROMSTAGE_X86_64) += memmove_64.S
 romstage-y += memset.c
 romstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
 romstage-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
@@ -217,7 +220,8 @@
 postcar-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S
 postcar-y += exit_car.S
 postcar-y += memcpy.c
-postcar-y += memmove.c
+postcar-$(CONFIG_ARCH_POSTCAR_X86_32) += memmove_32.c
+postcar-$(CONFIG_ARCH_POSTCAR_X86_64) += memmove_64.S
 postcar-y += memset.c
 postcar-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
 postcar-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
@@ -261,7 +265,8 @@
 ramstage-y += idt.S
 ramstage-$(CONFIG_IOAPIC) += ioapic.c
 ramstage-y += memcpy.c
-ramstage-y += memmove.c
+ramstage-$(CONFIG_ARCH_RAMSTAGE_X86_32) += memmove_32.c
+ramstage-$(CONFIG_ARCH_RAMSTAGE_X86_64) += memmove_64.S
 ramstage-y += memset.c
 ramstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
 ramstage-$(CONFIG_GENERATE_MP_TABLE) += mpspec.c
@@ -278,11 +283,11 @@
 ramstage-$(CONFIG_HAVE_CF9_RESET) += cf9_reset.c
 
 rmodules_x86_32-y += memcpy.c
-rmodules_x86_32-y += memmove.c
+rmodules_x86_32-y += memmove_32.c
 rmodules_x86_32-y += memset.c
 
 rmodules_x86_64-y += memcpy.c
-rmodules_x86_64-y += memmove.c
+rmodules_x86_64-y += memmove_64.S
 rmodules_x86_64-y += memset.c
 
 ifeq ($(CONFIG_ARCH_RAMSTAGE_X86_32),y)
@@ -324,7 +329,8 @@
 smm-$(CONFIG_IDT_IN_EVERY_STAGE) += exception.c
 smm-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S
 smm-y += memcpy.c
-smm-y += memmove.c
+smm-$(CONFIG_ARCH_RAMSTAGE_X86_32) += memmove_32.c
+smm-$(CONFIG_ARCH_RAMSTAGE_X86_64) += memmove_64.S
 smm-y += memset.c
 smm-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c
 smm-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c
diff --git a/src/arch/x86/memmove.c b/src/arch/x86/memmove_32.c
similarity index 100%
rename from src/arch/x86/memmove.c
rename to src/arch/x86/memmove_32.c
diff --git a/src/arch/x86/memmove_64.S b/src/arch/x86/memmove_64.S
new file mode 100644
index 0000000..ebec8ee
--- /dev/null
+++ b/src/arch/x86/memmove_64.S
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/* This code originates from Linux 5.19, arch/x86/lib/memmove_64.S */
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
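+ *
+ * The registers follow the SysV AMD64 calling convention, so this
+ * symbol is directly callable from C as memmove(dest, src, count).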
+ */
+.global memmove
+memmove:
+
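+	/* memmove() returns dest */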
+	mov %rdi, %rax
+
+	/* Decide forward/backward copy mode */
+	cmp %rdi, %rsi
+	jge .Lmemmove_begin_forward
+	mov %rsi, %r8
+	add %rdx, %r8
+	cmp %rdi, %r8
+	jg 2f
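+	/*
+	 * src < dest, but src + count <= dest: the regions do not
+	 * overlap, so a forward copy is safe.
+	 */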
+
+	/* Unlike Linux, do not optimize for FSRM and ERMS */
+.Lmemmove_begin_forward:
+	cmp $0x20, %rdx
+	jb 1f
+
+	/*
+	 * The movsq instruction has a high startup latency, so small
+	 * sizes are handled with general-purpose registers instead.
+	 */
+	cmp $680, %rdx
+	jb 3f
+	/*
+	 * movsq is only good for the mutually aligned case; equal low
+	 * bytes of src and dest imply equal alignment.
+	 */
+
+	cmpb %dil, %sil
+	je 4f
+3:
+	sub $0x20, %rdx
+	/*
+	 * We gobble 32 bytes forward in each loop. mov and lea do not
+	 * touch the flags, so the jae below still tests the borrow from
+	 * the sub at the top of the loop.
+	 */
+5:
+	sub $0x20, %rdx
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r8
+	leaq 4*8(%rsi), %rsi
+
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, 2*8(%rdi)
+	movq %r8, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae 5b
+	addq $0x20, %rdx
+	jmp 1f
+	/*
+	 * Handle data forward by movsq. count need not be a multiple of
+	 * eight: the last qword is loaded before the rep movsq and
+	 * stored afterwards to cover the tail.
+	 */
+	.p2align 4
+4:
+	movq %rdx, %rcx
+	movq -8(%rsi, %rdx), %r11
+	lea -8(%rdi, %rdx), %r10
+	shrq $3, %rcx
+	rep movsq
+	movq %r11, (%r10)
+	jmp 13f
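+	/* Label retained from the Linux source; unreferenced here */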
+.Lmemmove_end_forward:
+
+	/*
+	 * Handle data backward by movsq. The first qword is loaded
+	 * before the rep movsq and stored last to cover the head when
+	 * count is not a multiple of eight.
+	 */
+	.p2align 4
+7:
+	movq %rdx, %rcx
+	movq (%rsi), %r11
+	movq %rdi, %r10
+	leaq -8(%rsi, %rdx), %rsi
+	leaq -8(%rdi, %rdx), %rdi
+	shrq $3, %rcx
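+	/* std makes rep movsq copy backward; cld restores the ABI default */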
+	std
+	rep movsq
+	cld
+	movq %r11, (%r10)
+	jmp 13f
+
+	/*
+	 * Prepare for the overlapping backward copy.
+	 */
+	.p2align 4
+2:
+	cmp $0x20, %rdx
+	jb 1f
+	cmp $680, %rdx
+	jb 6f
+	cmp %dil, %sil
+	je 7b
+6:
+	/*
+	 * Point src and dest at the tail of the region.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * We gobble 32 bytes backward in each loop.
+	 */
+8:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r11
+	movq -2*8(%rsi), %r10
+	movq -3*8(%rsi), %r9
+	movq -4*8(%rsi), %r8
+	leaq -4*8(%rsi), %rsi
+
+	movq %r11, -1*8(%rdi)
+	movq %r10, -2*8(%rdi)
+	movq %r9, -3*8(%rdi)
+	movq %r8, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae 8b
+	/*
+	 * Rewind src and dest to the head of the remaining bytes.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
+1:
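+	/*
+	 * Copy the remaining 0 to 31 bytes. Each case below loads all
+	 * of its data before storing any of it, so overlap is safe.
+	 */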
+	cmpq $16, %rdx
+	jb 9f
+	/*
+	 * Copy 16 to 31 bytes: two qwords from the head and two
+	 * overlapping qwords from the tail.
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq -2*8(%rsi, %rdx), %r9
+	movq -1*8(%rsi, %rdx), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, -2*8(%rdi, %rdx)
+	movq %r8, -1*8(%rdi, %rdx)
+	jmp 13f
+	.p2align 4
+9:
+	cmpq $8, %rdx
+	jb 10f
+	/*
+	 * Copy 8 to 15 bytes: one qword from the head and one
+	 * overlapping qword from the tail.
+	 */
+	movq 0*8(%rsi), %r11
+	movq -1*8(%rsi, %rdx), %r10
+	movq %r11, 0*8(%rdi)
+	movq %r10, -1*8(%rdi, %rdx)
+	jmp 13f
+10:
+	cmpq $4, %rdx
+	jb 11f
+	/*
+	 * Copy 4 to 7 bytes with two overlapping dwords.
+	 */
+	movl (%rsi), %r11d
+	movl -4(%rsi, %rdx), %r10d
+	movl %r11d, (%rdi)
+	movl %r10d, -4(%rdi, %rdx)
+	jmp 13f
+11:
+	cmp $2, %rdx
+	jb 12f
+	/*
+	 * Copy 2 to 3 bytes with two overlapping words.
+	 */
+	movw (%rsi), %r11w
+	movw -2(%rsi, %rdx), %r10w
+	movw %r11w, (%rdi)
+	movw %r10w, -2(%rdi, %rdx)
+	jmp 13f
+12:
+	cmp $1, %rdx
+	jb 13f
+	/*
+	 * Copy a single byte.
+	 */
+	movb (%rsi), %r11b
+	movb %r11b, (%rdi)
+13:
+	ret