commonlib: Add assembly optimization for ipchksum() on x86

This patch adds a bit of optimized assembly code to the ipchksum()
algorithm for x86 targets in order to take advantage of larger load
sizes and the add-with-carry instruction. The same assembly (with one
minor manual tweak) works for both 32-bit and 64-bit mode, with most of
the work being done by GCC, which automatically inserts `rax` or `eax`
in the inline assembly depending on the build target.

Change-Id: I484620dc14679ff5ca02b2ced2f84650730a6efc
Signed-off-by: Julius Werner <jwerner@chromium.org>
Reviewed-on: https://review.coreboot.org/c/coreboot/+/80255
Reviewed-by: Arthur Heymans <arthur@aheymans.xyz>
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
diff --git a/src/commonlib/bsd/ipchksum.c b/src/commonlib/bsd/ipchksum.c
index 89d261f..b7434e5 100644
--- a/src/commonlib/bsd/ipchksum.c
+++ b/src/commonlib/bsd/ipchksum.c
@@ -34,7 +34,30 @@
 		:: "cc"
 		);
 	}
-#endif
+#elif defined(__i386__) || defined(__x86_64__)
+	size_t size8 = size / 8;
+	const uint64_t *p8 = data;
+	i = size8 * 8;
+	asm (
+		"clc\n\t"
+		"1:\n\t"
+		"jecxz	2f\n\t"		/* technically RCX on 64, but not gonna be that big */
+		"adc	(%[p8]), %[wsum]\n\t"
+#if defined(__i386__)
+		"adc	4(%[p8]), %[wsum]\n\t"
+#endif	/* __i386__ */
+		"lea	-1(%[size8]), %[size8]\n\t"	/* Use LEA as a makeshift ADD that */
+		"lea	8(%[p8]), %[p8]\n\t"		/* doesn't modify the carry flag. */
+		"jmp	1b\n\t"
+		"2:\n\t"
+		"setc	%b[size8]\n\t"	/* reuse size register to save last carry */
+		"add	%[size8], %[wsum]\n\t"
+	: [wsum] "+r" (wide_sum),
+	  [p8] "+r" (p8),
+	  [size8] "+c" (size8)		/* put size in ECX so we can JECXZ */
+	:: "cc"
+	);
+#endif	/* __i386__ || __x86_64__ */
 
 	while (wide_sum) {
 		sum += wide_sum & 0xFFFF;