cpu/x86/mp_init: Use clflush to write SIPI data back to RAM

Improve boot time performance by replacing the wbinvd instruction
with multiple clflush instructions to ensure that the SIPI data is
written back to RAM.

According to some experimental measurements, the wbinvd execution
takes between 1.6 and 6 milliseconds to complete. In the case of
the SIPI data, wbinvd unnecessarily flushes and invalidates the entire
cache. Indeed, the SIPI module is quite small (about 400 bytes) and
clflush'ing the associated cache lines is almost instantaneous,
typically less than 100 microseconds.

BUG=b/260455826
TEST=Successful boot on Skolas and Rex board

Change-Id: I0e00db8eaa6a3cb41bec3422572c8f2a9bec4057
Signed-off-by: Jeremy Compostella <jeremy.compostella@intel.com>
Suggested-by: Erin Park <erin.park@intel.com>
Reviewed-on: https://review.coreboot.org/c/coreboot/+/75391
Reviewed-by: Kyösti Mälkki <kyosti.malkki@gmail.com>
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
Reviewed-by: Arthur Heymans <arthur@aheymans.xyz>
diff --git a/src/cpu/x86/cache/cache.c b/src/cpu/x86/cache/cache.c
index d02d6d4..6413660 100644
--- a/src/cpu/x86/cache/cache.c
+++ b/src/cpu/x86/cache/cache.c
@@ -12,16 +12,11 @@
 	return (cpuid_edx(1) >> CPUID_FEATURE_CLFLUSH_BIT) & 1;
 }
 
-static void clflush_region(const uintptr_t start, const size_t size)
+void clflush_region(const uintptr_t start, const size_t size)
 {
 	uintptr_t addr;
 	const size_t cl_size = ((cpuid_ebx(1) >> 8) & 0xff) * 8;
 
-	if (!clflush_supported()) {
-		printk(BIOS_DEBUG, "Not flushing cache to RAM, CLFLUSH not supported\n");
-		return;
-	}
-
 	printk(BIOS_SPEW, "CLFLUSH [0x%lx, 0x%lx]\n", start, start + size);
 
 	for (addr = ALIGN_DOWN(start, cl_size); addr < start + size; addr += cl_size)
@@ -54,5 +49,8 @@
 	if (!cbmem_online())
 		return;
 
-	clflush_region(start, size);
+	if (clflush_supported())
+		clflush_region(start, size);
+	else
+		printk(BIOS_DEBUG, "Not flushing cache to RAM, CLFLUSH not supported\n");
 }
diff --git a/src/cpu/x86/mp_init.c b/src/cpu/x86/mp_init.c
index 28d6092..6809f81 100644
--- a/src/cpu/x86/mp_init.c
+++ b/src/cpu/x86/mp_init.c
@@ -364,6 +364,13 @@
 	ap_count = &sp->ap_count;
 	atomic_set(ap_count, 0);
 
+	/* Make sure SIPI data hits RAM so the APs that come up will see the
+	   startup code even if the caches are disabled. */
+	if (clflush_supported())
+		clflush_region((uintptr_t)mod_loc, module_size);
+	else
+		wbinvd();
+
 	return ap_count;
 }
 
@@ -626,10 +633,6 @@
 	if (ap_count == NULL)
 		return CB_ERR;
 
-	/* Make sure SIPI data hits RAM so the APs that come up will see
-	 * the startup code even if the caches are disabled.  */
-	wbinvd();
-
 	/* Start the APs providing number of APs and the cpus_entered field. */
 	global_num_aps = p->num_cpus - 1;
 	if (start_aps(cpu_bus, global_num_aps, ap_count) != CB_SUCCESS) {
diff --git a/src/include/cpu/x86/cache.h b/src/include/cpu/x86/cache.h
index 63703a7..d35c4e7 100644
--- a/src/include/cpu/x86/cache.h
+++ b/src/include/cpu/x86/cache.h
@@ -13,6 +13,7 @@
 #if !defined(__ASSEMBLER__)
 
 #include <stdbool.h>
+#include <stddef.h>
 
 static inline void wbinvd(void)
 {
@@ -30,6 +31,7 @@
 }
 
 bool clflush_supported(void);
+void clflush_region(const uintptr_t start, const size_t size);
 
 /* The following functions require the __always_inline due to AMD
  * function STOP_CAR_AND_CPU that disables cache as