cpu/intel/model_206ax: Use parallel MP init

This patch adds a few southbridge calls needed for parallel MP init.

Moves the smm_relocate() declaration to smm/gen1/smi.h, since the
function is now defined in smm/gen1/smmrelocate.c.

Tested on a ThinkPad X220: shaves off ~30ms on a 2-core, 4-thread CPU.

Change-Id: Ia1d547ed4a3cb6746a0222c3e54e94e5848b0dd7
Signed-off-by: Arthur Heymans <arthur@aheymans.xyz>
Reviewed-on: https://review.coreboot.org/c/25618
Reviewed-by: Angel Pons <th3fanbus@gmail.com>
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
diff --git a/src/cpu/intel/model_2065x/model_2065x.h b/src/cpu/intel/model_2065x/model_2065x.h
index 11d86cd..1e6a311 100644
--- a/src/cpu/intel/model_2065x/model_2065x.h
+++ b/src/cpu/intel/model_2065x/model_2065x.h
@@ -77,7 +77,6 @@
 /* Configure power limits for turbo mode */
 void set_power_limits(u8 power_limit_1_time);
 int cpu_config_tdp_levels(void);
-void smm_relocate(void);
 #endif
 
 #endif
diff --git a/src/cpu/intel/model_206ax/Kconfig b/src/cpu/intel/model_206ax/Kconfig
index b30cfa1..cb09d23 100644
--- a/src/cpu/intel/model_206ax/Kconfig
+++ b/src/cpu/intel/model_206ax/Kconfig
@@ -23,6 +23,7 @@
 	select TSC_SYNC_MFENCE
 	select CPU_INTEL_COMMON
 	select CACHE_RELOCATED_RAMSTAGE_OUTSIDE_CBMEM
+	select PARALLEL_MP
 
 config BOOTBLOCK_CPU_INIT
 	string
diff --git a/src/cpu/intel/model_206ax/model_206ax.h b/src/cpu/intel/model_206ax/model_206ax.h
index 2bf9d32..f4c5d93 100644
--- a/src/cpu/intel/model_206ax/model_206ax.h
+++ b/src/cpu/intel/model_206ax/model_206ax.h
@@ -22,6 +22,7 @@
 /* SandyBridge/IvyBridge bus clock is fixed at 100MHz */
 #define SANDYBRIDGE_BCLK		100
 
+#define CORE_THREAD_COUNT_MSR		0x35
 #define MSR_FEATURE_CONFIG		0x13c
 #define MSR_FLEX_RATIO			0x194
 #define  FLEX_RATIO_LOCK		(1 << 20)
@@ -109,7 +110,6 @@
 /* Configure power limits for turbo mode */
 void set_power_limits(u8 power_limit_1_time);
 int cpu_config_tdp_levels(void);
-void smm_relocate(void);
 #endif
 int get_platform_id(void);
 
diff --git a/src/cpu/intel/model_206ax/model_206ax_init.c b/src/cpu/intel/model_206ax/model_206ax_init.c
index 58aabdb..33ad1e7 100644
--- a/src/cpu/intel/model_206ax/model_206ax_init.c
+++ b/src/cpu/intel/model_206ax/model_206ax_init.c
@@ -15,6 +15,7 @@
  * GNU General Public License for more details.
  */
 
+#include <assert.h>
 #include <console/console.h>
 #include <device/device.h>
 #include <string.h>
@@ -24,6 +25,7 @@
 #include <cpu/x86/mtrr.h>
 #include <cpu/x86/msr.h>
 #include <cpu/x86/lapic.h>
+#include <cpu/x86/mp.h>
 #include <cpu/intel/microcode.h>
 #include <cpu/intel/speedstep.h>
 #include <cpu/intel/turbo.h>
@@ -426,83 +428,6 @@
 		wrmsr(IA32_MC0_STATUS + (i * 4), msr);
 }
 
-int cpu_get_apic_id_map(int *apic_id_map)
-{
-	struct cpuid_result result;
-	unsigned int threads_per_package, threads_per_core, i, shift = 0;
-
-	/* Logical processors (threads) per core */
-	result = cpuid_ext(0xb, 0);
-	threads_per_core = result.ebx & 0xffff;
-
-	/* Logical processors (threads) per package */
-	result = cpuid_ext(0xb, 1);
-	threads_per_package = result.ebx & 0xffff;
-
-	if (threads_per_core == 1)
-		shift++;
-
-	for (i = 0; i < threads_per_package && i < CONFIG_MAX_CPUS; i++)
-		apic_id_map[i] = i << shift;
-
-	return threads_per_package;
-}
-
-/*
- * Initialize any extra cores/threads in this package.
- */
-static void intel_cores_init(struct device *cpu)
-{
-	struct cpuid_result result;
-	unsigned int threads_per_package, threads_per_core, i;
-
-	/* Logical processors (threads) per core */
-	result = cpuid_ext(0xb, 0);
-	threads_per_core = result.ebx & 0xffff;
-
-	/* Logical processors (threads) per package */
-	result = cpuid_ext(0xb, 1);
-	threads_per_package = result.ebx & 0xffff;
-
-	/* Only initialize extra cores from BSP */
-	if (cpu->path.apic.apic_id)
-		return;
-
-	printk(BIOS_DEBUG, "CPU: %u has %u cores, %u threads per core\n",
-	       cpu->path.apic.apic_id, threads_per_package/threads_per_core,
-	       threads_per_core);
-
-	for (i = 1; i < threads_per_package; ++i) {
-		struct device_path cpu_path;
-		struct device *new;
-
-		/* Build the CPU device path */
-		cpu_path.type = DEVICE_PATH_APIC;
-		cpu_path.apic.apic_id =
-			cpu->path.apic.apic_id + i;
-
-		/* Update APIC ID if no hyperthreading */
-		if (threads_per_core == 1)
-			cpu_path.apic.apic_id <<= 1;
-
-		/* Allocate the new CPU device structure */
-		new = alloc_dev(cpu->bus, &cpu_path);
-		if (!new)
-			continue;
-
-		printk(BIOS_DEBUG, "CPU: %u has core %u\n",
-		       cpu->path.apic.apic_id,
-		       new->path.apic.apic_id);
-
-		/* Start the new CPU */
-		if (is_smp_boot() && !start_cpu(new)) {
-			/* Record the error in cpu? */
-			printk(BIOS_ERR, "CPU %u would not start!\n",
-			       new->path.apic.apic_id);
-		}
-	}
-}
-
 static void model_206ax_report(void)
 {
 	static const char *const mode[] = {"NOT ", ""};
@@ -536,18 +461,12 @@
 	/* Turn on caching if we haven't already */
 	x86_enable_cache();
 
-	intel_update_microcode_from_cbfs();
-
 	/* Clear out pending MCEs */
 	configure_mca();
 
 	/* Print infos */
 	model_206ax_report();
 
-	/* Setup MTRRs based on physical address size */
-	x86_setup_mtrrs_with_detect();
-	x86_mtrr_check();
-
 	/* Setup Page Attribute Tables (PAT) */
 	// TODO set up PAT
 
@@ -578,9 +497,75 @@
 
 	/* Enable Turbo */
 	enable_turbo();
+}
 
-	/* Start up extra cores */
-	intel_cores_init(cpu);
+/* MP initialization support. */
+static const void *microcode_patch;
+
+static void pre_mp_init(void)
+{
+	/* Set up MTRRs based on physical address size, then check them. */
+	x86_setup_mtrrs_with_detect();
+	x86_mtrr_check();
+}
+
+static int get_cpu_count(void)
+{
+	msr_t msr;
+	unsigned int num_threads, num_cores;
+
+	/* msr.lo: thread count in [15:0], core count in [31:16]. */
+	msr = rdmsr(CORE_THREAD_COUNT_MSR);
+	num_threads = (msr.lo >> 0) & 0xffff;
+	num_cores = (msr.lo >> 16) & 0xffff;
+	printk(BIOS_DEBUG, "CPU has %u cores, %u threads enabled.\n",
+	       num_cores, num_threads);
+
+	return num_threads;
+}
+
+static void get_microcode_info(const void **microcode, int *parallel)
+{
+	/* Remember the patch so it can be reloaded after SMM relocation. */
+	*microcode = microcode_patch = intel_microcode_find();
+	*parallel = 1;
+}
+
+static void per_cpu_smm_trigger(void)
+{
+	/* Relocate the SMM handler; smm_relocate() skips the BSP. */
+	smm_relocate();
+
+	/* Reload the cached patch: a 2nd load is needed after relocation. */
+	intel_microcode_load_unlocked(microcode_patch);
+}
+
+static void post_mp_init(void)
+{
+	/* Every CPU (the APs as well as the BSP) has been relocated by now,
+	 * so let SMIs start flowing. */
+	southbridge_smm_init();
+
+	/* Lock down the SMRAM space. */
+	smm_lock();
+}
+
+/* Callbacks passed to mp_init_with_smm() by bsp_init_and_start_aps(). */
+static const struct mp_ops mp_ops = {
+	.pre_mp_init = pre_mp_init,
+	.get_cpu_count = get_cpu_count,
+	.get_smm_info = smm_info,
+	.get_microcode_info = get_microcode_info,
+	.pre_mp_smm_init = smm_initialize,
+	.per_cpu_smm_trigger = per_cpu_smm_trigger,
+	.relocation_handler = smm_relocation_handler,
+	.post_mp_init = post_mp_init,
+};
+
+void bsp_init_and_start_aps(struct bus *cpu_bus)
+{
+	if (mp_init_with_smm(cpu_bus, &mp_ops) != 0)
+		printk(BIOS_ERR, "MP initialization failure.\n");
 }
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/smm/gen1/smi.h b/src/cpu/intel/smm/gen1/smi.h
index cc6811f..6d08bb3 100644
--- a/src/cpu/intel/smm/gen1/smi.h
+++ b/src/cpu/intel/smm/gen1/smi.h
@@ -11,6 +11,10 @@
  * GNU General Public License for more details.
  */
 
+#include <device/device.h>
+
+void bsp_init_and_start_aps(struct bus *cpu_bus);
+
 /* These helpers are for performing SMM relocation. */
 void southbridge_smm_init(void);
 void southbridge_trigger_smi(void);
@@ -21,3 +25,12 @@
 void northbridge_write_smram(u8 smram);
 
 bool cpu_has_alternative_smrr(void);
+
+/* parallel MP helper functions */
+void smm_info(uintptr_t *perm_smbase, size_t *perm_smsize,
+		size_t *smm_save_state_size);
+void smm_initialize(void);
+void southbridge_smm_clear_state(void);
+void smm_relocation_handler(int cpu, uintptr_t curr_smbase,
+			uintptr_t staggered_smbase);
+void smm_relocate(void);
diff --git a/src/cpu/intel/smm/gen1/smmrelocate.c b/src/cpu/intel/smm/gen1/smmrelocate.c
index 105b9a4..4b824a5 100644
--- a/src/cpu/intel/smm/gen1/smmrelocate.c
+++ b/src/cpu/intel/smm/gen1/smmrelocate.c
@@ -22,10 +22,12 @@
 #include <device/device.h>
 #include <device/pci.h>
 #include <cpu/x86/cache.h>
+#include <cpu/x86/mp.h>
 #include <cpu/x86/msr.h>
 #include <cpu/x86/mtrr.h>
 #include <cpu/x86/smm.h>
 #include <console/console.h>
+#include <smp/node.h>
 #include "smi.h"
 
 #define SMRR_SUPPORTED (1 << 11)
@@ -349,3 +351,82 @@
 
 	northbridge_write_smram(D_LCK | G_SMRAME | C_BASE_SEG);
 }
+
+void smm_info(uintptr_t *perm_smbase, size_t *perm_smsize,
+		size_t *smm_save_state_size)
+{
+	struct smm_relocation_params *params = &smm_reloc_params;
+
+	printk(BIOS_DEBUG, "Setting up SMI for CPU\n");
+	fill_in_relocation_params(params);
+	if (CONFIG_IED_REGION_SIZE != 0)
+		setup_ied_area(params);
+	/* Report the SMRAM base/size and save state size to the caller. */
+	*perm_smbase = params->smram_base;
+	*perm_smsize = params->smram_size;
+	*smm_save_state_size = sizeof(em64t101_smm_state_save_area_t);
+}
+
+void smm_initialize(void)
+{
+	/* Clear the SMM state in the southbridge. */
+	southbridge_smm_clear_state();
+
+	/*
+	 * Run the relocation handler on the BSP to check and set up
+	 * parallel SMM relocation.
+	 */
+	smm_initiate_relocation();
+}
+
+/* The relocation work is actually performed in SMM context, but the code
+ * resides in the ramstage module. Execution arrives here by trampolining
+ * from the default SMRAM entry point. */
+void smm_relocation_handler(int cpu, uintptr_t curr_smbase,
+				uintptr_t staggered_smbase)
+{
+	msr_t mtrr_cap;
+	struct smm_relocation_params *relo_params = &smm_reloc_params;
+	em64t101_smm_state_save_area_t *save_state;
+	u32 smbase = staggered_smbase;
+	u32 iedbase = relo_params->ied_base;
+
+	printk(BIOS_DEBUG, "In relocation handler: cpu %d\n", cpu);
+
+	/* Log the new bases; the save state map itself is updated below. */
+	if (CONFIG_IED_REGION_SIZE != 0)
+		printk(BIOS_DEBUG, "New SMBASE=0x%08x IEDBASE=0x%08x\n",
+		       smbase, iedbase);
+	else
+		printk(BIOS_DEBUG, "New SMBASE=0x%08x\n",
+		       smbase);
+
+	save_state = (void *)(curr_smbase + SMM_DEFAULT_SIZE -
+			sizeof(*save_state));
+	save_state->smbase = smbase;
+	save_state->iedbase = iedbase;
+
+	/* Write the SMRR MSRs only if supported and the mask is non-zero. */
+	mtrr_cap = rdmsr(MTRR_CAP_MSR);
+	if (mtrr_cap.lo & SMRR_SUPPORTED && relo_params->smrr_mask.lo != 0)
+		write_smrr(relo_params);
+}
+
+/*
+ * The default SMM entry can happen in parallel or serially. This
+ * implementation does not consult smm_save_state_in_msrs, so the BSP
+ * never initiates a second relocation from here.
+ * NOTE(review): the BSP relocation is presumably the one initiated by
+ * smm_initialize() -- confirm against the MP init call order.
+ */
+void smm_relocate(void)
+{
+	/*
+	 * Only CPUs for which boot_cpu() is false initiate a relocation
+	 * here; for APs, a relocation handler always needs to be run.
+	 * The BSP falls through without doing anything, so this function
+	 * never triggers a relocation for it.
+	 */
+	if (!boot_cpu())
+		smm_initiate_relocation();
+}
diff --git a/src/northbridge/intel/sandybridge/northbridge.c b/src/northbridge/intel/sandybridge/northbridge.c
index 4a8419a..0d644ca 100644
--- a/src/northbridge/intel/sandybridge/northbridge.c
+++ b/src/northbridge/intel/sandybridge/northbridge.c
@@ -530,7 +530,7 @@
 
 static void cpu_bus_init(struct device *dev)
 {
-	initialize_cpus(dev->link_list);
+	bsp_init_and_start_aps(dev->link_list);
 }
 
 static struct device_operations cpu_bus_ops = {
diff --git a/src/southbridge/intel/common/smi.c b/src/southbridge/intel/common/smi.c
index af9dd5d..40f5412 100644
--- a/src/southbridge/intel/common/smi.c
+++ b/src/southbridge/intel/common/smi.c
@@ -154,3 +154,29 @@
 		  "d" (APM_CNT)
 	);
 }
+
+void southbridge_smm_clear_state(void)
+{
+	u32 smi_en;
+
+	/* Log chipset events first; the status bits are cleared below. */
+	if (IS_ENABLED(CONFIG_ELOG))
+		pch_log_state();
+
+	printk(BIOS_DEBUG, "Initializing Southbridge SMI...\n");
+	printk(BIOS_SPEW, " ... pmbase = 0x%04x\n", get_pmbase());
+
+	smi_en = inl(get_pmbase() + SMI_EN);
+	if (smi_en & APMC_EN) {
+		printk(BIOS_INFO, "SMI# handler already enabled?\n");
+		return;
+	}
+
+	printk(BIOS_DEBUG, "\n");
+
+	/* Dump and clear the SMI, PM1, TCO and GPE0 status registers */
+	reset_smi_status();
+	reset_pm1_status();
+	reset_tco_status();
+	reset_gpe0_status();
+}