Use broadcast SIPI to start up siblings

The current code for initializing AP cpus has several shortcomings:

- it assumes APIC IDs are sequential
- it uses only the BSP for determining the AP count, which is wrong if
  there is more than one physical CPU and the CPUs are of different types

Note that the new code calls cpu->ops->init() in parallel, and therefore
some CPU code needs to be changed to address that. One example is old
Intel HT-enabled CPUs, which can't do microcode updates in parallel.

Change-Id: Ic48a1ebab6a7c52aa76765f497268af09fa38c25
Signed-off-by: Sven Schnelle <svens@stackframe.org>
Reviewed-on: http://review.coreboot.org/1139
Tested-by: build bot (Jenkins)
Reviewed-by: Ronald G. Minnich <rminnich@gmail.com>
diff --git a/src/arch/x86/Kconfig b/src/arch/x86/Kconfig
index 6d56ec6..4dfbe70 100644
--- a/src/arch/x86/Kconfig
+++ b/src/arch/x86/Kconfig
@@ -3,10 +3,6 @@
 # This is an SMP option. It relates to starting up APs.
 # It is usually set in mainboard/*/Kconfig.
 # TODO: Improve description.
-config AP_IN_SIPI_WAIT
-	bool
-	default n
-	depends on ARCH_X86
 
 # Aligns 16bit entry code in bootblock so that hyper-threading CPUs
 # can boot AP CPUs to enable their shared caches.
diff --git a/src/arch/x86/include/arch/cpu.h b/src/arch/x86/include/arch/cpu.h
index 0dc92fb..0fe5ea5 100644
--- a/src/arch/x86/include/arch/cpu.h
+++ b/src/arch/x86/include/arch/cpu.h
@@ -158,30 +158,6 @@
 struct device;
 struct cpu_driver *find_cpu_driver(struct device *cpu);
 
-struct cpu_info {
-	device_t cpu;
-	unsigned long index;
-};
-
-static inline struct cpu_info *cpu_info(void)
-{
-	struct cpu_info *ci;
-	__asm__("andl %%esp,%0; "
-		"orl  %2, %0 "
-		:"=r" (ci)
-		: "0" (~(CONFIG_STACK_SIZE - 1)),
-		"r" (CONFIG_STACK_SIZE - sizeof(struct cpu_info))
-	);
-	return ci;
-}
-
-static inline unsigned long cpu_index(void)
-{
-	struct cpu_info *ci;
-	ci = cpu_info();
-	return ci->index;
-}
-
 struct cpuinfo_x86 {
         uint8_t    x86;            /* CPU family */
         uint8_t    x86_vendor;     /* CPU vendor */
diff --git a/src/arch/x86/lib/cpu.c b/src/arch/x86/lib/cpu.c
index 98ede06..2d64be9 100644
--- a/src/arch/x86/lib/cpu.c
+++ b/src/arch/x86/lib/cpu.c
@@ -9,6 +9,7 @@
 #include <device/path.h>
 #include <device/device.h>
 #include <smp/spinlock.h>
+#include <cpu/x86/lapic.h>
 
 /* Standard macro to see if a specific flag is changeable */
 static inline int flag_is_changeable_p(uint32_t flag)
@@ -234,7 +235,11 @@
 	cpu->ops = driver ? driver->ops : NULL;
 }
 
-void cpu_initialize(void)
+#if CONFIG_SMP
+static spinlock_t start_cpu_lock = SPIN_LOCK_UNLOCKED;
+#endif
+
+void cpu_initialize(struct bus *cpu_bus, int index)
 {
 	/* Because we busy wait at the printk spinlock.
 	 * It is important to keep the number of printed messages
@@ -242,17 +247,22 @@
 	 * disabled.
 	 */
 	struct device *cpu;
-	struct cpu_info *info;
 	struct cpuinfo_x86 c;
+	struct device_path cpu_path;
+	unsigned char id = lapicid();
 
-	info = cpu_info();
+	cpu_path.type = DEVICE_PATH_APIC;
+	cpu_path.apic.apic_id = id;
+	cpu_path.apic.index = index;
 
-	printk(BIOS_INFO, "Initializing CPU #%ld\n", info->index);
-
-	cpu = info->cpu;
-	if (!cpu) {
-		die("CPU: missing cpu device structure");
-	}
+#if CONFIG_SMP
+	spin_lock(&start_cpu_lock);
+#endif
+	cpu = alloc_find_dev(cpu_bus, &cpu_path);
+#if CONFIG_SMP
+	spin_unlock(&start_cpu_lock);
+#endif
+	printk(BIOS_DEBUG, "Initializing CPU #%d\n", id);
 
 	/* Find what type of cpu we are dealing with */
 	identify_cpu(cpu);
@@ -276,7 +286,6 @@
 		printk(BIOS_DEBUG, "Using generic cpu ops (good)\n");
 	}
 
-
 	/* Initialize the cpu */
 	if (cpu->ops && cpu->ops->init) {
 		cpu->enabled = 1;
@@ -284,7 +293,7 @@
 		cpu->ops->init(cpu);
 	}
 
-	printk(BIOS_INFO, "CPU #%ld initialized\n", info->index);
+	printk(BIOS_INFO, "CPU #%d initialized\n", id);
 
 	return;
 }
diff --git a/src/cpu/intel/hyperthreading/intel_sibling.c b/src/cpu/intel/hyperthreading/intel_sibling.c
index b9a9ae7..8377cd0 100644
--- a/src/cpu/intel/hyperthreading/intel_sibling.c
+++ b/src/cpu/intel/hyperthreading/intel_sibling.c
@@ -7,13 +7,6 @@
 #include <smp/spinlock.h>
 #include <assert.h>
 
-#if !CONFIG_SERIAL_CPU_INIT
-#error Intel hyper-threading requires serialized cpu init
-#endif
-
-static int first_time = 1;
-static int disable_siblings = !CONFIG_LOGICAL_CPUS;
-
 /* Return true if running thread does not have the smallest lapic ID
  * within a CPU core.
  */
@@ -34,61 +27,3 @@
 	threads = (apic_ids / core_ids);
 	return !!(lapicid() & (threads-1));
 }
-
-void intel_sibling_init(device_t cpu)
-{
-	unsigned i, siblings;
-	struct cpuid_result result;
-
-	/* On the bootstrap processor see if I want sibling cpus enabled */
-	if (first_time) {
-		first_time = 0;
-		get_option(&disable_siblings, "hyper_threading");
-	}
-	result = cpuid(1);
-	/* Is hyperthreading supported */
-	if (!(result.edx & (1 << 28))) {
-		return;
-	}
-	/* See how many sibling cpus we have */
-	siblings = (result.ebx >> 16) & 0xff;
-	if (siblings < 1) {
-		siblings = 1;
-	}
-
-	printk(BIOS_DEBUG, "CPU: %u %d siblings\n",
-		cpu->path.apic.apic_id,
-		siblings);
-
-	/* See if I am a sibling cpu */
-	if (cpu->path.apic.apic_id & (siblings -1)) {
-		if (disable_siblings) {
-			cpu->enabled = 0;
-		}
-		return;
-	}
-
-	/* I am the primary cpu start up my siblings */
-	for(i = 1; i < siblings; i++) {
-		struct device_path cpu_path;
-		device_t new;
-		/* Build the cpu device path */
-		cpu_path.type = DEVICE_PATH_APIC;
-		cpu_path.apic.apic_id = cpu->path.apic.apic_id + i;
-
-
-		/* Allocate new cpu device structure iff sibling CPU
-		 * was not in static device tree.
-		 */
-		new = alloc_find_dev(cpu->bus, &cpu_path);
-
-		if (!new) {
-			continue;
-		}
-
-		printk(BIOS_DEBUG, "CPU: %u has sibling %u\n",
-			cpu->path.apic.apic_id,
-			new->path.apic.apic_id);
-	}
-}
-
diff --git a/src/cpu/intel/model_1067x/model_1067x_init.c b/src/cpu/intel/model_1067x/model_1067x_init.c
index c6d716d9..ddd1381 100644
--- a/src/cpu/intel/model_1067x/model_1067x_init.c
+++ b/src/cpu/intel/model_1067x/model_1067x_init.c
@@ -29,7 +29,6 @@
 #include <cpu/x86/lapic.h>
 #include <cpu/intel/microcode.h>
 #include <cpu/intel/speedstep.h>
-#include <cpu/intel/hyperthreading.h>
 #include <cpu/x86/cache.h>
 #include <cpu/x86/name.h>
 
@@ -221,9 +220,6 @@
 
 	/* PIC thermal sensor control */
 	configure_pic_thermal_sensors();
-
-	/* Start up my cpu siblings */
-	intel_sibling_init(cpu);
 }
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/model_106cx/Kconfig b/src/cpu/intel/model_106cx/Kconfig
index 09449cb..fe44024 100644
--- a/src/cpu/intel/model_106cx/Kconfig
+++ b/src/cpu/intel/model_106cx/Kconfig
@@ -3,4 +3,3 @@
 	select SMP
 	select SSE2
 	select UDELAY_LAPIC
-	select AP_IN_SIPI_WAIT
diff --git a/src/cpu/intel/model_106cx/model_106cx_init.c b/src/cpu/intel/model_106cx/model_106cx_init.c
index 4bf2924..8d2ef3d 100644
--- a/src/cpu/intel/model_106cx/model_106cx_init.c
+++ b/src/cpu/intel/model_106cx/model_106cx_init.c
@@ -27,7 +27,6 @@
 #include <cpu/x86/lapic.h>
 #include <cpu/intel/microcode.h>
 #include <cpu/intel/speedstep.h>
-#include <cpu/intel/hyperthreading.h>
 #include <cpu/x86/cache.h>
 #include <cpu/x86/name.h>
 #include <usbdebug.h>
@@ -178,9 +177,6 @@
 	configure_misc();
 
 	/* TODO: PIC thermal sensor control */
-
-	/* Start up my cpu siblings */
-	intel_sibling_init(cpu);
 }
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/model_206ax/Kconfig b/src/cpu/intel/model_206ax/Kconfig
index d2d4c73..e3e4360 100644
--- a/src/cpu/intel/model_206ax/Kconfig
+++ b/src/cpu/intel/model_206ax/Kconfig
@@ -12,7 +12,6 @@
 	select SSE2
 	select UDELAY_LAPIC
 	select SMM_TSEG
-	#select AP_IN_SIPI_WAIT
 
 config BOOTBLOCK_CPU_INIT
 	string
diff --git a/src/cpu/intel/model_206ax/acpi.c b/src/cpu/intel/model_206ax/acpi.c
index 6153174..dea23e7 100644
--- a/src/cpu/intel/model_206ax/acpi.c
+++ b/src/cpu/intel/model_206ax/acpi.c
@@ -26,6 +26,7 @@
 #include <arch/acpigen.h>
 #include <arch/cpu.h>
 #include <cpu/x86/msr.h>
+#include <cpu/x86/lapic.h>
 #include <cpu/intel/acpi.h>
 #include <cpu/intel/speedstep.h>
 #include <cpu/intel/turbo.h>
@@ -88,8 +89,8 @@
 
 static int generate_C_state_entries(void)
 {
-	struct cpu_info *info;
 	struct cpu_driver *cpu;
+	struct device *cpu_dev;
 	int len, lenif;
 	device_t lapic;
 	struct cpu_intel_model_206ax_config *conf = NULL;
@@ -103,10 +104,10 @@
 		return 0;
 
 	/* Find CPU map of supported C-states */
-	info = cpu_info();
-	if (!info)
+	cpu_dev = dev_find_lapic(lapicid());
+	if (!cpu_dev)
 		return 0;
-	cpu = find_cpu_driver(info->cpu);
+	cpu = find_cpu_driver(cpu_dev);
 	if (!cpu || !cpu->cstates)
 		return 0;
 
diff --git a/src/cpu/intel/model_206ax/model_206ax_init.c b/src/cpu/intel/model_206ax/model_206ax_init.c
index 874ce4d..d60c237 100644
--- a/src/cpu/intel/model_206ax/model_206ax_init.c
+++ b/src/cpu/intel/model_206ax/model_206ax_init.c
@@ -414,58 +414,6 @@
 static unsigned ehci_debug_addr;
 #endif
 
-/*
- * Initialize any extra cores/threads in this package.
- */
-static void intel_cores_init(device_t cpu)
-{
-	struct cpuid_result result;
-	unsigned cores, threads, i;
-
-	result = cpuid_ext(0xb, 0); /* Threads per core */
-	threads = result.ebx & 0xff;
-
-	result = cpuid_ext(0xb, 1); /* Cores per package */
-	cores = result.ebx & 0xff;
-
-	/* Only initialize extra cores from BSP */
-	if (cpu->path.apic.apic_id)
-		return;
-
-	printk(BIOS_DEBUG, "CPU: %u has %u cores %u threads\n",
-	       cpu->path.apic.apic_id, cores, threads);
-
-	for (i = 1; i < cores; ++i) {
-		struct device_path cpu_path;
-		device_t new;
-
-		/* Build the cpu device path */
-		cpu_path.type = DEVICE_PATH_APIC;
-		cpu_path.apic.apic_id =
-			cpu->path.apic.apic_id + i;
-
-		/* Update APIC ID if no hyperthreading */
-		if (threads == 1)
-			cpu_path.apic.apic_id <<= 1;
-
-		/* Allocate the new cpu device structure */
-		new = alloc_dev(cpu->bus, &cpu_path);
-		if (!new)
-			continue;
-
-		printk(BIOS_DEBUG, "CPU: %u has core %u\n",
-		       cpu->path.apic.apic_id,
-		       new->path.apic.apic_id);
-
-		/* Start the new cpu */
-		if (!start_cpu(new)) {
-			/* Record the error in cpu? */
-			printk(BIOS_ERR, "CPU %u would not start!\n",
-			       new->path.apic.apic_id);
-		}
-	}
-}
-
 static void model_206ax_init(device_t cpu)
 {
 	char processor_name[49];
@@ -528,9 +476,6 @@
 
 	/* Enable Turbo */
 	enable_turbo();
-
-	/* Start up extra cores */
-	intel_cores_init(cpu);
 }
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/model_6ex/Kconfig b/src/cpu/intel/model_6ex/Kconfig
index 31d24bd..c3faa39 100644
--- a/src/cpu/intel/model_6ex/Kconfig
+++ b/src/cpu/intel/model_6ex/Kconfig
@@ -3,4 +3,3 @@
 	select SMP
 	select SSE2
 	select UDELAY_LAPIC
-	select AP_IN_SIPI_WAIT
diff --git a/src/cpu/intel/model_6ex/model_6ex_init.c b/src/cpu/intel/model_6ex/model_6ex_init.c
index 1c8c72b..a0afd2e 100644
--- a/src/cpu/intel/model_6ex/model_6ex_init.c
+++ b/src/cpu/intel/model_6ex/model_6ex_init.c
@@ -205,9 +205,6 @@
 
 	/* PIC thermal sensor control */
 	configure_pic_thermal_sensors();
-
-	/* Start up my cpu siblings */
-	intel_sibling_init(cpu);
 }
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/model_6fx/Kconfig b/src/cpu/intel/model_6fx/Kconfig
index 851685c..065cdd9 100644
--- a/src/cpu/intel/model_6fx/Kconfig
+++ b/src/cpu/intel/model_6fx/Kconfig
@@ -3,4 +3,3 @@
 	select SMP
 	select SSE2
 	select UDELAY_LAPIC
-	select AP_IN_SIPI_WAIT
diff --git a/src/cpu/intel/model_6fx/model_6fx_init.c b/src/cpu/intel/model_6fx/model_6fx_init.c
index 106719e..c5d7a6b 100644
--- a/src/cpu/intel/model_6fx/model_6fx_init.c
+++ b/src/cpu/intel/model_6fx/model_6fx_init.c
@@ -243,9 +243,6 @@
 
 	/* PIC thermal sensor control */
 	configure_pic_thermal_sensors();
-
-	/* Start up my cpu siblings */
-	intel_sibling_init(cpu);
 }
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/model_f2x/model_f2x_init.c b/src/cpu/intel/model_f2x/model_f2x_init.c
index 8fd8abc..fa9e05f 100644
--- a/src/cpu/intel/model_f2x/model_f2x_init.c
+++ b/src/cpu/intel/model_f2x/model_f2x_init.c
@@ -60,9 +60,6 @@
 
 	/* Enable the local cpu apics */
 	setup_lapic();
-
-	/* Start up my cpu siblings */
-	intel_sibling_init(cpu);
 };
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/model_f3x/model_f3x_init.c b/src/cpu/intel/model_f3x/model_f3x_init.c
index 2504ba9..dd2a45f 100644
--- a/src/cpu/intel/model_f3x/model_f3x_init.c
+++ b/src/cpu/intel/model_f3x/model_f3x_init.c
@@ -43,9 +43,6 @@
 
 	/* Enable the local cpu apics */
 	setup_lapic();
-
-	/* Start up my cpu siblings */
-	intel_sibling_init(cpu);
 };
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/model_f4x/model_f4x_init.c b/src/cpu/intel/model_f4x/model_f4x_init.c
index f3f0b2a..af7d9d2 100644
--- a/src/cpu/intel/model_f4x/model_f4x_init.c
+++ b/src/cpu/intel/model_f4x/model_f4x_init.c
@@ -51,9 +51,6 @@
 
 	/* Enable the local cpu apics */
 	setup_lapic();
-
-	/* Start up my cpu siblings */
-	intel_sibling_init(cpu);
 };
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/socket_LGA771/Kconfig b/src/cpu/intel/socket_LGA771/Kconfig
index 62bd17b..f549210 100644
--- a/src/cpu/intel/socket_LGA771/Kconfig
+++ b/src/cpu/intel/socket_LGA771/Kconfig
@@ -3,4 +3,3 @@
         select CPU_INTEL_MODEL_6FX
 	select SSE2
 	select MMX
-	select AP_IN_SIPI_WAIT
diff --git a/src/cpu/x86/lapic/lapic_cpu_init.c b/src/cpu/x86/lapic/lapic_cpu_init.c
index e491d46..061d3d3 100644
--- a/src/cpu/x86/lapic/lapic_cpu_init.c
+++ b/src/cpu/x86/lapic/lapic_cpu_init.c
@@ -66,31 +66,28 @@
 	printk(BIOS_DEBUG, "start_eip=0x%08lx, offset=0x%08lx, code_size=0x%08lx\n", start_eip, ((unsigned long)_secondary_start - start_eip), code_size);
 }
 
-static int lapic_start_cpu(unsigned long apicid)
+static struct bus *current_cpu_bus;
+
+static int lapic_start_cpus(struct bus *cpu_bus)
 {
 	int timeout;
 	unsigned long send_status, accept_status, start_eip;
-	int j, num_starts, maxlvt;
+	int maxlvt;
 
 	/*
 	 * Starting actual IPI sequence...
 	 */
 
+	current_cpu_bus = cpu_bus;
+
 	printk(BIOS_SPEW, "Asserting INIT.\n");
 
-	/*
-	 * Turn INIT on target chip
-	 */
-	lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(apicid));
+	/* Send INIT SIPI to target chip */
+	lapic_write_around(LAPIC_ICR2, 0);
+	lapic_write_around(LAPIC_ICR, LAPIC_INT_ASSERT
+				| LAPIC_DM_INIT | LAPIC_DEST_ALLBUT);
 
-	/*
-	 * Send IPI
-	 */
-
-	lapic_write_around(LAPIC_ICR, LAPIC_INT_LEVELTRIG | LAPIC_INT_ASSERT
-				| LAPIC_DM_INIT);
-
-	printk(BIOS_SPEW, "Waiting for send to finish...\n");
+	printk(BIOS_DEBUG, "Waiting for send to finish...\n");
 	timeout = 0;
 	do {
 		printk(BIOS_SPEW, "+");
@@ -98,108 +95,67 @@
 		send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
 	} while (send_status && (timeout++ < 1000));
 	if (timeout >= 1000) {
-		printk(BIOS_ERR, "CPU %ld: First apic write timed out. Disabling\n",
-			 apicid);
+		printk(BIOS_DEBUG, "First apic write timed out. Disabling\n");
 		// too bad.
-		printk(BIOS_ERR, "ESR is 0x%lx\n", lapic_read(LAPIC_ESR));
+		printk(BIOS_DEBUG, "ESR is 0x%lx\n", lapic_read(LAPIC_ESR));
 		if (lapic_read(LAPIC_ESR)) {
-			printk(BIOS_ERR, "Try to reset ESR\n");
+			printk(BIOS_DEBUG, "Try to reset ESR\n");
 			lapic_write_around(LAPIC_ESR, 0);
-			printk(BIOS_ERR, "ESR is 0x%lx\n", lapic_read(LAPIC_ESR));
+			printk(BIOS_DEBUG, "ESR is 0x%lx\n", lapic_read(LAPIC_ESR));
 		}
 		return 0;
 	}
-#if !CONFIG_CPU_AMD_MODEL_10XXX && !CONFIG_CPU_INTEL_MODEL_206AX
-	mdelay(10);
-#endif
-
-	printk(BIOS_SPEW, "Deasserting INIT.\n");
-
-	/* Target chip */
-	lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(apicid));
-
-	/* Send IPI */
-	lapic_write_around(LAPIC_ICR, LAPIC_INT_LEVELTRIG | LAPIC_DM_INIT);
-
-	printk(BIOS_SPEW, "Waiting for send to finish...\n");
-	timeout = 0;
-	do {
-		printk(BIOS_SPEW, "+");
-		udelay(100);
-		send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
-	} while (send_status && (timeout++ < 1000));
-	if (timeout >= 1000) {
-		printk(BIOS_ERR, "CPU %ld: Second apic write timed out. Disabling\n",
-			 apicid);
-		// too bad.
-		return 0;
-	}
-
 	start_eip = get_valid_start_eip((unsigned long)_secondary_start);
 
-#if !CONFIG_CPU_AMD_MODEL_10XXX
-	num_starts = 2;
-#else
-	num_starts = 1;
-#endif
-
-	/*
-	 * Run STARTUP IPI loop.
-	 */
-	printk(BIOS_SPEW, "#startup loops: %d.\n", num_starts);
-
 	maxlvt = 4;
 
-	for (j = 1; j <= num_starts; j++) {
-		printk(BIOS_SPEW, "Sending STARTUP #%d to %lu.\n", j, apicid);
+	printk(BIOS_SPEW, "Sending STARTUP.\n");
+	lapic_read_around(LAPIC_SPIV);
+	lapic_write(LAPIC_ESR, 0);
+	lapic_read(LAPIC_ESR);
+	printk(BIOS_SPEW, "After apic_write.\n");
+
+	/*
+	 * STARTUP IPI
+	 */
+
+	/* Target chip */
+	lapic_write_around(LAPIC_ICR2, 0);
+
+	/* Boot on the stack */
+	/* Kick the second */
+	lapic_write_around(LAPIC_ICR, LAPIC_INT_ASSERT | LAPIC_DM_STARTUP | LAPIC_DEST_ALLBUT
+			   | (start_eip >> 12));
+
+	/*
+	 * Give the other CPU some time to accept the IPI.
+	 */
+	udelay(300);
+
+	printk(BIOS_DEBUG, "Startup point 1.\n");
+
+	printk(BIOS_DEBUG, "Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		printk(BIOS_DEBUG, "+");
+		udelay(100);
+		send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	/*
+	 * Give the other CPU some time to accept the IPI.
+	 */
+	udelay(200);
+	/*
+	 * Due to the Pentium erratum 3AP.
+	 */
+	if (maxlvt > 3) {
 		lapic_read_around(LAPIC_SPIV);
 		lapic_write(LAPIC_ESR, 0);
-		lapic_read(LAPIC_ESR);
-		printk(BIOS_SPEW, "After apic_write.\n");
-
-		/*
-		 * STARTUP IPI
-		 */
-
-		/* Target chip */
-		lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(apicid));
-
-		/* Boot on the stack */
-		/* Kick the second */
-		lapic_write_around(LAPIC_ICR, LAPIC_DM_STARTUP
-					| (start_eip >> 12));
-
-		/*
-		 * Give the other CPU some time to accept the IPI.
-		 */
-		udelay(300);
-
-		printk(BIOS_SPEW, "Startup point 1.\n");
-
-		printk(BIOS_SPEW, "Waiting for send to finish...\n");
-		timeout = 0;
-		do {
-			printk(BIOS_SPEW, "+");
-			udelay(100);
-			send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
-		} while (send_status && (timeout++ < 1000));
-
-		/*
-		 * Give the other CPU some time to accept the IPI.
-		 */
-		udelay(200);
-		/*
-		 * Due to the Pentium erratum 3AP.
-		 */
-		if (maxlvt > 3) {
-			lapic_read_around(LAPIC_SPIV);
-			lapic_write(LAPIC_ESR, 0);
-		}
-		accept_status = (lapic_read(LAPIC_ESR) & 0xEF);
-		if (send_status || accept_status)
-			break;
 	}
-	printk(BIOS_SPEW, "After Startup.\n");
+	accept_status = (lapic_read(LAPIC_ESR) & 0xEF);
+
+	printk(BIOS_DEBUG, "After Startup.\n");
 	if (send_status)
 		printk(BIOS_WARNING, "APIC never delivered???\n");
 	if (accept_status)
@@ -209,156 +165,34 @@
 	return 1;
 }
 
+
 /* Number of cpus that are currently running in coreboot */
 static atomic_t active_cpus = ATOMIC_INIT(1);
 
-/* start_cpu_lock covers last_cpu_index and secondary_stack.
- * Only starting one cpu at a time let's me remove the logic
- * for select the stack from assembly language.
- *
- * In addition communicating by variables to the cpu I
- * am starting allows me to veryify it has started before
- * start_cpu returns.
- */
-
-static spinlock_t start_cpu_lock = SPIN_LOCK_UNLOCKED;
-static unsigned last_cpu_index = 0;
 volatile unsigned long secondary_stack;
+extern unsigned char _estack[];
 
-int start_cpu(device_t cpu)
+static void stop_all_ap_cpus(void)
 {
-	extern unsigned char _estack[];
-	struct cpu_info *info;
-	unsigned long stack_end;
-	unsigned long apicid;
-	unsigned long index;
-	unsigned long count;
-	int result;
-
-	spin_lock(&start_cpu_lock);
-
-	/* Get the cpu's apicid */
-	apicid = cpu->path.apic.apic_id;
-
-	/* Get an index for the new processor */
-	index = ++last_cpu_index;
-
-	/* Find end of the new processors stack */
-	stack_end = ((unsigned long)_estack) - (CONFIG_STACK_SIZE*index) - sizeof(struct cpu_info);
-
-	/* Record the index and which cpu structure we are using */
-	info = (struct cpu_info *)stack_end;
-	info->index = index;
-	info->cpu   = cpu;
-
-	/* Advertise the new stack to start_cpu */
-	secondary_stack = stack_end;
-
-	/* Until the cpu starts up report the cpu is not enabled */
-	cpu->enabled = 0;
-	cpu->initialized = 0;
-
-	/* Start the cpu */
-	result = lapic_start_cpu(apicid);
-
-	if (result) {
-		result = 0;
-		/* Wait 1s or until the new cpu calls in */
-		for(count = 0; count < 100000 ; count++) {
-			if (secondary_stack == 0) {
-				result = 1;
-				break;
-			}
-			udelay(10);
-	}
-	}
-	secondary_stack = 0;
-	spin_unlock(&start_cpu_lock);
-	return result;
-}
-
-#if CONFIG_AP_IN_SIPI_WAIT
-
-/**
- * Sending INIT IPI to self is equivalent of asserting #INIT with a bit of delay.
- * An undefined number of instruction cycles will complete. All global locks
- * must be released before INIT IPI and no printk is allowed after this.
- * De-asserting INIT IPI is a no-op on later Intel CPUs.
- *
- * If you set DEBUG_HALT_SELF to 1, printk's after INIT IPI are enabled
- * but running thread may halt without releasing the lock and effectively
- * deadlock other CPUs.
- */
-#define DEBUG_HALT_SELF 0
-
-/**
- * Normally this function is defined in lapic.h as an always inline function
- * that just keeps the CPU in a hlt() loop. This does not work on all CPUs.
- * I think all hyperthreading CPUs might need this version, but I could only
- * verify this on the Intel Core Duo
- */
-void stop_this_cpu(void)
-{
-	int timeout;
 	unsigned long send_status;
-	unsigned long id;
-
-	id = lapic_read(LAPIC_ID) >> 24;
-
-	printk(BIOS_DEBUG, "CPU %ld going down...\n", id);
-
-	/* send an LAPIC INIT to myself */
-	lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(id));
-	lapic_write_around(LAPIC_ICR, LAPIC_INT_LEVELTRIG | LAPIC_INT_ASSERT | LAPIC_DM_INIT);
+	int timeout;
+	/* send an LAPIC INIT to all but myself */
+	lapic_write_around(LAPIC_ICR2, 0);
+	lapic_write_around(LAPIC_ICR, LAPIC_INT_ASSERT | LAPIC_DM_INIT | LAPIC_DEST_ALLBUT);
 
 	/* wait for the ipi send to finish */
-#if DEBUG_HALT_SELF
 	printk(BIOS_SPEW, "Waiting for send to finish...\n");
-#endif
 	timeout = 0;
 	do {
-#if DEBUG_HALT_SELF
 		printk(BIOS_SPEW, "+");
-#endif
 		udelay(100);
 		send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
 	} while (send_status && (timeout++ < 1000));
 	if (timeout >= 1000) {
-#if DEBUG_HALT_SELF
 		printk(BIOS_ERR, "timed out\n");
-#endif
 	}
 	mdelay(10);
-
-#if DEBUG_HALT_SELF
-	printk(BIOS_SPEW, "Deasserting INIT.\n");
-#endif
-	/* Deassert the LAPIC INIT */
-	lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(id));
-	lapic_write_around(LAPIC_ICR, LAPIC_INT_LEVELTRIG | LAPIC_DM_INIT);
-
-#if DEBUG_HALT_SELF
-	printk(BIOS_SPEW, "Waiting for send to finish...\n");
-#endif
-	timeout = 0;
-	do {
-#if DEBUG_HALT_SELF
-		printk(BIOS_SPEW, "+");
-#endif
-		udelay(100);
-		send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
-	} while (send_status && (timeout++ < 1000));
-	if (timeout >= 1000) {
-#if DEBUG_HALT_SELF
-		printk(BIOS_ERR, "timed out\n");
-#endif
-	}
-
-	while(1) {
-		hlt();
-	}
 }
-#endif
 
 #ifdef __SSE3__
 static __inline__ __attribute__((always_inline)) unsigned long readcr4(void)
@@ -381,66 +215,21 @@
 #endif
 
 /* C entry point of secondary cpus */
-void secondary_cpu_init(void)
+void secondary_cpu_init(int index)
 {
-	atomic_inc(&active_cpus);
-#if CONFIG_SERIAL_CPU_INIT
-	spin_lock(&start_cpu_lock);
-#endif
-
 #ifdef __SSE3__
-	/*
-	 * Seems that CR4 was cleared when AP start via lapic_start_cpu()
-	 * Turn on CR4.OSFXSR and CR4.OSXMMEXCPT when SSE options enabled
-	 */
-	u32 cr4_val;
-	cr4_val = readcr4();
-	cr4_val |= (1 << 9 | 1 << 10);
-	writecr4(cr4_val);
+        /*
+         * Seems that CR4 was cleared when AP start via lapic_start_cpu()
+         * Turn on CR4.OSFXSR and CR4.OSXMMEXCPT when SSE options enabled
+         */
+        u32 cr4_val;
+        cr4_val = readcr4();
+        cr4_val |= (1 << 9 | 1 << 10);
+        writecr4(cr4_val);
 #endif
-	cpu_initialize();
-#if CONFIG_SERIAL_CPU_INIT
-	spin_unlock(&start_cpu_lock);
-#endif
-
+	atomic_inc(&active_cpus);
+	cpu_initialize(current_cpu_bus, index);
 	atomic_dec(&active_cpus);
-
-	stop_this_cpu();
-}
-
-static void start_other_cpus(struct bus *cpu_bus, device_t bsp_cpu)
-{
-	device_t cpu;
-	/* Loop through the cpus once getting them started */
-
-	for(cpu = cpu_bus->children; cpu ; cpu = cpu->sibling) {
-		if (cpu->path.type != DEVICE_PATH_APIC) {
-			continue;
-		}
-	#if !CONFIG_SERIAL_CPU_INIT
-		if(cpu==bsp_cpu) {
-			continue;
-		}
-	#endif
-
-		if (!cpu->enabled) {
-			continue;
-		}
-
-		if (cpu->initialized) {
-			continue;
-		}
-
-		if (!start_cpu(cpu)) {
-			/* Record the error in cpu? */
-			printk(BIOS_ERR, "CPU 0x%02x would not start!\n",
-				cpu->path.apic.apic_id);
-		}
-#if CONFIG_SERIAL_CPU_INIT
-		udelay(10);
-#endif
-	}
-
 }
 
 static void wait_other_cpus_stop(struct bus *cpu_bus)
@@ -473,6 +262,7 @@
 				cpu->path.apic.apic_id);
 		}
 	}
+	stop_all_ap_cpus();
 	printk(BIOS_DEBUG, "All AP CPUs stopped (%ld loops)\n", loopcount);
 }
 
@@ -481,10 +271,6 @@
 void initialize_cpus(struct bus *cpu_bus)
 {
 	struct device_path cpu_path;
-	struct cpu_info *info;
-
-	/* Find the info struct for this cpu */
-	info = cpu_info();
 
 #if NEED_LAPIC == 1
 	/* Ensure the local apic is enabled */
@@ -499,9 +285,6 @@
 	cpu_path.cpu.id       = 0;
 #endif
 
-	/* Find the device structure for the boot cpu */
-	info->cpu = alloc_find_dev(cpu_bus, &cpu_path);
-
 #if CONFIG_SMP
 	copy_secondary_start_to_1m_below(); // why here? In case some day we can start core1 in amd_sibling_init
 #endif
@@ -512,21 +295,11 @@
 
 	cpus_ready_for_init();
 
-#if CONFIG_SMP
-	#if !CONFIG_SERIAL_CPU_INIT
-	/* start all aps at first, so we can init ECC all together */
-	start_other_cpus(cpu_bus, info->cpu);
-	#endif
-#endif
-
 	/* Initialize the bootstrap processor */
-	cpu_initialize();
+	cpu_initialize(cpu_bus, 0);
 
 #if CONFIG_SMP
-	#if CONFIG_SERIAL_CPU_INIT
-	start_other_cpus(cpu_bus, info->cpu);
-	#endif
-
+	lapic_start_cpus(cpu_bus);
 	/* Now wait the rest of the cpus stop*/
 	wait_other_cpus_stop(cpu_bus);
 #endif
diff --git a/src/cpu/x86/lapic/secondary.S b/src/cpu/x86/lapic/secondary.S
index dc00b08..67e44c4 100644
--- a/src/cpu/x86/lapic/secondary.S
+++ b/src/cpu/x86/lapic/secondary.S
@@ -2,7 +2,7 @@
 #include <cpu/x86/lapic_def.h>
 
 	.text
-	.globl _secondary_start, _secondary_start_end
+	.globl _secondary_start, _secondary_start_end, cpucount
 	.balign 4096
 _secondary_start:
 	.code16
@@ -38,15 +38,25 @@
 	/* Load the Interrupt descriptor table */
 	lidt	idtarg
 
-	/* Set the stack pointer, and flag that we are done */
-	xorl	%eax, %eax
-	movl	secondary_stack, %esp
-	movl	%eax, secondary_stack
+	/* increment our cpu index */
+	movl	$1, %eax
+	lock	xadd %eax, cpucount
+	incl	%eax
+	movl	%eax, %ecx
 
+	/* assign stack for this specific cpu */
+	mov	_stack, %esp
+	mov	$CONFIG_STACK_SIZE, %ebx
+	mul	%ebx
+	add	%eax, %esp
+
+	pushl	%ecx
 	call	secondary_cpu_init
 1:	hlt
 	jmp	1b
 
+cpucount:
+	.long	1
 gdtaddr:
 	.word   gdt_limit	/* the table limit */
 	.long   gdt             /* we know the offset */
diff --git a/src/cpu/x86/pae/pgtbl.c b/src/cpu/x86/pae/pgtbl.c
index 814c5f1..7aa17c2 100644
--- a/src/cpu/x86/pae/pgtbl.c
+++ b/src/cpu/x86/pae/pgtbl.c
@@ -3,8 +3,10 @@
 */
 
 #include <console/console.h>
+#include <device/device.h>
 #include <cpu/cpu.h>
 #include <cpu/x86/pae.h>
+#include <cpu/x86/lapic.h>
 #include <string.h>
 
 static void paging_off(void)
@@ -43,6 +45,14 @@
 		);
 }
 
+static int cpu_index(void)
+{
+	device_t dev = dev_find_lapic(lapicid());
+	if (!dev)
+		return -1;
+	return dev->path.apic.index;
+}
+
 void *map_2M_page(unsigned long page)
 {
 	struct pde {
@@ -60,7 +70,9 @@
 	unsigned long window;
 	void *result;
 	int i;
+
 	index = cpu_index();
+
 	if ((index < 0) || (index >= CONFIG_MAX_CPUS)) {
 		return MAPPING_ERROR;
 	}
diff --git a/src/drivers/i2c/w83795/w83795.c b/src/drivers/i2c/w83795/w83795.c
index 392471a9..12be4da 100644
--- a/src/drivers/i2c/w83795/w83795.c
+++ b/src/drivers/i2c/w83795/w83795.c
@@ -22,6 +22,7 @@
 #include <console/console.h>
 #include <device/device.h>
 #include "southbridge/amd/cimx/sb700/smbus.h" /*SMBUS_IO_BASE*/
+#include <cpu/x86/lapic.h>
 #include "w83795.h"
 
 static u32 w83795_set_bank(u8 bank)
@@ -224,10 +225,8 @@
 static void w83795_hwm_init(device_t dev)
 {
 	struct device *cpu;
-	struct cpu_info *info;
 
-	info = cpu_info();
-	cpu = info->cpu;
+	cpu = dev_find_lapic(lapicid());
 	if (!cpu)
 		die("CPU: missing cpu device structure");
 
diff --git a/src/include/cpu/cpu.h b/src/include/cpu/cpu.h
index c2113c1..9765dfd 100644
--- a/src/include/cpu/cpu.h
+++ b/src/include/cpu/cpu.h
@@ -4,10 +4,12 @@
 #include <arch/cpu.h>
 
 #if !defined(__ROMCC__)
-void cpu_initialize(void);
+void cpu_initialize(struct bus *cpu_bus, int index);
 struct bus;
 void initialize_cpus(struct bus *cpu_bus);
-void secondary_cpu_init(void);
+void secondary_cpu_init(int index);
+
+extern unsigned int cpucount;
 
 #if !CONFIG_WAIT_BEFORE_CPUS_INIT
 	#define cpus_ready_for_init() do {} while(0)
diff --git a/src/include/cpu/x86/lapic.h b/src/include/cpu/x86/lapic.h
index 078f2a7..5c48025 100644
--- a/src/include/cpu/x86/lapic.h
+++ b/src/include/cpu/x86/lapic.h
@@ -52,20 +52,13 @@
 }
 
 #ifndef __ROMCC__
-#if CONFIG_AP_IN_SIPI_WAIT != 1
-/* If we need to go back to sipi wait, we use the long non-inlined version of
- * this function in lapic_cpu_init.c
- */
 static inline __attribute__((always_inline)) void stop_this_cpu(void)
 {
-	/* Called by an AP when it is ready to halt and wait for a new task */
-	for(;;) {
-		hlt();
-	}
+       /* Called by an AP when it is ready to halt and wait for a new task */
+       for(;;) {
+               hlt();
+       }
 }
-#else
-void stop_this_cpu(void);
-#endif
 
 #if !defined(__PRE_RAM__)
 
diff --git a/src/include/device/path.h b/src/include/device/path.h
index 63e9538..018fb93 100644
--- a/src/include/device/path.h
+++ b/src/include/device/path.h
@@ -40,6 +40,7 @@
 	unsigned apic_id;
 	unsigned node_id;
 	unsigned core_id;
+	unsigned index;
 };
 
 struct apic_cluster_path