Synchronize rdtsc instructions

The CPU can arbitrarily reorder calls to rdtsc, significantly
reducing the precision of timing using the CPU's time stamp counter.
Unfortunately the method of synchronizing rdtsc is different
on AMD and Intel CPUs. There is a generic method, using the cpuid
instruction, but that uses up a lot of registers, and is very slow.
Hence, use the correct lfence/mfence instructions (for CPUs that
we know support them).

Change-Id: I17ecb48d283f38f23148c13159aceda704c64ea5
Signed-off-by: Stefan Reinauer <reinauer@google.com>
Reviewed-on: http://review.coreboot.org/1422
Reviewed-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
Tested-by: build bot (Jenkins)
diff --git a/src/include/cpu/x86/tsc.h b/src/include/cpu/x86/tsc.h
index c573627..6ce7f5f 100644
--- a/src/include/cpu/x86/tsc.h
+++ b/src/include/cpu/x86/tsc.h
@@ -1,6 +1,14 @@
 #ifndef CPU_X86_TSC_H
 #define CPU_X86_TSC_H
 
+#if CONFIG_TSC_SYNC_MFENCE
+#define TSC_SYNC "mfence\n"
+#elif CONFIG_TSC_SYNC_LFENCE
+#define TSC_SYNC "lfence\n"
+#else
+#define TSC_SYNC
+#endif
+
 struct tsc_struct {
 	unsigned lo;
 	unsigned hi;
@@ -10,10 +18,11 @@
 static inline tsc_t rdtsc(void)
 {
 	tsc_t res;
-	__asm__ __volatile__ (
+	asm volatile (
+		TSC_SYNC
 		"rdtsc"
 		: "=a" (res.lo), "=d"(res.hi) /* outputs */
-		);
+	);
 	return res;
 }
 
@@ -22,7 +31,11 @@
 static inline unsigned long long rdtscll(void)
 {
 	unsigned long long val;
-	asm volatile ("rdtsc" : "=A" (val));
+	asm volatile (
+		TSC_SYNC
+		"rdtsc"
+		: "=A" (val)
+	);
 	return val;
 }
 #endif