x86 memcpy: Copy 4 bytes at once

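This is a slight improvement over the plain rep movsb loop: the bulk of
the buffer (n >> 2 dwords) is copied four bytes at a time with rep movsl,
and the remaining n & 3 bytes are then copied with rep movsb.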

Change-Id: Id71d9bfe5330b154a5c62fac85ce3955ae89b057
Signed-off-by: Stefan Reinauer <reinauer@google.com>
Reviewed-on: http://review.coreboot.org/1742
Tested-by: build bot (Jenkins)
Reviewed-by: Stefan Reinauer <stefan.reinauer@coreboot.org>
diff --git a/src/arch/x86/lib/memcpy.c b/src/arch/x86/lib/memcpy.c
index f8607cf..7f079ce 100644
--- a/src/arch/x86/lib/memcpy.c
+++ b/src/arch/x86/lib/memcpy.c
@@ -5,11 +5,13 @@
 	unsigned long d0, d1, d2;
 
 	asm volatile(
-		"rep movsb"
-		: "=S"(d0), "=D"(d1), "=c"(d2)
-		: "0"(src), "1"(dest), "2"(n)
+		"rep ; movsl\n\t"
+		"movl %4,%%ecx\n\t"
+		"rep ; movsb\n\t"
+		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
+		: "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
 		: "memory"
-		);
+	);
 
 	return dest;
 }