libpayload: Reorder default memcpy, speed up memset and memcmp

The current default memcpy first copies single bytes to align the
amount, then copies the rest as full words. In practice, the start of a
buffer is much more likely to be word-aligned than the end, and aligned
word accesses are usually more efficient. This patch reorders those
accesses to first copy as many full words as possible and then finish
the rest with byte accesses to optimize this common case.

This fixes a data abort when using USB on ARM without CONFIG_GPL. Due to
some limitations of how DMA memory is set up in coreboot on ARM, it
currently does not support unaligned accesses. (This could be fixed with
a more complicated patch, but it's usually not an issue... unless, of
course, your memcpy happens to be braindead).

Also add word-aligned accesses to memset and memcmp while I'm at it, and
make memcmp's return value standards-compliant.

BUG=chrome-os-partner:24957
TEST=Manual

Original-Change-Id: I2a7bcb35626a05a9a43fcfd99eb958b485d7622a
Original-Signed-off-by: Julius Werner <jwerner@chromium.org>
Original-Reviewed-on: https://chromium-review.googlesource.com/203547
Original-Reviewed-by: Stefan Reinauer <reinauer@chromium.org>
Original-Reviewed-by: David Hendricks <dhendrix@chromium.org>
(cherry picked from commit 05a64d2e107e1675cc3442e6dabe14a341e55673)
Signed-off-by: Marc Jones <marc.jones@se-eng.com>

Change-Id: I0030ca8a203c97587b0da31a0a5e9e11b0be050f
Reviewed-on: http://review.coreboot.org/8126
Tested-by: build bot (Jenkins)
Reviewed-by: Stefan Reinauer <stefan.reinauer@coreboot.org>
diff --git a/payloads/libpayload/libc/memory.c b/payloads/libpayload/libc/memory.c
index 12d7e33..aec60e4 100644
--- a/payloads/libpayload/libc/memory.c
+++ b/payloads/libpayload/libc/memory.c
@@ -35,12 +35,22 @@
 
 static void *default_memset(void *s, int c, size_t n)
 {
-	char *os = s;
+	size_t i;
+	void *ret = s;
+	unsigned long w = c & 0xff;
 
-	while (n--)
-		*(os++) = c;
+	for (i = 1; i < sizeof(unsigned long); i <<= 1)
+		w = (w << (i * 8)) | w;
 
-	return s;
+	for (i = 0; i < n / sizeof(unsigned long); i++)
+		((unsigned long *)s)[i] = w;
+
+	s += i * sizeof(unsigned long);
+
+	for (i = 0; i < n % sizeof(unsigned long); i++)
+		((u8 *)s)[i] = (u8)c;
+
+	return ret;
 }
 
 void *memset(void *s, int c, size_t n)
@@ -48,18 +58,17 @@
 
 static void *default_memcpy(void *dst, const void *src, size_t n)
 {
-	int i;
+	size_t i;
 	void *ret = dst;
 
-	for(i = 0; i < n % sizeof(unsigned long); i++)
-		((unsigned char *) dst)[i] = ((unsigned char *) src)[i];
-
-	n -= i;
-	src += i;
-	dst += i;
-
 	for(i = 0; i < n / sizeof(unsigned long); i++)
-		((unsigned long *) dst)[i] = ((unsigned long *) src)[i];
+		((unsigned long *)dst)[i] = ((unsigned long *)src)[i];
+
+	src += i * sizeof(unsigned long);
+	dst += i * sizeof(unsigned long);
+
+	for(i = 0; i < n % sizeof(unsigned long); i++)
+		((u8 *)dst)[i] = ((u8 *)src)[i];
 
 	return ret;
 }
@@ -69,8 +78,7 @@
 
 static void *default_memmove(void *dst, const void *src, size_t n)
 {
-	int i;
-	unsigned long offs;
+	size_t i, offs;
 
 	if (src > dst)
 		return memcpy(dst, src, n);
@@ -78,8 +86,7 @@
 	offs = n - (n % sizeof(unsigned long));
 
 	for (i = (n % sizeof(unsigned long)) - 1; i >= 0; i--)
-		((unsigned char *)dst)[i + offs] =
-			((unsigned char *)src)[i + offs];
+		((u8 *)dst)[i + offs] = ((u8 *)src)[i + offs];
 
 	for (i = n / sizeof(unsigned long) - 1; i >= 0; i--)
 		((unsigned long *)dst)[i] = ((unsigned long *)src)[i];
@@ -95,17 +102,27 @@
  *
  * @param s1 Pointer to the first area to compare.
  * @param s2 Pointer to the second area to compare.
- * @param len Size of the first area in bytes (both must have the same length).
- * @return If len is 0, return zero. If the areas match, return zero.
- *         Otherwise return non-zero.
+ * @param n Size of the first area in bytes (both must have the same length).
+ * @return If n is 0, return zero. Otherwise, return a value less than, equal
+ * 	   to, or greater than zero if s1 is found less than, equal to, or
+ * 	   greater than s2 respectively.
  */
 
-static int default_memcmp(const void *s1, const void *s2, size_t len)
+static int default_memcmp(const void *s1, const void *s2, size_t n)
 {
-	for (; len && *(char *)s1++ == *(char *)s2++; len--) ;
-	return len;
+	size_t i;
+
+	for (i = 0; i < n / sizeof(unsigned long); i++)
+		if (((unsigned long *)s1)[i] != ((unsigned long *)s2)[i])
+			break;	/* fall through to find differing byte */
+
+	for (i *= sizeof(unsigned long); i < n; i++)
+		if (((u8 *)s1)[i] != ((u8 *)s2)[i])
+			return ((u8 *)s1)[i] - ((u8 *)s2)[i];
+
+	return 0;
 }
 
-int memcmp(const void *s1, const void *s2, size_t len)
+int memcmp(const void *s1, const void *s2, size_t n)
 	__attribute__((weak, alias("default_memcmp")));