Handle unaligned sizes in iomemcpy().

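Previously iomemcpy() copied only whole 4-byte words, so when 'len' was
not a multiple of 4 the trailing 1-3 bytes were silently dropped.  This
fixes the resulting truncation of uncompressed files in cbfs_copyfile().
The compressed path no longer needs to round its copy size up to a
multiple of 4.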
diff --git a/src/coreboot.c b/src/coreboot.c
index 5d9a101..db0063b 100644
--- a/src/coreboot.c
+++ b/src/coreboot.c
@@ -510,11 +510,10 @@
     void *src = (void*)file + ntohl(file->offset);
     if (cbfs_iscomp(file)) {
         // Compressed - copy to temp ram and uncompress it.
-        u32 asize = ALIGN(size, 4);
-        void *temp = malloc_tmphigh(asize);
+        void *temp = malloc_tmphigh(size);
         if (!temp)
             return -1;
-        iomemcpy(temp, src, asize);
+        iomemcpy(temp, src, size);
         int ret = ulzma(dst, maxlen, temp, size);
         yield();
         free(temp);
diff --git a/src/util.c b/src/util.c
index 2c22dfc..8e02d1e 100644
--- a/src/util.c
+++ b/src/util.c
@@ -202,24 +202,27 @@
     return d1;
 }
 
-// Copy from memory mapped IO.  IO mem is very slow, so yield
-// periodically.  'len' must be 4 byte aligned.
+// Copy 'len' bytes to/from memory mapped IO.  IO mem is very slow, so
+// yield periodically.  'len' does not need to be a multiple of 4.
 void
 iomemcpy(void *d, const void *s, u32 len)
 {
     yield();
-    while (len) {
+    while (len > 3) {
         u32 copylen = len;
         if (copylen > 2048)
             copylen = 2048;
-        len -= copylen;
         copylen /= 4;
+        len -= copylen * 4;
         asm volatile(
             "rep movsl (%%esi),%%es:(%%edi)"
             : "+c"(copylen), "+S"(s), "+D"(d)
             : : "cc", "memory");
         yield();
     }
+    if (len)
+        // Copy the final 1-3 bytes that do not fill a 32-bit word.
+        memcpy(d, s, len);
 }
 
 void *