Introduce optimized iomemcpy function for copying from IO memory.

Reading from IO memory (PCI ROM or flash) is very slow, so add
   iomemcpy(), which yields periodically during the copy.  Use a
   4 byte copy to optimize accesses.
Also, decompress cbfs data files from a temp memory buffer so that
   ulzma doesn't read from slow IO memory.
diff --git a/src/coreboot.c b/src/coreboot.c
index 7e0e361..7fa18e4 100644
--- a/src/coreboot.c
+++ b/src/coreboot.c
@@ -513,9 +513,18 @@
 
     u32 size = ntohl(file->len);
     void *src = (void*)file + ntohl(file->offset);
-    if (cbfs_iscomp(file))
-        // Compressed.
-        return ulzma(dst, maxlen, src, size);
+    if (cbfs_iscomp(file)) {
+        // Compressed - copy to temp ram and uncompress it.
+        u32 asize = ALIGN(size, 4);
+        void *temp = malloc_tmphigh(asize);
+        if (!temp)
+            return -1;
+        iomemcpy(temp, src, asize);
+        int ret = ulzma(dst, maxlen, temp, size);
+        yield();
+        free(temp);
+        return ret;
+    }
 
     // Not compressed.
     dprintf(3, "Copying data %d@%p to %d@%p\n", size, src, maxlen, dst);
@@ -523,7 +532,7 @@
         dprintf(1, "File too big to copy\n");
         return -1;
     }
-    memcpy(dst, src, size);
+    iomemcpy(dst, src, size);
     return size;
 }
 
diff --git a/src/optionroms.c b/src/optionroms.c
index bdc0cb5..4dd53cb 100644
--- a/src/optionroms.c
+++ b/src/optionroms.c
@@ -187,7 +187,7 @@
     }
     dprintf(4, "Copying option rom (size %d) from %p to %x\n"
             , romsize, rom, RomEnd);
-    memcpy((void*)RomEnd, rom, romsize);
+    iomemcpy((void*)RomEnd, rom, romsize);
     return (void*)RomEnd;
 }
 
diff --git a/src/util.c b/src/util.c
index 36f32e4..ec2054c 100644
--- a/src/util.c
+++ b/src/util.c
@@ -376,6 +376,26 @@
     return d1;
 }
 
+// Copy 'len' bytes from memory mapped IO.  IO mem is very slow, so
+// yield before the copy and after each chunk of up to 1024 bytes.
+// Data moves 4 bytes at a time; a 1-3 byte tail goes via memcpy().
+void
+iomemcpy(void *d, const void *s, u32 len)
+{
+    yield();
+    while (len > 3) {
+        u32 copylen = (len > 1024 ? 1024 : len) / 4;
+        len -= copylen * 4;
+        asm volatile(
+            "rep movsl (%%esi),%%es:(%%edi)"
+            : "+c"(copylen), "+S"(s), "+D"(d)
+            : : "cc", "memory");
+        yield();
+    }
+    if (len)
+        memcpy(d, s, len);
+}
+
 void *
 memmove(void *d, const void *s, size_t len)
 {
diff --git a/src/util.h b/src/util.h
index c3ecb41..3ab791e 100644
--- a/src/util.h
+++ b/src/util.h
@@ -164,6 +164,7 @@
 #endif
 inline void memcpy_far(u16 d_seg, void *d_far
                        , u16 s_seg, const void *s_far, size_t len);
+void iomemcpy(void *d, const void *s, u32 len);
 void *memmove(void *d, const void *s, size_t len);
 char *strtcpy(char *dest, const char *src, size_t len);
 struct bregs;