Make sure memcpy() also works in 16-bit mode.

Make sure %es is set (to %ss) before executing a "rep movs" instruction, which stores through %es:%di.
diff --git a/src/util.c b/src/util.c
index 1faca26..283ec08 100644
--- a/src/util.c
+++ b/src/util.c
@@ -182,8 +182,11 @@
 void *
 #undef memcpy
 memcpy(void *d1, const void *s1, size_t len)
+#if MODE16 == 0
 #define memcpy __builtin_memcpy
+#endif
 {
+    SET_SEG(ES, GET_SEG(SS));
     void *d = d1;
     if (((u32)d1 | (u32)s1 | len) & 3) {
         // non-aligned memcpy
diff --git a/src/util.h b/src/util.h
index 9790798..e2db7f0 100644
--- a/src/util.h
+++ b/src/util.h
@@ -89,7 +89,10 @@
 inline void memset_far(u16 d_seg, void *d_far, u8 c, size_t len);
 inline void memset16_far(u16 d_seg, void *d_far, u16 c, size_t len);
 void *memset(void *s, int c, size_t n);
+void *memcpy(void *d1, const void *s1, size_t len);
+#if MODE16 == 0
 #define memcpy __builtin_memcpy
+#endif
 inline void memcpy_far(u16 d_seg, void *d_far
                        , u16 s_seg, const void *s_far, size_t len);
 void *memmove(void *d, const void *s, size_t len);