Fixup previous memcpy optimization.

Different gcc versions handle __builtin_memcpy differently.
Add -minline-all-stringops to force inlining of memcpy on old gcc.
Always use __builtin_memcpy for all memcpy calls.
Use memcpy4() for the option ROM case where 4-byte accesses are important.
diff --git a/Makefile b/Makefile
index 1cf7ba2..19e63d4 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,8 @@
 COMMONCFLAGS = -Wall -Os -MD -m32 -march=i386 -mregparm=3 \
                -mpreferred-stack-boundary=2 -mrtd -freg-struct-return \
                -ffreestanding -fwhole-program -fomit-frame-pointer \
-               -fno-delete-null-pointer-checks -Wno-strict-aliasing
+               -fno-delete-null-pointer-checks -Wno-strict-aliasing \
+               -minline-all-stringops
 COMMONCFLAGS += $(call cc-option,$(CC),-nopie,)
 COMMONCFLAGS += $(call cc-option,$(CC),-fno-stack-protector,)
 COMMONCFLAGS += $(call cc-option,$(CC),-fno-stack-protector-all,)