Enhance experimental option rom "threading" - enable preemption.

When experimental support for parallelizing option roms and hardware
   init (default disabled) is selected, add support for checking on
   hardware init progress from the RTC irq handler.
Allow the RTC periodic interrupt to be turned on by additional users.
Allow regular option roms (not just vga option roms) to run in
   parallel with hardware init.
Don't use stack in transition32 / transition16 until new mode is
   entered.
Also, clean up leaking of data handlers in usb code.
Also, decrease frequency of iomemcpy checks (every 2K instead of 1K).
diff --git a/Makefile b/Makefile
index b259bb8..5002b3e 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@
 OUT=out/
 
 # Source files
-SRCBOTH=misc.c pmm.c output.c util.c stacks.c block.c floppy.c ata.c mouse.c \
+SRCBOTH=misc.c pmm.c stacks.c output.c util.c block.c floppy.c ata.c mouse.c \
         kbd.c pci.c serial.c clock.c pic.c cdrom.c ps2port.c smp.c resume.c \
         pnpbios.c pirtable.c vgahooks.c ramdisk.c \
         usb.c usb-uhci.c usb-ohci.c usb-hid.c paravirt.c
diff --git a/src/biosvar.h b/src/biosvar.h
index 7a8e33b..aeebf0f 100644
--- a/src/biosvar.h
+++ b/src/biosvar.h
@@ -206,6 +206,7 @@
 
     // 0x121 - Begin custom storage.
     u8 ps2ctr;
+    int RTCusers;
 
     // El Torito Emulation data
     struct cdemu_s cdemu;
diff --git a/src/clock.c b/src/clock.c
index 7077631..6706e5c 100644
--- a/src/clock.c
+++ b/src/clock.c
@@ -466,6 +466,32 @@
  * Periodic timer
  ****************************************************************/
 
+void
+useRTC()  // Register one more user of the RTC periodic irq.
+{
+    u16 ebda_seg = get_ebda_seg();
+    int count = GET_EBDA2(ebda_seg, RTCusers);  // user count before this call
+    SET_EBDA2(ebda_seg, RTCusers, count+1);
+    if (count)  // not the first user - skip the hardware enable below
+        return;
+    // First user: turn on the Periodic Interrupt timer (PIE bit in status B)
+    u8 bRegister = inb_cmos(CMOS_STATUS_B);
+    outb_cmos(bRegister | RTC_B_PIE, CMOS_STATUS_B);
+}
+
+void
+releaseRTC()  // Drop one user of the RTC periodic irq (counterpart of useRTC).
+{
+    u16 ebda_seg = get_ebda_seg();
+    int count = GET_EBDA2(ebda_seg, RTCusers);  // user count before this call
+    SET_EBDA2(ebda_seg, RTCusers, count-1);  // NOTE(review): no guard for count <= 0
+    if (count != 1)  // not the last user - leave status B untouched
+        return;
+    // Last user: clear the Periodic Interrupt enable (PIE bit in status B).
+    u8 bRegister = inb_cmos(CMOS_STATUS_B);
+    outb_cmos(bRegister & ~RTC_B_PIE, CMOS_STATUS_B);
+}
+
 static int
 set_usertimer(u32 usecs, u16 seg, u16 offset)
 {
@@ -476,22 +502,18 @@
     SET_BDA(rtc_wait_flag, RWS_WAIT_PENDING);  // Set status byte.
     SET_BDA(user_wait_complete_flag, SEGOFF(seg, offset));
     SET_BDA(user_wait_timeout, usecs);
-
-    // Turn on the Periodic Interrupt timer
-    u8 bRegister = inb_cmos(CMOS_STATUS_B);
-    outb_cmos(bRegister | RTC_B_PIE, CMOS_STATUS_B);
-
+    useRTC();
     return 0;
 }
 
 static void
 clear_usertimer()
 {
+    if (!(GET_BDA(rtc_wait_flag) & RWS_WAIT_PENDING))
+        return;
     // Turn off status byte.
     SET_BDA(rtc_wait_flag, 0);
-    // Clear the Periodic Interrupt.
-    u8 bRegister = inb_cmos(CMOS_STATUS_B);
-    outb_cmos(bRegister & ~RTC_B_PIE, CMOS_STATUS_B);
+    releaseRTC();
 }
 
 #define RET_ECLOCKINUSE  0x83
@@ -574,6 +596,8 @@
 
     // Handle Periodic Interrupt.
 
+    check_preempt();
+
     if (!GET_BDA(rtc_wait_flag))
         goto done;
 
diff --git a/src/misc.c b/src/misc.c
index b12e860..7d6e954 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -128,11 +128,6 @@
  * GDT and IDT tables
  ****************************************************************/
 
-struct descloc_s {
-    u16 length;
-    u32 addr;
-} PACKED;
-
 // Real mode IDT descriptor
 struct descloc_s rmode_IDT_info VAR16VISIBLE = {
     .length = sizeof(struct rmode_IVT) - 1,
diff --git a/src/optionroms.c b/src/optionroms.c
index 4dd53cb..27465ad 100644
--- a/src/optionroms.c
+++ b/src/optionroms.c
@@ -94,7 +94,9 @@
     br.es = SEG_BIOS;
     br.di = get_pnp_offset();
     br.code = SEGOFF(seg, offset);
+    start_preempt();
     call16big(&br);
+    finish_preempt();
 
     debug_serial_setup();
 }
diff --git a/src/post.c b/src/post.c
index 9417480..42a27bc 100644
--- a/src/post.c
+++ b/src/post.c
@@ -208,11 +208,16 @@
     ramdisk_setup();
 
     // Run option roms
-    if (CONFIG_THREADS && CONFIG_THREAD_OPTIONROMS)
-        // Run vga option rom (if running asynchronously)
+    if (CONFIG_THREADS && CONFIG_THREAD_OPTIONROMS) {
+        // Run option roms while hw init still in progress.
         vga_setup();
-    wait_threads();
-    optionrom_setup();
+        optionrom_setup();
+        wait_threads();
+    } else {
+        // Wait for hw init to finish and run non-vga option roms.
+        wait_threads();
+        optionrom_setup();
+    }
 
     // Run BCVs and show optional boot menu
     boot_prep();
diff --git a/src/romlayout.S b/src/romlayout.S
index 556c4b7..b651a2b 100644
--- a/src/romlayout.S
+++ b/src/romlayout.S
@@ -26,10 +26,10 @@
  ****************************************************************/
 
 // Place CPU into 32bit mode from 16bit mode.
-// Clobbers: flags, segment registers, cr0, idt/gdt
+// Clobbers: ecx, flags, segment registers, cr0, idt/gdt
         DECLFUNC transition32
 transition32:
-        pushl %eax
+        movl %eax, %ecx
 
         // Disable irqs (and clear direction flag)
         cli
@@ -67,15 +67,15 @@
         movw %ax, %fs
         movw %ax, %gs
 
-        popl %eax
+        movl %ecx, %eax
         retl
 
 // Place CPU into 16bit mode from 32bit mode.
-// Clobbers: flags, segment registers, cr0, idt/gdt
+// Clobbers: ecx, flags, segment registers, cr0, idt/gdt
         DECLFUNC transition16
         .global transition16big
 transition16:
-        pushl %eax
+        movl %eax, %ecx
 
         // restore data segment limits to 0xffff
         movl $SEG32_MODE16_DS, %eax
@@ -96,7 +96,7 @@
         ljmpw $SEG32_MODE16_CS, $1f
 
 transition16big:
-        pushl %eax
+        movl %eax, %ecx
 
         movl $SEG32_MODE16BIG_DS, %eax
         movw %ax, %ds
@@ -129,7 +129,7 @@
         movw %ax, %ds
         movw %ax, %ss  // Assume stack is in segment 0
 
-        popl %eax
+        movl %ecx, %eax
         retl
 
 // Call a 16bit function from 16bit mode with a specified cpu register state
diff --git a/src/stacks.c b/src/stacks.c
index d71381f..bd1c75f 100644
--- a/src/stacks.c
+++ b/src/stacks.c
@@ -6,6 +6,7 @@
 
 #include "biosvar.h" // get_ebda_seg
 #include "util.h" // dprintf
+#include "bregs.h" // CR0_PE
 
 
 /****************************************************************
@@ -51,15 +52,18 @@
     void *stackpos;
 };
 
-struct thread_info MainThread;
+struct thread_info VAR16VISIBLE MainThread;
+int VAR16VISIBLE CanPreempt;
 
 void
 thread_setup()
 {
     MainThread.next = &MainThread;
     MainThread.stackpos = NULL;
+    CanPreempt = 0;
 }
 
+// Return the 'struct thread_info' for the currently running thread.
 struct thread_info *
 getCurThread()
 {
@@ -69,6 +73,24 @@
     return (void*)ALIGN_DOWN(esp, THREADSTACKSIZE);
 }
 
+// Save the current stack in 'cur' and switch to the stack of cur->next.
+static void
+switch_next(struct thread_info *cur)
+{
+    struct thread_info *next = cur->next;
+    asm volatile(
+        "  pushl $1f\n"                 // store return pc
+        "  pushl %%ebp\n"               // backup %ebp
+        "  movl %%esp, 4(%%eax)\n"      // cur->stackpos = %esp
+        "  movl 4(%%ecx), %%esp\n"      // %esp = next->stackpos
+        "  popl %%ebp\n"                // restore %ebp
+        "  retl\n"                      // restore pc
+        "1:\n"                          // a later switch to the saved stack returns here
+        : "+a"(cur), "+c"(next)         // cur in %eax, next in %ecx (as used above)
+        :
+        : "ebx", "edx", "esi", "edi", "cc", "memory");  // not saved on the stack above
+}
+
 // Briefly permit irqs to occur.
 void
 yield()
@@ -84,18 +106,7 @@
         check_irqs();
 
     // Switch to the next thread
-    struct thread_info *next = cur->next;
-    asm volatile(
-        "  pushl $1f\n"                 // store return pc
-        "  pushl %%ebp\n"               // backup %ebp
-        "  movl %%esp, 4(%%eax)\n"      // cur->stackpos = %esp
-        "  movl 4(%%ecx), %%esp\n"      // %esp = next->stackpos
-        "  popl %%ebp\n"                // restore %ebp
-        "  retl\n"                      // restore pc
-        "1:\n"
-        : "+a"(cur), "+c"(next)
-        :
-        : "ebx", "edx", "esi", "edi", "cc", "memory");
+    switch_next(cur);
 }
 
 // Last thing called from a thread (called on "next" stack).
@@ -110,6 +121,7 @@
     dprintf(DEBUG_thread, "\\%08x/ End thread\n", (u32)old);
 }
 
+// Create a new thread and start executing 'func' in it.
 void
 run_thread(void (*func)(void*), void *data)
 {
@@ -152,6 +164,7 @@
     func(data);
 }
 
+// Wait for all threads (other than the main thread) to complete.
 void
 wait_threads()
 {
@@ -161,3 +174,114 @@
     while (MainThread.next != &MainThread)
         yield();
 }
+
+
+/****************************************************************
+ * Thread preemption
+ ****************************************************************/
+
+static u32 PreemptCount;
+
+// Turn on RTC irqs and arrange for them to check the 32bit threads.
+void
+start_preempt()
+{
+    if (! CONFIG_THREADS || ! CONFIG_THREAD_OPTIONROMS)  // feature not selected
+        return;
+    CanPreempt = 1;    // flag tested by check_preempt()
+    PreemptCount = 0;  // restart the counter that finish_preempt() prints
+    useRTC();          // take a reference on the RTC periodic irq
+}
+
+// Turn off RTC irqs / stop checking for thread execution.
+void
+finish_preempt()
+{
+    if (! CONFIG_THREADS || ! CONFIG_THREAD_OPTIONROMS)  // feature not selected
+        return;
+    CanPreempt = 0;  // check_preempt() returns early from now on
+    releaseRTC();    // drop the reference taken in start_preempt()
+    dprintf(1, "Done preempt - %d checks\n", PreemptCount);  // count bumped by yield_preempt()
+}
+
+static inline u32 getcr0() {  // Return the current value of the %cr0 register.
+    u32 cr0;
+    asm("movl %%cr0, %0" : "=r"(cr0));
+    return cr0;
+}
+static inline void sgdt(struct descloc_s *desc) {  // Store the gdt register into *desc.
+    asm("sgdtl %0" : "=m"(*desc));
+}
+static inline void lgdt(struct descloc_s *desc) {  // Load the gdt register from *desc.
+    asm("lgdtl %0" : : "m"(*desc) : "memory");
+}
+
+#if !MODE16  // compiled only when MODE16 is not set
+// Try to execute 32bit threads (called in 32bit mode from check_preempt).
+void VISIBLE32
+yield_preempt()
+{
+    PreemptCount++;            // one more check; total is printed by finish_preempt()
+    switch_next(&MainThread);  // save this stack in MainThread, run MainThread.next
+}
+#endif
+
+// 16bit code (called while handling the RTC periodic irq) that checks if threads are pending and executes them if so.
+void
+check_preempt()
+{
+    ASSERT16();
+    if (! CONFIG_THREADS || ! CONFIG_THREAD_OPTIONROMS
+        || !GET_GLOBAL(CanPreempt)                      // preemption not currently enabled
+        || GET_GLOBAL(MainThread.next) == &MainThread)  // no thread other than the main one
+        return;
+    u32 cr0 = getcr0();
+    if (cr0 & CR0_PE)
+        // Called in 16bit protected mode?!
+        return;
+
+    // Backup cmos index register and disable nmi
+    u8 cmosindex = inb(PORT_CMOS_INDEX);
+    outb(cmosindex | NMI_DISABLE_BIT, PORT_CMOS_INDEX);
+    inb(PORT_CMOS_DATA);
+
+    // Backup fs/gs and gdt
+    u16 fs = GET_SEG(FS), gs = GET_SEG(GS);
+    struct descloc_s gdt;
+    sgdt(&gdt);
+
+    u32 bkup_ss, bkup_esp;
+    asm volatile(
+        // Backup ss/esp / set esp to flat stack location
+        "  movl %%ss, %0\n"             // %0 = ss
+        "  movl %%esp, %1\n"            // %1 = original esp
+        "  shll $4, %0\n"               // %0 = ss * 16
+        "  addl %0, %%esp\n"            // esp += ss * 16 (NOTE(review): assumes upper esp bits are 0 - confirm)
+        "  movl %%ss, %0\n"             // %0 = ss again, kept for the restore below
+
+        // Transition to 32bit mode, call yield_preempt, return to 16bit
+        "  pushl $(" __stringify(BUILD_BIOS_ADDR) " + 1f)\n"  // address transition32's retl returns to
+        "  jmp transition32\n"
+        "  .code32\n"
+        "1:calll (yield_preempt - " __stringify(BUILD_BIOS_ADDR) ")\n"
+        "  pushl $2f\n"                 // address transition16big's retl returns to
+        "  jmp transition16big\n"
+        "  .code16gcc\n"
+        "2:\n"
+
+        // Restore ds/ss/esp
+        "  movl %0, %%ds\n"
+        "  movl %0, %%ss\n"
+        "  movl %1, %%esp\n"
+        : "=&r" (bkup_ss), "=&r" (bkup_esp)
+        : : "eax", "ecx", "edx", "cc", "memory");  // transition32/transition16 are documented to clobber ecx
+
+    // Restore gdt and fs/gs
+    lgdt(&gdt);
+    SET_SEG(FS, fs);
+    SET_SEG(GS, gs);
+
+    // Restore cmos index register
+    outb(cmosindex, PORT_CMOS_INDEX);
+    inb(PORT_CMOS_DATA);
+}
diff --git a/src/usb-ohci.c b/src/usb-ohci.c
index b62090f..6ad1dbc 100644
--- a/src/usb-ohci.c
+++ b/src/usb-ohci.c
@@ -128,6 +128,7 @@
         return;
     struct usb_s *cntl = data;
 
+    // XXX - don't call pci_config_XXX from a thread
     cntl->type = USB_TYPE_OHCI;
     u32 baseaddr = pci_config_readl(cntl->bdf, PCI_BASE_ADDRESS_0);
     cntl->ohci.regs = (void*)(baseaddr & PCI_BASE_ADDRESS_MEM_MASK);
@@ -229,7 +230,8 @@
 
     int ret = wait_ed(ed);
     ed->hwINFO = ED_SKIP;
-    usleep(1); // XXX - in case controller still accessing tds
+    if (ret)
+        usleep(1); // XXX - in case controller still accessing tds
     free(tds);
     return ret;
 }
diff --git a/src/usb-uhci.c b/src/usb-uhci.c
index 64531a9..d98c08b 100644
--- a/src/usb-uhci.c
+++ b/src/usb-uhci.c
@@ -130,6 +130,7 @@
         return;
     struct usb_s *cntl = data;
 
+    // XXX - don't call pci_config_XXX from a thread
     cntl->type = USB_TYPE_UHCI;
     cntl->uhci.iobase = (pci_config_readl(cntl->bdf, PCI_BASE_ADDRESS_4)
                          & PCI_BASE_ADDRESS_IO_MASK);
@@ -218,10 +219,12 @@
     struct uhci_qh *data_qh = cntl->uhci.qh;
     data_qh->element = (u32)&tds[0];
     int ret = wait_qh(cntl, data_qh);
-    if (ret)
+    if (ret) {
+        data_qh->element = UHCI_PTR_TERM;
         // XXX - leak tds
         return ret;
-    // XXX - free(tds);
+    }
+    free(tds);
     return 0;
 }
 
diff --git a/src/util.c b/src/util.c
index d908ed9..f5ae0e1 100644
--- a/src/util.c
+++ b/src/util.c
@@ -228,8 +228,8 @@
     yield();
     while (len) {
         u32 copylen = len;
-        if (copylen > 1024)
-            copylen = 1024;
+        if (copylen > 2048)
+            copylen = 2048;
         len -= copylen;
         copylen /= 4;
         asm volatile(
diff --git a/src/util.h b/src/util.h
index 6837dbb..1eafce0 100644
--- a/src/util.h
+++ b/src/util.h
@@ -126,16 +126,6 @@
     return *(volatile const u8 *)addr;
 }
 
-// GDT bit manipulation
-#define GDT_BASE(v)  ((((u64)(v) & 0xff000000) << 32)           \
-                      | (((u64)(v) & 0x00ffffff) << 16))
-#define GDT_LIMIT(v) ((((u64)(v) & 0x000f0000) << 32)   \
-                      | (((u64)(v) & 0x0000ffff) << 0))
-#define GDT_CODE     (0x9bULL << 40) // Code segment - P,R,A bits also set
-#define GDT_DATA     (0x93ULL << 40) // Data segment - W,A bits also set
-#define GDT_B        (0x1ULL << 54)  // Big flag
-#define GDT_G        (0x1ULL << 55)  // Granularity flag
-
 #define call16_simpint(nr, peax, pflags) do {                           \
         ASSERT16();                                                     \
         asm volatile(                                                   \
@@ -150,6 +140,21 @@
             : "cc", "memory");                                          \
     } while (0)
 
+// GDT bit manipulation
+#define GDT_BASE(v)  ((((u64)(v) & 0xff000000) << 32)           \
+                      | (((u64)(v) & 0x00ffffff) << 16))
+#define GDT_LIMIT(v) ((((u64)(v) & 0x000f0000) << 32)   \
+                      | (((u64)(v) & 0x0000ffff) << 0))
+#define GDT_CODE     (0x9bULL << 40) // Code segment - P,R,A bits also set
+#define GDT_DATA     (0x93ULL << 40) // Data segment - W,A bits also set
+#define GDT_B        (0x1ULL << 54)  // Big flag
+#define GDT_G        (0x1ULL << 55)  // Granularity flag
+
+struct descloc_s {  // Descriptor table location, as used with sgdt/lgdt and the IDT info.
+    u16 length;     // rmode_IDT_info sets this to the table size in bytes minus one
+    u32 addr;       // NOTE(review): presumably the table's linear address - confirm
+} PACKED;
+
 // util.c
 struct bregs;
 inline void call16(struct bregs *callregs);
@@ -188,6 +193,9 @@
 void yield();
 void run_thread(void (*func)(void*), void *data);
 void wait_threads();
+void start_preempt();
+void finish_preempt();
+void check_preempt();
 
 // output.c
 void debug_serial_setup();
@@ -252,6 +260,8 @@
 u64 calc_future_tsc_usec(u32 usecs);
 void handle_1583(struct bregs *regs);
 void handle_1586(struct bregs *regs);
+void useRTC();
+void releaseRTC();
 
 // apm.c
 void VISIBLE16 handle_1553(struct bregs *regs);