x86: Add .data section support for pre-memory stages

x86 pre-memory stages do not support the `.data` section and as a
result developers are required to include runtime initialization code
instead of relying on C global variable definition.

To illustrate the impact of this lack of `.data` section support, here
are two limitations I personally ran into:

1. The inclusion of libgfxinit in romstage for Raptor Lake has
   required some changes in libgfxinit to ensure data is initialized at
   runtime. In addition, we had to manually map some `.data` symbols in
   the `_bss` region.

2. CBFS cache is currently not supported in pre-memory stages and
   enabling it would require adding an initialization function and
   finding a generic spot to call it.

Other platforms do not have that limitation. Hence, resolving it would
help to align code and reduce compilation-based restrictions (cf. the
use of the `ENV_HAS_DATA_SECTION` compilation flag in various places of
coreboot code).

We identified three cases to consider:

1. eXecute-In-Place pre-memory stages
   - code is in SPINOR
   - data is also stored in SPINOR but must be linked in Cache-As-RAM
     and copied there at runtime

2. The `bootblock` stage is a bit different: it uses Cache-As-RAM but
   its memory mapping and its entry code are different

3. pre-memory stages loaded in and executed from
   Cache-As-RAM (cf. `CONFIG_NO_XIP_EARLY_STAGES`).

eXecute-In-Place pre-memory stages (#1) require the creation of a new
ELF segment: the code segment Virtual Memory Address (VMA) and Load
Memory Address (LMA) are identical, whereas the data needs to be
linked in Cache-As-RAM (VMA) but stored right after the code (LMA).

Here is the output of `readelf --segments` on a `romstage.debug` ELF
binary.

    Program Headers:
      Type    Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
      LOAD    0x000080 0x02000000 0x02000000 0x21960 0x21960 R E 0x20
      LOAD    0x0219e0 0xfefb1640 0x02021960 0x00018 0x00018 RW  0x4

     Section to Segment mapping:
      Segment Sections...
       00     .text
       01     .data

Segment 0 `VirtAddr` and `PhysAddr` are at the same address while they
are totally different for the Segment 1 holding the `.data`
section. Since we need the data section `VirtAddr` to be in the
Cache-As-Ram and its `PhysAddr` right after the `.text` section, the
use of a new segment is mandatory.

`bootblock` (#2) also uses this new segment to store the data right
after the code and load it to Cache-As-RAM at runtime. However, the
code involved is different.

Non-eXecute-In-Place pre-memory stages (#3) do not really need any
special work other than enabling a data section, as the code and data
VMA / LMA translation vector is the same.

TEST=#1 and #2 verified on rex and qemu 32 and 64 bits:
     - The `bootblock.debug`, `romstage.debug` and
       `verstage.debug` all have data stored at the end of the `.text`
       section and code to copy the data content to the Cache-As-RAM.
     - The CBFS stages included in the final image have not improperly
       relocated any of the `.data` section symbols.
     - The global data symbols we added for test purposes in bootblock,
       romstage and verstage are properly accessible at runtime
     #3: for "Intel Apollolake DDR3 RVP1" board, we verified that the
     generated romstage ELF includes a .data section similarly to a
     regular memory enabled stage.

Change-Id: I030407fcc72776e59def476daa5b86ad0495debe
Signed-off-by: Jeremy Compostella <jeremy.compostella@intel.com>
Reviewed-on: https://review.coreboot.org/c/coreboot/+/77289
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
Reviewed-by: Arthur Heymans <arthur@aheymans.xyz>
diff --git a/Makefile.inc b/Makefile.inc
index b5665a7..22a4646 100644
--- a/Makefile.inc
+++ b/Makefile.inc
@@ -1267,7 +1267,11 @@
 #
 # Make sure that segment for .car.data is ignored while adding romstage.
 $(CONFIG_CBFS_PREFIX)/romstage-align := 64
+ifeq ($(CONFIG_NO_XIP_EARLY_STAGES),y)
 $(CONFIG_CBFS_PREFIX)/romstage-options := -S ".car.data"
+else
+$(CONFIG_CBFS_PREFIX)/romstage-options := -S ".car.data,.data"
+endif
 
 # If CAR does not support execution of code, romstage on x86 is expected to be
 # xip.
diff --git a/src/arch/x86/Makefile.inc b/src/arch/x86/Makefile.inc
index e0e8a3b..accb022 100644
--- a/src/arch/x86/Makefile.inc
+++ b/src/arch/x86/Makefile.inc
@@ -64,11 +64,6 @@
 $$(objcbfs)/$(1).debug: $$$$($(1)-libs) $$$$($(1)-objs)
 	@printf "    LINK       $$(subst $$(obj)/,,$$(@))\n"
 	$$(LD_$(1)) $$(LDFLAGS_$(1)) -o $$@ -L$$(obj) $$(COMPILER_RT_FLAGS_$(1)) --whole-archive --start-group $$(filter-out %.ld,$$($(1)-objs)) $$($(1)-libs) --no-whole-archive $$(COMPILER_RT_$(1)) --end-group -T $(call src-to-obj,$(1),$(CONFIG_MEMLAYOUT_LD_FILE)) --oformat $(2)
-	-LANG=C LC_ALL= $$(OBJCOPY_$(1)) --only-section .illegal_globals $$(@) $$(objcbfs)/$(1)_null.offenders >/dev/null 2>&1
-	if [ -z "$$$$($$(NM_$(1)) $$(objcbfs)/$(1)_null.offenders 2>&1 | grep 'no symbols')" ];then \
-		echo "Forbidden global variables in $(1):"; \
-		$$(NM_$(1)) $$(objcbfs)/$(1)_null.offenders; false; \
-	fi
 endef
 
 ###############################################################################
diff --git a/src/arch/x86/assembly_entry.S b/src/arch/x86/assembly_entry.S
index 869acc8..9a9a046 100644
--- a/src/arch/x86/assembly_entry.S
+++ b/src/arch/x86/assembly_entry.S
@@ -33,8 +33,8 @@
 	/* reset stack pointer to CAR/EARLYRAM stack */
 	mov	$_STACK_TOP, %esp
 
+#if ENV_SEPARATE_DATA_AND_BSS
 	/* clear .bss section as it is not shared */
-#if ENV_SEPARATE_BSS
 	cld
 	xor	%eax, %eax
 	movl	$(_ebss), %ecx
@@ -42,6 +42,14 @@
 	sub	%edi, %ecx
 	shrl	$2, %ecx
 	rep	stosl
+
+	/* Copy .data section content to Cache-As-Ram */
+	movl	$(_edata), %ecx
+	movl	$(_data), %edi
+	sub	%edi, %ecx
+	shrl	$2, %ecx
+	movl	$(_data_load),%esi
+	rep	movsl
 #endif
 
 #if ((ENV_SEPARATE_VERSTAGE && CONFIG(VERSTAGE_DEBUG_SPINLOOP)) \
diff --git a/src/arch/x86/car.ld b/src/arch/x86/car.ld
index ab6c3b0..a1a782f 100644
--- a/src/arch/x86/car.ld
+++ b/src/arch/x86/car.ld
@@ -59,7 +59,7 @@
 	 * cbmem console. This is useful for clearing this area on a per-stage
 	 * basis when more than one stage uses cache-as-ram. */
 
-#if ENV_SEPARATE_BSS
+#if ENV_SEPARATE_DATA_AND_BSS
 	. = ALIGN(ARCH_POINTER_ALIGN_SIZE);
 	_bss = .;
 	/* Allow global uninitialized variables for stages without CAR teardown. */
@@ -89,11 +89,32 @@
 	_shadow_size = (_ebss - _car_region_start) >> 3;
 	REGION(asan_shadow, ., _shadow_size, ARCH_POINTER_ALIGN_SIZE)
 #endif
-	_car_unallocated_start = .;
-	_car_region_end = . + CONFIG_DCACHE_RAM_SIZE - (. - _car_region_start)
-			  - CONFIG_FSP_T_RESERVED_SIZE;
 }
 
+#if ENV_SEPARATE_DATA_AND_BSS
+/* This symbol defines the load address of the Cache-As-RAM .data
+ * section. It should be right at the end of the .text section (_etext)
+ * and ARCH_POINTER_ALIGN_SIZE aligned. */
+_data_load = _etext;
+
+_bogus = ASSERT(_etext == ALIGN(_etext, ARCH_POINTER_ALIGN_SIZE), "Cache-As-RAM load address is improperly defined.");
+
+.data ALIGN(ARCH_POINTER_ALIGN_SIZE) : AT (_data_load) {
+	_data = .;
+	*(.data);
+	*(.data.*);
+	*(.sdata);
+	*(.sdata.*);
+	. = ALIGN(ARCH_POINTER_ALIGN_SIZE);
+	_edata = .;
+	RECORD_SIZE(data)
+} : data_segment
+#endif
+
+_car_unallocated_start = .;
+_car_region_end = . + CONFIG_DCACHE_RAM_SIZE - (. - _car_region_start)
+		  - CONFIG_FSP_T_RESERVED_SIZE;
+
 . = _car_region_start;
 .car.fspm_rc_heap . (NOLOAD) : {
 . += CONFIG_FSP_M_RC_HEAP_SIZE;
@@ -124,18 +145,11 @@
 _rom_mtrr_base = _rom_mtrr_mask;
 #endif
 
-/* Global variables are not allowed in romstage
- * This section is checked during stage creation to ensure
- * that there are no global variables present
- */
-
-. = 0xffffff00;
-.illegal_globals . : {
-	*(.data)
-	*(.data.*)
-}
-
+#if ENV_SEPARATE_DATA_AND_BSS
+_bogus = ASSERT((CONFIG_DCACHE_RAM_SIZE == 0) || (SIZEOF(.car.data) + SIZEOF(.data) <= CONFIG_DCACHE_RAM_SIZE), "Cache as RAM area is too full");
+#else
 _bogus = ASSERT((CONFIG_DCACHE_RAM_SIZE == 0) || (SIZEOF(.car.data) <= CONFIG_DCACHE_RAM_SIZE), "Cache as RAM area is too full");
+#endif
 #if CONFIG(PAGING_IN_CACHE_AS_RAM)
 _bogus2 = ASSERT(_pagetables == ALIGN(_pagetables, 4096), "_pagetables aren't 4KiB aligned");
 #endif
diff --git a/src/arch/x86/include/arch/header.ld b/src/arch/x86/include/arch/header.ld
index 5b380fa..2ee0212 100644
--- a/src/arch/x86/include/arch/header.ld
+++ b/src/arch/x86/include/arch/header.ld
@@ -3,6 +3,9 @@
 PHDRS
 {
 	to_load PT_LOAD;
+#if ENV_SEPARATE_DATA_AND_BSS
+	data_segment PT_LOAD;
+#endif
 }
 
 ENTRY(_start)
diff --git a/src/arch/x86/memlayout.ld b/src/arch/x86/memlayout.ld
index 549c2f9..f448bf8 100644
--- a/src/arch/x86/memlayout.ld
+++ b/src/arch/x86/memlayout.ld
@@ -3,6 +3,16 @@
 #include <memlayout.h>
 #include <arch/header.ld>
 
+/*
+ * The bootblock linker script should be included before the Cache-As-RAM linker
+ * script. Indeed, if it is included after and Cache-As-RAM .data section
+ * support is enabled, the definition order of the sections makes the linker
+ * create an image with an almost 4 GB hole.
+ */
+#if ENV_BOOTBLOCK
+INCLUDE "bootblock/arch/x86/bootblock.ld"
+#endif /* ENV_BOOTBLOCK */
+
 SECTIONS
 {
 	/*
@@ -36,7 +46,3 @@
 	POSTCAR(32M, 1M)
 #endif
 }
-
-#if ENV_BOOTBLOCK
-	INCLUDE "bootblock/arch/x86/bootblock.ld"
-#endif  /* ENV_BOOTBLOCK */
diff --git a/src/cpu/intel/car/core2/cache_as_ram.S b/src/cpu/intel/car/core2/cache_as_ram.S
index 9c60308..e134717 100644
--- a/src/cpu/intel/car/core2/cache_as_ram.S
+++ b/src/cpu/intel/car/core2/cache_as_ram.S
@@ -180,6 +180,9 @@
 	pushl	%eax	/* tsc[31:0] */
 #endif
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 before_c_entry:
 	post_code(POSTCODE_BOOTBLOCK_BEFORE_C_ENTRY)
 	call	bootblock_c_entry_bist
diff --git a/src/cpu/intel/car/non-evict/cache_as_ram.S b/src/cpu/intel/car/non-evict/cache_as_ram.S
index 18ac070..76986ff 100644
--- a/src/cpu/intel/car/non-evict/cache_as_ram.S
+++ b/src/cpu/intel/car/non-evict/cache_as_ram.S
@@ -233,6 +233,9 @@
 	pushl	%eax	/* tsc[31:0] */
 #endif
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 before_c_entry:
 	post_code(POSTCODE_BOOTBLOCK_BEFORE_C_ENTRY)
 	call	bootblock_c_entry_bist
diff --git a/src/cpu/intel/car/p3/cache_as_ram.S b/src/cpu/intel/car/p3/cache_as_ram.S
index 779dbcc..623cf41 100644
--- a/src/cpu/intel/car/p3/cache_as_ram.S
+++ b/src/cpu/intel/car/p3/cache_as_ram.S
@@ -155,6 +155,9 @@
 	movd	%mm1, %eax
 	pushl	%eax	/* tsc[31:0] */
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 before_c_entry:
 	post_code(POSTCODE_BOOTBLOCK_BEFORE_C_ENTRY)
 	call	bootblock_c_entry_bist
diff --git a/src/cpu/intel/car/p4-netburst/cache_as_ram.S b/src/cpu/intel/car/p4-netburst/cache_as_ram.S
index 9f514ef..f7c023b 100644
--- a/src/cpu/intel/car/p4-netburst/cache_as_ram.S
+++ b/src/cpu/intel/car/p4-netburst/cache_as_ram.S
@@ -380,6 +380,9 @@
 	pushl	%eax	/* tsc[31:0] */
 #endif
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 before_c_entry:
 	post_code(POSTCODE_BOOTBLOCK_BEFORE_C_ENTRY)
 	call	bootblock_c_entry_bist
diff --git a/src/cpu/qemu-x86/cache_as_ram_bootblock.S b/src/cpu/qemu-x86/cache_as_ram_bootblock.S
index fe872de..0943e35 100644
--- a/src/cpu/qemu-x86/cache_as_ram_bootblock.S
+++ b/src/cpu/qemu-x86/cache_as_ram_bootblock.S
@@ -100,6 +100,9 @@
 	pushl	%eax
 #endif
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 before_c_entry:
 	call	bootblock_c_entry_bist
 	/* Never returns */
diff --git a/src/cpu/x86/copy_data_section.inc b/src/cpu/x86/copy_data_section.inc
new file mode 100644
index 0000000..dccb8d3
--- /dev/null
+++ b/src/cpu/x86/copy_data_section.inc
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#if ENV_SEPARATE_DATA_AND_BSS
+
+/*
+ * Copy .data section content to Cache-As-Ram: (_edata - _data) bytes are moved
+ * from _data_load to _data, 32 bits at a time. Usable from 32-bit or 64-bit
+ * code. %ecx, %edi and %esi are saved and restored; EFLAGS is modified.
+ */
+copy_data_section:
+#if ENV_X86_64
+	push	%rcx
+	push	%rdi
+	push	%rsi
+#else
+	pushl	%ecx
+	pushl	%edi
+	pushl	%esi
+#endif
+
+	movl	$(_edata), %ecx		/* ecx = end of .data (link address) */
+	movl	$(_data), %edi		/* edi = destination: start of .data */
+	sub	%edi, %ecx		/* ecx = .data size in bytes */
+	movl	$(_data_load),%esi	/* esi = source: .data load address */
+	shrl	$2, %ecx		/* bytes -> 32-bit words, remainder dropped */
+	rep	movsl			/* NOTE(review): no cld here; assumes DF=0 - confirm */
+
+#if ENV_X86_64
+	pop	%rsi
+	pop	%rdi
+	pop	%rcx
+#else
+	popl	%esi
+	popl	%edi
+	popl	%ecx
+#endif
+
+#endif	/* ENV_SEPARATE_DATA_AND_BSS */
diff --git a/src/drivers/amd/agesa/cache_as_ram.S b/src/drivers/amd/agesa/cache_as_ram.S
index 5e77263..c10c369 100644
--- a/src/drivers/amd/agesa/cache_as_ram.S
+++ b/src/drivers/amd/agesa/cache_as_ram.S
@@ -57,6 +57,9 @@
 	movd	%mm1, %eax
 	pushl	%eax		/* tsc[31:0] */
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 	post_code(POSTCODE_BOOTBLOCK_PRE_C_DONE)
 
 	call	bootblock_c_entry
diff --git a/src/drivers/intel/fsp1_1/cache_as_ram.S b/src/drivers/intel/fsp1_1/cache_as_ram.S
index 17e0a69..c8eae7f 100644
--- a/src/drivers/intel/fsp1_1/cache_as_ram.S
+++ b/src/drivers/intel/fsp1_1/cache_as_ram.S
@@ -180,6 +180,9 @@
 	movd	%mm0, %eax
 	pushl	%eax	/* tsc[31:0] */
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 before_romstage:
 	/* Call bootblock_c_entry(uint64_t base_timestamp) */
 	call	bootblock_c_entry
diff --git a/src/include/rules.h b/src/include/rules.h
index ae8118e..02c36f2 100644
--- a/src/include/rules.h
+++ b/src/include/rules.h
@@ -274,16 +274,16 @@
 #if ENV_X86
 /* Indicates memory layout is determined with arch/x86/car.ld. */
 #define ENV_CACHE_AS_RAM		(ENV_ROMSTAGE_OR_BEFORE && !CONFIG(RESET_VECTOR_IN_RAM))
-/* No .data sections with execute-in-place from ROM.  */
-#define ENV_HAS_DATA_SECTION	!ENV_CACHE_AS_RAM
 #else
-/* Both .data and .bss, sometimes SRAM not DRAM. */
-#define ENV_HAS_DATA_SECTION	1
 #define ENV_CACHE_AS_RAM		0
 #endif
 
-/* Indicates if the stage uses the _bss region defined in arch/x86/car.ld */
-#define ENV_SEPARATE_BSS	(ENV_CACHE_AS_RAM && (ENV_BOOTBLOCK || !CONFIG(NO_XIP_EARLY_STAGES)))
+/* Indicates .data section support. */
+#define ENV_HAS_DATA_SECTION		1
+
+/* Indicates if the stage uses the _data and _bss regions defined in
+ * arch/x86/car.ld */
+#define ENV_SEPARATE_DATA_AND_BSS	(ENV_CACHE_AS_RAM && (ENV_BOOTBLOCK || !CONFIG(NO_XIP_EARLY_STAGES)))
 
 /* Currently rmodules, ramstage and smm have heap. */
 #define ENV_HAS_HEAP_SECTION	(ENV_RMODULE || ENV_RAMSTAGE || ENV_SMM)
diff --git a/src/include/symbols.h b/src/include/symbols.h
index a03af08..5410798 100644
--- a/src/include/symbols.h
+++ b/src/include/symbols.h
@@ -50,6 +50,11 @@
 DECLARE_REGION(asan_shadow)
 #endif
 
+#if ENV_SEPARATE_DATA_AND_BSS
+DECLARE_REGION(data)
+DECLARE_REGION(data_load)
+#endif
+
 /* Regions for execution units. */
 
 DECLARE_REGION(payload)
diff --git a/src/lib/program.ld b/src/lib/program.ld
index 67f685f..f406f9f 100644
--- a/src/lib/program.ld
+++ b/src/lib/program.ld
@@ -72,7 +72,7 @@
 #endif
 
 /* Include data, bss, and heap in that order. Not defined for all stages. */
-#if ENV_HAS_DATA_SECTION
+#if !ENV_SEPARATE_DATA_AND_BSS
 .data . : {
 	. = ALIGN(ARCH_CACHELINE_ALIGN_SIZE);
 	_data = .;
@@ -116,7 +116,7 @@
 }
 #endif
 
-#if !ENV_SEPARATE_BSS
+#if !ENV_SEPARATE_DATA_AND_BSS
 .bss . : {
 	. = ALIGN(ARCH_POINTER_ALIGN_SIZE);
 	_bss = .;
diff --git a/src/security/vboot/Makefile.inc b/src/security/vboot/Makefile.inc
index 62a5660..f152444 100644
--- a/src/security/vboot/Makefile.inc
+++ b/src/security/vboot/Makefile.inc
@@ -137,7 +137,12 @@
 endif	# CONFIG_VBOOT_STARTS_BEFORE_BOOTBLOCK
 
 ifeq ($(CONFIG_ARCH_VERSTAGE_X86_32)$(CONFIG_ARCH_VERSTAGE_X86_64),y)
-$(CONFIG_CBFS_PREFIX)/verstage-options := -a 64 -S ".car.data"
+$(CONFIG_CBFS_PREFIX)/verstage-options := -a 64
+ifeq ($(CONFIG_NO_XIP_EARLY_STAGES),y)
+$(CONFIG_CBFS_PREFIX)/verstage-options += -S ".car.data"
+else
+$(CONFIG_CBFS_PREFIX)/verstage-options += -S ".car.data,.data"
+endif
 
 # If CAR does not support execution of code, verstage on x86 is expected to be
 # xip.
diff --git a/src/soc/amd/common/block/cpu/car/cache_as_ram.S b/src/soc/amd/common/block/cpu/car/cache_as_ram.S
index 2bd3f50..8d41826 100644
--- a/src/soc/amd/common/block/cpu/car/cache_as_ram.S
+++ b/src/soc/amd/common/block/cpu/car/cache_as_ram.S
@@ -41,6 +41,9 @@
 	movd	%mm1, %eax
 	pushl	%eax		/* tsc[31:0] */
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 before_carstage:
 	post_code(POSTCODE_BOOTBLOCK_PRE_C_DONE)
 
diff --git a/src/soc/intel/common/block/cpu/car/cache_as_ram.S b/src/soc/intel/common/block/cpu/car/cache_as_ram.S
index 3c8dc2e..61cbe307 100644
--- a/src/soc/intel/common/block/cpu/car/cache_as_ram.S
+++ b/src/soc/intel/common/block/cpu/car/cache_as_ram.S
@@ -295,6 +295,9 @@
 	pushl	%eax	/* tsc[31:0] */
 #endif
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 before_carstage:
 	post_code(POSTCODE_SOC_BEFORE_CARSTAGE)
 
diff --git a/src/soc/intel/common/block/cpu/car/cache_as_ram_fsp.S b/src/soc/intel/common/block/cpu/car/cache_as_ram_fsp.S
index 7532c7d..238a57e 100644
--- a/src/soc/intel/common/block/cpu/car/cache_as_ram_fsp.S
+++ b/src/soc/intel/common/block/cpu/car/cache_as_ram_fsp.S
@@ -93,6 +93,9 @@
 	movd	%mm1, %eax
 	push	%eax
 
+	/* Copy .data section content to Cache-As-Ram */
+#include <cpu/x86/copy_data_section.inc>
+
 	/* We can call into C functions now */
 	call	bootblock_c_entry