arch/arm64/armv8/mmu: Add support for 48-bit VA

The VA space needs to be extended to 48 bits, because on Cavium SoCs the
MMIO region starts at 1 << 47.
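
As a worked example (not part of the patch): with a 4KB granule each
translation level resolves 9 VA bits, so a walk rooted at L1 only covers
12 + 3*9 = 39 VA bits, while an L0 root covers 12 + 4*9 = 48 bits, which
is enough to reach 1 << 47:

    /* Illustrative sketch only: it mirrors the GRANULE_SIZE_SHIFT /
     * BITS_RESOLVED_PER_LVL math from mmu.h and is not coreboot code. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const int granule_shift = 12;               /* 4KB granule */
        const int bits_per_lvl = granule_shift - 3; /* 9 */
        /* VA span of a walk rooted at L1 (2^39) vs. at L0 (2^48) */
        const uint64_t l1_root_span = 1ULL << (granule_shift + 3 * bits_per_lvl);
        const uint64_t l0_root_span = 1ULL << (granule_shift + 4 * bits_per_lvl);
        const uint64_t cavium_mmio_base = 1ULL << 47;

        assert(cavium_mmio_base >= l1_root_span); /* an L1 root cannot map it */
        assert(cavium_mmio_base < l0_root_span);  /* an L0 root (48-bit VA) can */
        return 0;
    }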

The following changes were made to coreboot and libpayload:
 * Use a level 0 page table as the translation root (index layout
   sketched below)
 * Increase VA bits to 48
 * Raise the physical address size in TCR from 64GB to 256TB
 * Add additional asserts on the mapped VA range
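
A rough sketch of that index layout (illustration only; the helper below
is hypothetical and not part of the patch): with a 4KB granule each level
resolves 9 bits, matching the new L0_ADDR_SHIFT/L0_ADDR_MASK macros.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Decompose a 48-bit VA into the four table indices, the same
         * way the translation code does with the L0..L3 shift/mask
         * macros. */
        const uint64_t va = (1ULL << 47) + 0x1234; /* e.g. Cavium MMIO */
        const int granule_shift = 12, bits_per_lvl = 9;

        for (int lvl = 0; lvl <= 3; lvl++) {
            /* shifts: L0:39, L1:30, L2:21, L3:12 */
            int shift = granule_shift + bits_per_lvl * (3 - lvl);
            uint64_t index = (va >> shift) & ((1ULL << bits_per_lvl) - 1);
            printf("L%d index = %llu (VA bits [%d:%d])\n",
                   lvl, (unsigned long long)index,
                   shift + bits_per_lvl - 1, shift);
        }
        return 0;
    }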

Tested on a Cavium SoC and two ARM64 Chromebooks.

Change-Id: I89e6a4809b6b725c3945bad7fce82b0dfee7c262
Signed-off-by: Patrick Rudolph <patrick.rudolph@9elements.com>
Reviewed-on: https://review.coreboot.org/24970
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
Reviewed-by: Julius Werner <jwerner@chromium.org>
diff --git a/payloads/libpayload/arch/arm64/mmu.c b/payloads/libpayload/arch/arm64/mmu.c
index d84f969..c860ee0 100644
--- a/payloads/libpayload/arch/arm64/mmu.c
+++ b/payloads/libpayload/arch/arm64/mmu.c
@@ -172,6 +172,7 @@
 				uint64_t size,
 				uint64_t tag)
 {
+	uint64_t l0_index = (base_addr & L0_ADDR_MASK) >> L0_ADDR_SHIFT;
 	uint64_t l1_index = (base_addr & L1_ADDR_MASK) >> L1_ADDR_SHIFT;
 	uint64_t l2_index = (base_addr & L2_ADDR_MASK) >> L2_ADDR_SHIFT;
 	uint64_t l3_index = (base_addr & L3_ADDR_MASK) >> L3_ADDR_SHIFT;
@@ -179,12 +180,12 @@
 	uint64_t desc;
 	uint64_t attr = get_block_attr(tag);
 
-	/* L1 table lookup
-	 * If VA has bits more than L2 can resolve, lookup starts at L1
-	 * Assumption: we don't need L0 table in coreboot */
-	if (BITS_PER_VA > L1_ADDR_SHIFT) {
-		if ((size >= L1_XLAT_SIZE) &&
-		    IS_ALIGNED(base_addr, (1UL << L1_ADDR_SHIFT))) {
+	/* L0 entry stores a table descriptor (doesn't support blocks) */
+	table = get_next_level_table(&table[l0_index], L1_XLAT_SIZE);
+
+	/* L1 table lookup */
+	if ((size >= L1_XLAT_SIZE) &&
+	    IS_ALIGNED(base_addr, (1UL << L1_ADDR_SHIFT))) {
 			/* If block address is aligned and size is greater than
 			 * or equal to size addressed by each L1 entry, we can
 			 * directly store a block desc */
@@ -192,13 +193,12 @@
 			table[l1_index] = desc;
 			/* L2 lookup is not required */
 			return L1_XLAT_SIZE;
-		}
-		table = get_next_level_table(&table[l1_index], L2_XLAT_SIZE);
 	}
 
-	/* L2 table lookup
-	 * If lookup was performed at L1, L2 table addr is obtained from L1 desc
-	 * else, lookup starts at ttbr address */
+	/* L1 entry stores a table descriptor */
+	table = get_next_level_table(&table[l1_index], L2_XLAT_SIZE);
+
+	/* L2 table lookup */
 	if ((size >= L2_XLAT_SIZE) &&
 	    IS_ALIGNED(base_addr, (1UL << L2_ADDR_SHIFT))) {
 		/* If block address is aligned and size is greater than
@@ -226,6 +226,7 @@
 {
 	assert(!(addr & GRANULE_SIZE_MASK) &&
 	       !(size & GRANULE_SIZE_MASK) &&
+	       (addr + size < (1UL << BITS_PER_VA)) &&
 	       size >= GRANULE_SIZE);
 }
 
@@ -344,7 +345,7 @@
 
 	/* Initialize TCR flags */
 	raw_write_tcr_current(TCR_TOSZ | TCR_IRGN0_NM_WBWAC | TCR_ORGN0_NM_WBWAC |
-			      TCR_SH0_IS | TCR_TG0_4KB | TCR_PS_64GB |
+			      TCR_SH0_IS | TCR_TG0_4KB | TCR_PS_256TB |
 			      TCR_TBI_USED);
 
 	/* Initialize TTBR */
diff --git a/payloads/libpayload/include/arm64/arch/mmu.h b/payloads/libpayload/include/arm64/arch/mmu.h
index 2f87d09..3cea696 100644
--- a/payloads/libpayload/include/arm64/arch/mmu.h
+++ b/payloads/libpayload/include/arm64/arch/mmu.h
@@ -83,7 +83,7 @@
 /* XLAT Table Init Attributes */
 
 #define VA_START                   0x0
-#define BITS_PER_VA                33
+#define BITS_PER_VA                48
 #define MIN_64_BIT_ADDR            (1UL << 32)
 /* Granule size of 4KB is being used */
 #define GRANULE_SIZE_SHIFT         12
@@ -92,14 +92,12 @@
 #define GRANULE_SIZE_MASK          ((1 << GRANULE_SIZE_SHIFT) - 1)
 
 #define BITS_RESOLVED_PER_LVL   (GRANULE_SIZE_SHIFT - 3)
+#define L0_ADDR_SHIFT           (GRANULE_SIZE_SHIFT + BITS_RESOLVED_PER_LVL * 3)
 #define L1_ADDR_SHIFT           (GRANULE_SIZE_SHIFT + BITS_RESOLVED_PER_LVL * 2)
 #define L2_ADDR_SHIFT           (GRANULE_SIZE_SHIFT + BITS_RESOLVED_PER_LVL * 1)
 #define L3_ADDR_SHIFT           (GRANULE_SIZE_SHIFT + BITS_RESOLVED_PER_LVL * 0)
 
-#if BITS_PER_VA > L1_ADDR_SHIFT + BITS_RESOLVED_PER_LVL
-  #error "BITS_PER_VA too large (we don't have L0 table support)"
-#endif
-
+#define L0_ADDR_MASK     (((1UL << BITS_RESOLVED_PER_LVL) - 1) << L0_ADDR_SHIFT)
 #define L1_ADDR_MASK     (((1UL << BITS_RESOLVED_PER_LVL) - 1) << L1_ADDR_SHIFT)
 #define L2_ADDR_MASK     (((1UL << BITS_RESOLVED_PER_LVL) - 1) << L2_ADDR_SHIFT)
 #define L3_ADDR_MASK     (((1UL << BITS_RESOLVED_PER_LVL) - 1) << L3_ADDR_SHIFT)
@@ -109,6 +107,7 @@
 #define L3_XLAT_SIZE               (1UL << L3_ADDR_SHIFT)
 #define L2_XLAT_SIZE               (1UL << L2_ADDR_SHIFT)
 #define L1_XLAT_SIZE               (1UL << L1_ADDR_SHIFT)
+#define L0_XLAT_SIZE               (1UL << L0_ADDR_SHIFT)
 
 /* Block indices required for MAIR */
 #define BLOCK_INDEX_MEM_DEV_NGNRNE 0
diff --git a/src/arch/arm64/armv8/mmu.c b/src/arch/arm64/armv8/mmu.c
index 55bd703..a24e7c6 100644
--- a/src/arch/arm64/armv8/mmu.c
+++ b/src/arch/arm64/armv8/mmu.c
@@ -141,6 +141,7 @@
 				uint64_t size,
 				uint64_t tag)
 {
+	uint64_t l0_index = (base_addr & L0_ADDR_MASK) >> L0_ADDR_SHIFT;
 	uint64_t l1_index = (base_addr & L1_ADDR_MASK) >> L1_ADDR_SHIFT;
 	uint64_t l2_index = (base_addr & L2_ADDR_MASK) >> L2_ADDR_SHIFT;
 	uint64_t l3_index = (base_addr & L3_ADDR_MASK) >> L3_ADDR_SHIFT;
@@ -148,12 +149,12 @@
 	uint64_t desc;
 	uint64_t attr = get_block_attr(tag);
 
-	/* L1 table lookup
-	 * If VA has bits more than L2 can resolve, lookup starts at L1
-	 * Assumption: we don't need L0 table in coreboot */
-	if (BITS_PER_VA > L1_ADDR_SHIFT) {
-		if ((size >= L1_XLAT_SIZE) &&
-		    IS_ALIGNED(base_addr, (1UL << L1_ADDR_SHIFT))) {
+	/* L0 entry stores a table descriptor (doesn't support blocks) */
+	table = get_next_level_table(&table[l0_index], L1_XLAT_SIZE);
+
+	/* L1 table lookup */
+	if ((size >= L1_XLAT_SIZE) &&
+	    IS_ALIGNED(base_addr, (1UL << L1_ADDR_SHIFT))) {
 			/* If block address is aligned and size is greater than
 			 * or equal to size addressed by each L1 entry, we can
 			 * directly store a block desc */
@@ -161,13 +162,12 @@
 			table[l1_index] = desc;
 			/* L2 lookup is not required */
 			return L1_XLAT_SIZE;
-		}
-		table = get_next_level_table(&table[l1_index], L2_XLAT_SIZE);
 	}
 
-	/* L2 table lookup
-	 * If lookup was performed at L1, L2 table addr is obtained from L1 desc
-	 * else, lookup starts at ttbr address */
+	/* L1 entry stores a table descriptor */
+	table = get_next_level_table(&table[l1_index], L2_XLAT_SIZE);
+
+	/* L2 table lookup */
 	if ((size >= L2_XLAT_SIZE) &&
 	    IS_ALIGNED(base_addr, (1UL << L2_ADDR_SHIFT))) {
 		/* If block address is aligned and size is greater than
@@ -195,6 +195,7 @@
 {
 	assert(!(addr & GRANULE_SIZE_MASK) &&
 	       !(size & GRANULE_SIZE_MASK) &&
+	       (addr + size < (1UL << BITS_PER_VA)) &&
 	       size >= GRANULE_SIZE);
 }
 
@@ -202,7 +203,7 @@
  * Desc : Returns the page table entry governing a specific address. */
 static uint64_t get_pte(void *addr)
 {
-	int shift = BITS_PER_VA > L1_ADDR_SHIFT ? L1_ADDR_SHIFT : L2_ADDR_SHIFT;
+	int shift = L0_ADDR_SHIFT;
 	uint64_t *pte = (uint64_t *)_ttb;
 
 	while (1) {
@@ -257,8 +258,8 @@
 	for (; _ettb - (u8 *)table > 0; table += GRANULE_SIZE/sizeof(*table))
 		table[0] = UNUSED_DESC;
 
-	/* Initialize the root table (L1) to be completely unmapped. */
-	uint64_t *root = setup_new_table(INVALID_DESC, L1_XLAT_SIZE);
+	/* Initialize the root table (L0) to be completely unmapped. */
+	uint64_t *root = setup_new_table(INVALID_DESC, L0_XLAT_SIZE);
 	assert((u8 *)root == _ttb);
 
 	/* Initialize TTBR */
@@ -269,7 +270,7 @@
 
 	/* Initialize TCR flags */
 	raw_write_tcr_el3(TCR_TOSZ | TCR_IRGN0_NM_WBWAC | TCR_ORGN0_NM_WBWAC |
-			  TCR_SH0_IS | TCR_TG0_4KB | TCR_PS_64GB |
+			  TCR_SH0_IS | TCR_TG0_4KB | TCR_PS_256TB |
 			  TCR_TBI_USED);
 }
 
diff --git a/src/arch/arm64/include/armv8/arch/mmu.h b/src/arch/arm64/include/armv8/arch/mmu.h
index a812073..f0e551e 100644
--- a/src/arch/arm64/include/armv8/arch/mmu.h
+++ b/src/arch/arm64/include/armv8/arch/mmu.h
@@ -69,7 +69,7 @@
 /* XLAT Table Init Attributes */
 
 #define VA_START                   0x0
-#define BITS_PER_VA                33
+#define BITS_PER_VA                48
 /* Granule size of 4KB is being used */
 #define GRANULE_SIZE_SHIFT         12
 #define GRANULE_SIZE               (1 << GRANULE_SIZE_SHIFT)
@@ -77,14 +77,12 @@
 #define GRANULE_SIZE_MASK          ((1 << GRANULE_SIZE_SHIFT) - 1)
 
 #define BITS_RESOLVED_PER_LVL   (GRANULE_SIZE_SHIFT - 3)
+#define L0_ADDR_SHIFT           (GRANULE_SIZE_SHIFT + BITS_RESOLVED_PER_LVL * 3)
 #define L1_ADDR_SHIFT           (GRANULE_SIZE_SHIFT + BITS_RESOLVED_PER_LVL * 2)
 #define L2_ADDR_SHIFT           (GRANULE_SIZE_SHIFT + BITS_RESOLVED_PER_LVL * 1)
 #define L3_ADDR_SHIFT           (GRANULE_SIZE_SHIFT + BITS_RESOLVED_PER_LVL * 0)
 
-#if BITS_PER_VA > L1_ADDR_SHIFT + BITS_RESOLVED_PER_LVL
-  #error "BITS_PER_VA too large (we don't have L0 table support)"
-#endif
-
+#define L0_ADDR_MASK     (((1UL << BITS_RESOLVED_PER_LVL) - 1) << L0_ADDR_SHIFT)
 #define L1_ADDR_MASK     (((1UL << BITS_RESOLVED_PER_LVL) - 1) << L1_ADDR_SHIFT)
 #define L2_ADDR_MASK     (((1UL << BITS_RESOLVED_PER_LVL) - 1) << L2_ADDR_SHIFT)
 #define L3_ADDR_MASK     (((1UL << BITS_RESOLVED_PER_LVL) - 1) << L3_ADDR_SHIFT)
@@ -94,6 +92,7 @@
 #define L3_XLAT_SIZE               (1UL << L3_ADDR_SHIFT)
 #define L2_XLAT_SIZE               (1UL << L2_ADDR_SHIFT)
 #define L1_XLAT_SIZE               (1UL << L1_ADDR_SHIFT)
+#define L0_XLAT_SIZE               (1UL << L0_ADDR_SHIFT)
 
 /* Block indices required for MAIR */
 #define BLOCK_INDEX_MEM_DEV_NGNRNE 0