David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 1 | /***********************license start*********************************** |
| 2 | * Copyright (c) 2003-2017 Cavium Inc. (support@cavium.com). All rights |
| 3 | * reserved. |
| 4 | * |
| 5 | * |
| 6 | * Redistribution and use in source and binary forms, with or without |
| 7 | * modification, are permitted provided that the following conditions are |
| 8 | * met: |
| 9 | * |
| 10 | * * Redistributions of source code must retain the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer. |
| 12 | * |
| 13 | * * Redistributions in binary form must reproduce the above |
| 14 | * copyright notice, this list of conditions and the following |
| 15 | * disclaimer in the documentation and/or other materials provided |
| 16 | * with the distribution. |
| 17 | * |
| 18 | * * Neither the name of Cavium Inc. nor the names of |
| 19 | * its contributors may be used to endorse or promote products |
| 20 | * derived from this software without specific prior written |
| 21 | * permission. |
| 22 | * |
| 23 | * This Software, including technical data, may be subject to U.S. export |
| 24 | * control laws, including the U.S. Export Administration Act and its |
| 25 | * associated regulations, and may be subject to export or import |
| 26 | * regulations in other countries. |
| 27 | * |
| 28 | * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS" |
| 29 | * AND WITH ALL FAULTS AND CAVIUM INC. MAKES NO PROMISES, REPRESENTATIONS OR |
| 30 | * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT |
| 31 | * TO THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY |
| 32 | * REPRESENTATION OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT |
| 33 | * DEFECTS, AND CAVIUM SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES |
| 34 | * OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR |
| 35 | * PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, |
| 36 | * QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. THE ENTIRE RISK |
| 37 | * ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE LIES WITH YOU. |
| 38 | ***********************license end**************************************/ |
| 39 | #include <bdk.h> |
| 40 | #include "libbdk-arch/bdk-csrs-gti.h" |
| 41 | #include "libbdk-arch/bdk-csrs-ocx.h" |
| 42 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 43 | #include <bdk-minimal.h> /* for printf --> printk */ |
| 44 | #include <libbdk-dram/bdk-dram-test.h> |
| 45 | #include <libbdk-hal/bdk-atomic.h> |
| 46 | #include <libbdk-hal/bdk-clock.h> |
| 47 | #include <libbdk-hal/bdk-utils.h> |
| 48 | #include <libbdk-os/bdk-init.h> |
| 49 | #include <libbdk-os/bdk-thread.h> |
| 50 | |
/* This code is an optional part of the BDK. It is only linked in
   if BDK_REQUIRE() needs it */
BDK_REQUIRE_DEFINE(DRAM_TEST);

/* Once this many errors have been counted, further errors are no longer printed */
#define MAX_ERRORS_TO_REPORT 50
/* Denominator used when a retry failure count is reported as a percentage */
#define RETRY_LIMIT 1000
| 57 | |
| 58 | typedef struct |
| 59 | { |
| 60 | const char * name; /* Friendly name for the test */ |
| 61 | __bdk_dram_test_t test_func; /* Function to call */ |
| 62 | int bursts; /* Bursts parameter to pass to the test */ |
| 63 | int max_cores; /* Maximum number of cores the test should be run on in parallel. Zero means all */ |
| 64 | } dram_test_info_t; |
| 65 | |
| 66 | static const dram_test_info_t TEST_INFO[] = { |
| 67 | /* Name, Test function, Bursts, Max Cores */ |
| 68 | { "Data Bus", __bdk_dram_test_mem_data_bus, 8, 1}, |
| 69 | { "Address Bus", __bdk_dram_test_mem_address_bus, 0, 1}, |
| 70 | { "Marching Rows", __bdk_dram_test_mem_rows, 16, 0}, |
| 71 | { "Random Data", __bdk_dram_test_mem_random, 32, 0}, |
| 72 | { "Random XOR (32 Burst)", __bdk_dram_test_mem_xor, 32, 0}, |
| 73 | { "Self Address", __bdk_dram_test_mem_self_addr, 1, 0}, |
| 74 | { "March C- Solid Bits", __bdk_dram_test_mem_solid, 1, 0}, |
| 75 | { "March C- Checkerboard", __bdk_dram_test_mem_checkerboard, 1, 0}, |
| 76 | { "Walking Ones Left", __bdk_dram_test_mem_leftwalk1, 1, 0}, |
| 77 | { "Walking Ones Right", __bdk_dram_test_mem_rightwalk1, 1, 0}, |
| 78 | { "Walking Zeros Left", __bdk_dram_test_mem_leftwalk0, 1, 0}, |
| 79 | { "Walking Zeros Right", __bdk_dram_test_mem_rightwalk0, 1, 0}, |
| 80 | { "Random XOR (224 Burst)", __bdk_dram_test_mem_xor, 224, 0}, |
| 81 | { "Fast Scan", __bdk_dram_test_fast_scan, 0, 0}, |
| 82 | { NULL, NULL, 0, 0} |
| 83 | }; |
| 84 | |
| 85 | /* These variables count the number of ECC errors. They should only be accessed atomically */ |
| 86 | int64_t __bdk_dram_ecc_single_bit_errors[BDK_MAX_MEM_CHANS]; |
| 87 | int64_t __bdk_dram_ecc_double_bit_errors[BDK_MAX_MEM_CHANS]; |
| 88 | |
| 89 | static int64_t dram_test_thread_done; |
| 90 | static int64_t dram_test_thread_errors; |
| 91 | static uint64_t dram_test_thread_start; |
| 92 | static uint64_t dram_test_thread_end; |
| 93 | static uint64_t dram_test_thread_size; |
| 94 | |
| 95 | /** |
| 96 | * Force the memory at the pointer location to be written to memory and evicted |
| 97 | * from L2. L1 will be unaffected. |
| 98 | * |
| 99 | * @param address Physical memory location |
| 100 | */ |
| 101 | void __bdk_dram_flush_to_mem(uint64_t address) |
| 102 | { |
| 103 | BDK_MB; |
Jacob Garber | 4926e98 | 2019-07-26 11:45:43 -0600 | [diff] [blame] | 104 | char *ptr = bdk_phys_to_ptr(address); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 105 | BDK_CACHE_WBI_L2(ptr); |
| 106 | } |
| 107 | |
| 108 | /** |
| 109 | * Force a memory region to be written to DRAM and evicted from L2 |
| 110 | * |
| 111 | * @param area Start of the region |
| 112 | * @param max_address |
| 113 | * End of the region (exclusive) |
| 114 | */ |
| 115 | void __bdk_dram_flush_to_mem_range(uint64_t area, uint64_t max_address) |
| 116 | { |
Jacob Garber | 4926e98 | 2019-07-26 11:45:43 -0600 | [diff] [blame] | 117 | char *ptr = bdk_phys_to_ptr(area); |
| 118 | char *end = bdk_phys_to_ptr(max_address); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 119 | BDK_MB; |
| 120 | while (ptr < end) |
| 121 | { |
| 122 | BDK_CACHE_WBI_L2(ptr); |
| 123 | ptr += 128; |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | /** |
| 128 | * Convert a test enumeration into a string |
| 129 | * |
| 130 | * @param test Test to convert |
| 131 | * |
| 132 | * @return String for display |
| 133 | */ |
| 134 | const char *bdk_dram_get_test_name(int test) |
| 135 | { |
| 136 | if (test < (int)(sizeof(TEST_INFO) / sizeof(TEST_INFO[0]))) |
| 137 | return TEST_INFO[test].name; |
| 138 | else |
| 139 | return NULL; |
| 140 | } |
| 141 | |
| 142 | static bdk_dram_test_flags_t dram_test_flags; // FIXME: Don't use global |
| 143 | /** |
| 144 | * This function is run as a thread to perform memory tests over multiple cores. |
| 145 | * Each thread gets a section of memory to work on, which is controlled by global |
| 146 | * variables at the beginning of this file. |
| 147 | * |
| 148 | * @param arg Number of the region we should check |
Jonathan Neuschäfer | 45e6c82 | 2018-12-11 17:53:07 +0100 | [diff] [blame] | 149 | * @param arg1 Pointer to the test_info structure |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 150 | */ |
| 151 | static void dram_test_thread(int arg, void *arg1) |
| 152 | { |
| 153 | const dram_test_info_t *test_info = arg1; |
| 154 | const int bursts = test_info->bursts; |
| 155 | const int range_number = arg; |
| 156 | |
| 157 | /* Figure out our work memory range. |
| 158 | * |
| 159 | * Note start_address and end_address just provide the physical offset |
| 160 | * portion of the address and do not have the node bits set. This is |
| 161 | * to simplify address checks and calculations. Later, when about to run |
| 162 | * the memory test, the routines adds in the node bits to form the final |
| 163 | * addresses. |
| 164 | */ |
| 165 | uint64_t start_address = dram_test_thread_start + dram_test_thread_size * range_number; |
| 166 | uint64_t end_address = start_address + dram_test_thread_size; |
| 167 | if (end_address > dram_test_thread_end) |
| 168 | end_address = dram_test_thread_end; |
| 169 | |
| 170 | bdk_node_t test_node = bdk_numa_local(); |
| 171 | if (dram_test_flags & BDK_DRAM_TEST_USE_CCPI) |
| 172 | test_node ^= 1; |
| 173 | /* Insert the node part of the address */ |
| 174 | start_address = bdk_numa_get_address(test_node, start_address); |
| 175 | end_address = bdk_numa_get_address(test_node, end_address); |
| 176 | /* Test the region */ |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 177 | BDK_TRACE(DRAM_TEST, " Node %d, core %d, Testing [0x%011llx:0x%011llx]\n", |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 178 | bdk_numa_local(), bdk_get_core_num() & 127, start_address, end_address - 1); |
| 179 | test_info->test_func(start_address, end_address, bursts); |
| 180 | |
| 181 | /* Report that we're done */ |
| 182 | BDK_TRACE(DRAM_TEST, "Thread %d on node %d done with memory test\n", range_number, bdk_numa_local()); |
| 183 | bdk_atomic_add64_nosync(&dram_test_thread_done, 1); |
| 184 | } |
| 185 | |
| 186 | /** |
| 187 | * Run the memory test. |
| 188 | * |
| 189 | * @param test_info |
| 190 | * @param start_address |
| 191 | * Physical address to start at |
| 192 | * @param length Length of memory block |
| 193 | * @param flags Flags to control memory test options. Zero defaults to testing all |
| 194 | * node with statistics and progress output. |
| 195 | * |
| 196 | * @return Number of errors found. Zero is success. Negative means the test |
| 197 | * did not run due to some other failure. |
| 198 | */ |
| 199 | static int __bdk_dram_run_test(const dram_test_info_t *test_info, uint64_t start_address, |
| 200 | uint64_t length, bdk_dram_test_flags_t flags) |
| 201 | { |
| 202 | /* Figure out the addess of the byte one off the top of memory */ |
| 203 | uint64_t max_address = bdk_dram_get_size_mbytes(bdk_numa_local()); |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 204 | BDK_TRACE(DRAM_TEST, "DRAM available per node: %llu MB\n", max_address); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 205 | max_address <<= 20; |
| 206 | |
| 207 | /* Make sure we have enough */ |
| 208 | if (max_address < (16<<20)) |
| 209 | { |
| 210 | bdk_error("DRAM size is too small\n"); |
| 211 | return -1; |
| 212 | } |
| 213 | |
| 214 | /* Make sure the amount is sane */ |
| 215 | if (CAVIUM_IS_MODEL(CAVIUM_CN8XXX)) |
| 216 | { |
| 217 | if (max_address > (1ull << 40)) /* 40 bits in CN8XXX */ |
| 218 | max_address = 1ull << 40; |
| 219 | } |
| 220 | else |
| 221 | { |
| 222 | if (max_address > (1ull << 43)) /* 43 bits in CN9XXX */ |
| 223 | max_address = 1ull << 43; |
| 224 | } |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 225 | BDK_TRACE(DRAM_TEST, "DRAM max address: 0x%011llx\n", max_address-1); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 226 | |
| 227 | /* Make sure the start address is lower than the top of memory */ |
| 228 | if (start_address >= max_address) |
| 229 | { |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 230 | bdk_error("Start address is larger than the amount of memory: 0x%011llx versus 0x%011llx\n", |
| 231 | start_address, max_address); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 232 | return -1; |
| 233 | } |
| 234 | if (length == (uint64_t)-1) |
| 235 | length = max_address - start_address; |
| 236 | |
| 237 | /* Final range checks */ |
| 238 | uint64_t end_address = start_address + length; |
| 239 | if (end_address > max_address) |
| 240 | { |
| 241 | end_address = max_address; |
| 242 | length = end_address - start_address; |
| 243 | } |
| 244 | if (length == 0) |
| 245 | return 0; |
| 246 | |
| 247 | /* Ready to run the test. Figure out how many cores we need */ |
| 248 | int max_cores = test_info->max_cores; |
| 249 | int total_cores_all_nodes = max_cores; |
| 250 | |
| 251 | /* Figure out the number of cores available in the system */ |
| 252 | if (max_cores == 0) |
| 253 | { |
| 254 | max_cores += bdk_get_num_running_cores(bdk_numa_local()); |
| 255 | /* Calculate the total number of cores being used. The per node number |
| 256 | is confusing to people */ |
| 257 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 258 | if (flags & (1 << node)) |
| 259 | { |
| 260 | if (flags & BDK_DRAM_TEST_USE_CCPI) |
| 261 | total_cores_all_nodes += bdk_get_num_running_cores(node ^ 1); |
| 262 | else |
| 263 | total_cores_all_nodes += bdk_get_num_running_cores(node); |
| 264 | } |
| 265 | } |
| 266 | if (!(flags & BDK_DRAM_TEST_NO_BANNERS)) |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 267 | printf("Starting Test \"%s\" for [0x%011llx:0x%011llx] using %d core(s)\n", |
| 268 | test_info->name, start_address, end_address - 1, total_cores_all_nodes); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 269 | |
| 270 | /* Remember the LMC perf counters for stats after the test */ |
| 271 | uint64_t start_dram_dclk[BDK_NUMA_MAX_NODES][4]; |
| 272 | uint64_t start_dram_ops[BDK_NUMA_MAX_NODES][4]; |
| 273 | uint64_t stop_dram_dclk[BDK_NUMA_MAX_NODES][4]; |
| 274 | uint64_t stop_dram_ops[BDK_NUMA_MAX_NODES][4]; |
| 275 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 276 | { |
| 277 | if (flags & (1 << node)) |
| 278 | { |
| 279 | const int num_dram_controllers = __bdk_dram_get_num_lmc(node); |
| 280 | for (int i = 0; i < num_dram_controllers; i++) |
| 281 | { |
| 282 | start_dram_dclk[node][i] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(i)); |
| 283 | start_dram_ops[node][i] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(i)); |
| 284 | } |
| 285 | } |
| 286 | } |
| 287 | /* Remember the CCPI link counters for stats after the test */ |
| 288 | uint64_t start_ccpi_data[BDK_NUMA_MAX_NODES][3]; |
| 289 | uint64_t start_ccpi_idle[BDK_NUMA_MAX_NODES][3]; |
| 290 | uint64_t start_ccpi_err[BDK_NUMA_MAX_NODES][3]; |
| 291 | uint64_t stop_ccpi_data[BDK_NUMA_MAX_NODES][3]; |
| 292 | uint64_t stop_ccpi_idle[BDK_NUMA_MAX_NODES][3]; |
| 293 | uint64_t stop_ccpi_err[BDK_NUMA_MAX_NODES][3]; |
| 294 | if (!bdk_numa_is_only_one()) |
| 295 | { |
| 296 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 297 | { |
| 298 | if (flags & (1 << node)) |
| 299 | { |
| 300 | for (int link = 0; link < 3; link++) |
| 301 | { |
| 302 | start_ccpi_data[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_DATA_CNT(link)); |
| 303 | start_ccpi_idle[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_IDLE_CNT(link)); |
| 304 | start_ccpi_err[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_ERR_CNT(link)); |
| 305 | } |
| 306 | } |
| 307 | } |
| 308 | } |
| 309 | |
| 310 | /* WARNING: This code assumes the same memory range is being tested on |
| 311 | all nodes. The same number of cores are used on each node to test |
| 312 | its local memory */ |
| 313 | uint64_t work_address = start_address; |
| 314 | dram_test_flags = flags; |
| 315 | bdk_atomic_set64(&dram_test_thread_errors, 0); |
| 316 | while ((work_address < end_address) && ((dram_test_thread_errors == 0) || (flags & BDK_DRAM_TEST_NO_STOP_ERROR))) |
| 317 | { |
| 318 | /* Check at most MAX_CHUNK_SIZE across each iteration. We only report |
| 319 | progress between chunks, so keep them reasonably small */ |
| 320 | const uint64_t MAX_CHUNK_SIZE = 1ull << 28; /* 256MB */ |
| 321 | uint64_t size = end_address - work_address; |
| 322 | if (size > MAX_CHUNK_SIZE) |
| 323 | size = MAX_CHUNK_SIZE; |
| 324 | |
| 325 | /* Divide memory evenly between the cores. Round the size up so that |
| 326 | all memory is covered. The last core may have slightly less memory to |
| 327 | test */ |
| 328 | uint64_t thread_size = (size + (max_cores - 1)) / max_cores; |
| 329 | thread_size += 127; |
| 330 | thread_size &= -128; |
| 331 | dram_test_thread_start = work_address; |
| 332 | dram_test_thread_end = work_address + size; |
| 333 | dram_test_thread_size = thread_size; |
| 334 | BDK_WMB; |
| 335 | |
| 336 | /* Poke the watchdog */ |
| 337 | BDK_CSR_WRITE(bdk_numa_local(), BDK_GTI_CWD_POKEX(0), 0); |
| 338 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 339 | /* disable progress output when batch mode is ON */ |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 340 | if (!(flags & BDK_DRAM_TEST_NO_PROGRESS)) { |
| 341 | |
| 342 | /* Report progress percentage */ |
| 343 | int percent_x10 = (work_address - start_address) * 1000 / (end_address - start_address); |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 344 | printf(" %3d.%d%% complete, testing [0x%011llx:0x%011llx]\r", |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 345 | percent_x10 / 10, percent_x10 % 10, work_address, work_address + size - 1); |
| 346 | fflush(stdout); |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 347 | } |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 348 | |
| 349 | work_address += size; |
| 350 | |
| 351 | /* Start threads for all the cores */ |
| 352 | int total_count = 0; |
| 353 | bdk_atomic_set64(&dram_test_thread_done, 0); |
| 354 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 355 | { |
| 356 | if (flags & (1 << node)) |
| 357 | { |
| 358 | const int num_cores = bdk_get_num_cores(node); |
| 359 | int per_node = 0; |
| 360 | for (int core = 0; core < num_cores; core++) |
| 361 | { |
| 362 | if (per_node >= max_cores) |
| 363 | break; |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 364 | BDK_TRACE(DRAM_TEST, "Starting thread %d on node %d for memory test\n", per_node, node); |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 365 | dram_test_thread(per_node, (void *)test_info); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 366 | } |
| 367 | } |
| 368 | } |
| 369 | |
| 370 | #if 0 |
| 371 | /* Wait for threads to finish */ |
| 372 | while (bdk_atomic_get64(&dram_test_thread_done) < total_count) |
| 373 | bdk_thread_yield(); |
| 374 | #else |
| 375 | #define TIMEOUT_SECS 30 // FIXME: long enough so multicore RXOR 224 should not print out |
| 376 | /* Wait for threads to finish, with progress */ |
| 377 | int cur_count; |
| 378 | uint64_t cur_time; |
Peter Lemenkov | 7bbe3bb | 2018-12-07 11:23:21 +0100 | [diff] [blame] | 379 | uint64_t period = bdk_clock_get_rate(bdk_numa_local(), BDK_CLOCK_TIME) * TIMEOUT_SECS; // FIXME? |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 380 | uint64_t timeout = bdk_clock_get_count(BDK_CLOCK_TIME) + period; |
| 381 | do { |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 382 | cur_count = bdk_atomic_get64(&dram_test_thread_done); |
| 383 | cur_time = bdk_clock_get_count(BDK_CLOCK_TIME); |
| 384 | if (cur_time >= timeout) { |
| 385 | BDK_TRACE(DRAM_TEST, "N%d: Waiting for %d cores\n", |
| 386 | bdk_numa_local(), total_count - cur_count); |
| 387 | timeout = cur_time + period; |
| 388 | } |
| 389 | } while (cur_count < total_count); |
| 390 | #endif |
| 391 | } |
| 392 | |
| 393 | /* Get the DRAM perf counters */ |
| 394 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 395 | { |
| 396 | if (flags & (1 << node)) |
| 397 | { |
| 398 | const int num_dram_controllers = __bdk_dram_get_num_lmc(node); |
| 399 | for (int i = 0; i < num_dram_controllers; i++) |
| 400 | { |
| 401 | stop_dram_dclk[node][i] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(i)); |
| 402 | stop_dram_ops[node][i] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(i)); |
| 403 | } |
| 404 | } |
| 405 | } |
| 406 | /* Get the CCPI link counters */ |
| 407 | if (!bdk_numa_is_only_one()) |
| 408 | { |
| 409 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 410 | { |
| 411 | if (flags & (1 << node)) |
| 412 | { |
| 413 | for (int link = 0; link < 3; link++) |
| 414 | { |
| 415 | stop_ccpi_data[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_DATA_CNT(link)); |
| 416 | stop_ccpi_idle[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_IDLE_CNT(link)); |
| 417 | stop_ccpi_err[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_ERR_CNT(link)); |
| 418 | } |
| 419 | } |
| 420 | } |
| 421 | } |
| 422 | |
| 423 | /* disable progress output when batch mode is ON */ |
| 424 | if (!(flags & BDK_DRAM_TEST_NO_PROGRESS)) { |
| 425 | |
| 426 | /* Report progress percentage as complete */ |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 427 | printf(" %3d.%d%% complete, testing [0x%011llx:0x%011llx]\n", |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 428 | 100, 0, start_address, end_address - 1); |
| 429 | fflush(stdout); |
| 430 | } |
| 431 | |
| 432 | if (!(flags & BDK_DRAM_TEST_NO_STATS)) |
| 433 | { |
| 434 | /* Display LMC load */ |
| 435 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 436 | { |
| 437 | if (flags & (1 << node)) |
| 438 | { |
| 439 | const int num_dram_controllers = __bdk_dram_get_num_lmc(node); |
| 440 | for (int i = 0; i < num_dram_controllers; i++) |
| 441 | { |
| 442 | uint64_t ops = stop_dram_ops[node][i] - start_dram_ops[node][i]; |
| 443 | uint64_t dclk = stop_dram_dclk[node][i] - start_dram_dclk[node][i]; |
| 444 | if (dclk == 0) |
| 445 | dclk = 1; |
| 446 | uint64_t percent_x10 = ops * 1000 / dclk; |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 447 | printf(" Node %d, LMC%d: ops %llu, cycles %llu, used %llu.%llu%%\n", |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 448 | node, i, ops, dclk, percent_x10 / 10, percent_x10 % 10); |
| 449 | } |
| 450 | } |
| 451 | } |
| 452 | if (flags & BDK_DRAM_TEST_USE_CCPI) |
| 453 | { |
| 454 | /* Display CCPI load */ |
| 455 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 456 | { |
| 457 | if (flags & (1 << node)) |
| 458 | { |
| 459 | for (int link = 0; link < 3; link++) |
| 460 | { |
| 461 | uint64_t busy = stop_ccpi_data[node][link] - start_ccpi_data[node][link]; |
| 462 | busy += stop_ccpi_err[node][link] - start_ccpi_err[node][link]; |
| 463 | uint64_t total = stop_ccpi_idle[node][link] - start_ccpi_idle[node][link]; |
| 464 | total += busy; |
| 465 | if (total == 0) |
| 466 | continue; |
| 467 | uint64_t percent_x10 = busy * 1000 / total; |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 468 | printf(" Node %d, CCPI%d: busy %llu, total %llu, used %llu.%llu%%\n", |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 469 | node, link, busy, total, percent_x10 / 10, percent_x10 % 10); |
| 470 | } |
| 471 | } |
| 472 | } |
| 473 | } |
| 474 | } |
| 475 | return dram_test_thread_errors; |
| 476 | } |
| 477 | |
| 478 | /** |
| 479 | * Perform a memory test. |
| 480 | * |
| 481 | * @param test Test type to run |
| 482 | * @param start_address |
| 483 | * Physical address to start at |
| 484 | * @param length Length of memory block |
| 485 | * @param flags Flags to control memory test options. Zero defaults to testing all |
| 486 | * node with statistics and progress output. |
| 487 | * |
| 488 | * @return Number of errors found. Zero is success. Negative means the test |
| 489 | * did not run due to some other failure. |
| 490 | */ |
| 491 | int bdk_dram_test(int test, uint64_t start_address, uint64_t length, bdk_dram_test_flags_t flags) |
| 492 | { |
| 493 | /* These limits are arbitrary. They just make sure we aren't doing something |
| 494 | silly, like test a non cache line aligned memory region */ |
| 495 | if (start_address & 0xffff) |
| 496 | { |
| 497 | bdk_error("DRAM test start address must be aligned on a 64KB boundary\n"); |
| 498 | return -1; |
| 499 | } |
| 500 | if (length & 0xffff) |
| 501 | { |
| 502 | bdk_error("DRAM test length must be a multiple of 64KB\n"); |
| 503 | return -1; |
| 504 | } |
| 505 | |
| 506 | const char *name = bdk_dram_get_test_name(test); |
| 507 | if (name == NULL) |
| 508 | { |
| 509 | bdk_error("Invalid DRAM test number %d\n", test); |
| 510 | return -1; |
| 511 | } |
| 512 | |
| 513 | /* If no nodes are selected assume the user meant all nodes */ |
| 514 | if ((flags & (BDK_DRAM_TEST_NODE0 | BDK_DRAM_TEST_NODE1 | BDK_DRAM_TEST_NODE2 | BDK_DRAM_TEST_NODE3)) == 0) |
| 515 | flags |= BDK_DRAM_TEST_NODE0 | BDK_DRAM_TEST_NODE1 | BDK_DRAM_TEST_NODE2 | BDK_DRAM_TEST_NODE3; |
| 516 | |
| 517 | /* Remove nodes from the flags that don't exist */ |
| 518 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 519 | { |
| 520 | if (flags & BDK_DRAM_TEST_USE_CCPI) |
| 521 | { |
| 522 | if (!bdk_numa_exists(node ^ 1)) |
| 523 | flags &= ~(1 << node); |
| 524 | } |
| 525 | else |
| 526 | { |
| 527 | if (!bdk_numa_exists(node)) |
| 528 | flags &= ~(1 << node); |
| 529 | } |
| 530 | } |
| 531 | |
| 532 | |
| 533 | /* Make sure the start address is higher that the BDK's active range */ |
| 534 | uint64_t top_of_bdk = bdk_dram_get_top_of_bdk(); |
| 535 | if (start_address < top_of_bdk) |
| 536 | start_address = top_of_bdk; |
| 537 | |
| 538 | /* Clear ECC error counters before starting the test */ |
| 539 | for (int chan = 0; chan < BDK_MAX_MEM_CHANS; chan++) { |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 540 | bdk_atomic_set64(&__bdk_dram_ecc_single_bit_errors[chan], 0); |
| 541 | bdk_atomic_set64(&__bdk_dram_ecc_double_bit_errors[chan], 0); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 542 | } |
| 543 | |
| 544 | /* Make sure at least one core from each node is running */ |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 545 | /* FIXME(dhendrix): we only care about core0 on node0 for now */ |
| 546 | #if 0 |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 547 | for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++) |
| 548 | { |
| 549 | if (flags & (1<<node)) |
| 550 | { |
| 551 | int use_node = (flags & BDK_DRAM_TEST_USE_CCPI) ? node ^ 1 : node; |
| 552 | if (bdk_get_running_coremask(use_node) == 0) |
| 553 | bdk_init_cores(use_node, 1); |
| 554 | } |
| 555 | } |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 556 | #endif |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 557 | |
| 558 | /* This returns any data compare errors found */ |
| 559 | int errors = __bdk_dram_run_test(&TEST_INFO[test], start_address, length, flags); |
| 560 | |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 561 | /* Check ECC error counters after the test */ |
| 562 | int64_t ecc_single = 0; |
| 563 | int64_t ecc_double = 0; |
| 564 | int64_t ecc_single_errs[BDK_MAX_MEM_CHANS]; |
| 565 | int64_t ecc_double_errs[BDK_MAX_MEM_CHANS]; |
| 566 | |
| 567 | for (int chan = 0; chan < BDK_MAX_MEM_CHANS; chan++) { |
| 568 | ecc_single += (ecc_single_errs[chan] = bdk_atomic_get64(&__bdk_dram_ecc_single_bit_errors[chan])); |
| 569 | ecc_double += (ecc_double_errs[chan] = bdk_atomic_get64(&__bdk_dram_ecc_double_bit_errors[chan])); |
| 570 | } |
| 571 | |
| 572 | /* Always print any ECC errors */ |
| 573 | if (ecc_single || ecc_double) |
| 574 | { |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 575 | printf("Test \"%s\": ECC errors, %lld/%lld/%lld/%lld corrected, %lld/%lld/%lld/%lld uncorrected\n", |
| 576 | name, |
| 577 | ecc_single_errs[0], ecc_single_errs[1], ecc_single_errs[2], ecc_single_errs[3], |
| 578 | ecc_double_errs[0], ecc_double_errs[1], ecc_double_errs[2], ecc_double_errs[3]); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 579 | } |
| 580 | if (errors || ecc_double || ecc_single) { |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 581 | printf("Test \"%s\": FAIL: %lld single, %lld double, %d compare errors\n", |
| 582 | name, ecc_single, ecc_double, errors); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 583 | } |
| 584 | else |
| 585 | BDK_TRACE(DRAM_TEST, "Test \"%s\": PASS\n", name); |
| 586 | |
| 587 | return (errors + ecc_double + ecc_single); |
| 588 | } |
| 589 | |
/**
 * Report a DRAM address in decoded format.
 *
 * @param address Physical address the error occurred at
 * @param buffer  Buffer that receives the formatted text
 * @param len     Size of buffer in bytes
 *
 */
static void __bdk_dram_report_address_decode(uint64_t address, char *buffer, int len)
{
    int node, lmc, dimm, prank, lrank, bank, row, col;

    bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col);

    /* The address is cast so that the argument matches the %llx conversion
       no matter which integer type uint64_t is defined as */
    snprintf(buffer, len, "[0x%011llx] (N%d,LMC%d,DIMM%d,Rank%d/%d,Bank%02d,Row 0x%05x,Col 0x%04x)",
             (unsigned long long)address, node, lmc, dimm, prank, lrank, bank, row, col);
}
| 605 | |
/**
 * Report a DRAM address in a new decoded format.
 *
 * The reported "byte" is the index (0-7) of the only byte-lane that differs,
 * 8 if no byte-lane differs, or 9 if more than one differs. With a single
 * failing lane only that lane's bits are printed as the xor value, otherwise
 * the complete 64-bit xor is printed.
 *
 * @param address  Physical address the error occurred at
 * @param orig_xor XOR of data read vs expected data
 * @param buffer   Buffer that receives the formatted text
 * @param len      Size of buffer in bytes
 *
 */
static void __bdk_dram_report_address_decode_new(uint64_t address, uint64_t orig_xor, char *buffer, int len)
{
    int node, lmc, dimm, prank, lrank, bank, row, col;

    int byte = 8; // means no byte-lanes in error, should not happen
    uint64_t print_bits = 0;
    uint64_t remaining = orig_xor;

    // find the byte-lane(s) with errors
    for (int i = 0; i < 8; i++) {
        const uint64_t bits = remaining & 0xffULL;
        remaining >>= 8;
        if (!bits)
            continue;
        if (byte != 8) {
            byte = 9; // means more than 1 byte-lane was present
            print_bits = orig_xor; // print the full original
            break; // quit now
        }
        byte = i; // keep checking
        print_bits = bits;
    }

    bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col);

    /* 64-bit values are cast so that the arguments match the %llx
       conversions no matter which integer type uint64_t is defined as */
    snprintf(buffer, len, "N%d.LMC%d: CMP byte %d xor 0x%02llx (DIMM%d,Rank%d/%d,Bank%02d,Row 0x%05x,Col 0x%04x)[0x%011llx]",
             node, lmc, byte, (unsigned long long)print_bits, dimm, prank, lrank, bank, row, col,
             (unsigned long long)address);
}
| 642 | |
| 643 | /** |
| 644 | * Report a DRAM error. Errors are not shown after MAX_ERRORS_TO_REPORT is |
| 645 | * exceeded. Used when a single address is involved in the failure. |
| 646 | * |
| 647 | * @param address Physical address the error occurred at |
| 648 | * @param data Data read from memory |
| 649 | * @param correct Correct data |
| 650 | * @param burst Which burst this is from, informational only |
Peter Lemenkov | 7bbe3bb | 2018-12-07 11:23:21 +0100 | [diff] [blame] | 651 | * @param fails -1 for no retries done, >= 0 number of failures during retries |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 652 | * |
| 653 | * @return Zero if a message was logged, non-zero if the error limit has been reached |
| 654 | */ |
| 655 | void __bdk_dram_report_error(uint64_t address, uint64_t data, uint64_t correct, int burst, int fails) |
| 656 | { |
| 657 | char buffer[128]; |
| 658 | char failbuf[32]; |
| 659 | int64_t errors = bdk_atomic_fetch_and_add64(&dram_test_thread_errors, 1); |
| 660 | uint64_t xor = data ^ correct; |
| 661 | |
| 662 | if (errors < MAX_ERRORS_TO_REPORT) |
| 663 | { |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 664 | if (fails < 0) { |
| 665 | snprintf(failbuf, sizeof(failbuf), " "); |
| 666 | } else { |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 667 | int percent_x10 = fails * 1000 / RETRY_LIMIT; |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 668 | snprintf(failbuf, sizeof(failbuf), ", retries failed %3d.%d%%", |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 669 | percent_x10 / 10, percent_x10 % 10); |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 670 | } |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 671 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 672 | __bdk_dram_report_address_decode_new(address, xor, buffer, sizeof(buffer)); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 673 | bdk_error("%s%s\n", buffer, failbuf); |
| 674 | |
| 675 | if (errors == MAX_ERRORS_TO_REPORT-1) |
| 676 | bdk_error("No further DRAM errors will be reported\n"); |
| 677 | } |
| 678 | return; |
| 679 | } |
| 680 | |
| 681 | /** |
| 682 | * Report a DRAM error. Errors are not shown after MAX_ERRORS_TO_REPORT is |
| 683 | * exceeded. Used when two addresses might be involved in the failure. |
| 684 | * |
| 685 | * @param address1 First address involved in the failure |
| 686 | * @param data1 Data from the first address |
| 687 | * @param address2 Second address involved in the failure |
| 688 | * @param data2 Data from second address |
| 689 | * @param burst Which burst this is from, informational only |
Peter Lemenkov | 7bbe3bb | 2018-12-07 11:23:21 +0100 | [diff] [blame] | 690 | * @param fails -1 for no retries done, >= 0 number of failures during retries |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 691 | * |
| 692 | * @return Zero if a message was logged, non-zero if the error limit has been reached |
| 693 | */ |
| 694 | void __bdk_dram_report_error2(uint64_t address1, uint64_t data1, uint64_t address2, uint64_t data2, |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 695 | int burst, int fails) |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 696 | { |
| 697 | int64_t errors = bdk_atomic_fetch_and_add64(&dram_test_thread_errors, 1); |
| 698 | if (errors < MAX_ERRORS_TO_REPORT) |
| 699 | { |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 700 | char buffer1[80], buffer2[80]; |
| 701 | char failbuf[32]; |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 702 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 703 | if (fails < 0) { |
| 704 | snprintf(failbuf, sizeof(failbuf), " "); |
| 705 | } else { |
| 706 | snprintf(failbuf, sizeof(failbuf), ", retried %d failed %d", RETRY_LIMIT, fails); |
| 707 | } |
| 708 | __bdk_dram_report_address_decode(address1, buffer1, sizeof(buffer1)); |
| 709 | __bdk_dram_report_address_decode(address2, buffer2, sizeof(buffer2)); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 710 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 711 | bdk_error("compare: data1: 0x%016llx, xor: 0x%016llx%s\n" |
| 712 | " %s\n %s\n", |
| 713 | data1, data1 ^ data2, failbuf, |
| 714 | buffer1, buffer2); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 715 | |
| 716 | if (errors == MAX_ERRORS_TO_REPORT-1) |
| 717 | bdk_error("No further DRAM errors will be reported\n"); |
| 718 | } |
| 719 | return; |
| 720 | } |
| 721 | |
| 722 | /* Report the circumstances of a failure and try re-reading the memory |
| 723 | * location to see if the error is transient or permanent. |
| 724 | * |
| 725 | * Note: re-reading requires using evicting addresses |
| 726 | */ |
| 727 | int __bdk_dram_retry_failure(int burst, uint64_t address, uint64_t data, uint64_t expected) |
| 728 | { |
| 729 | int refail = 0; |
| 730 | |
| 731 | // bypass the retries if we are already over the limit... |
| 732 | if (bdk_atomic_get64(&dram_test_thread_errors) < MAX_ERRORS_TO_REPORT) { |
| 733 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 734 | /* Try re-reading the memory location. A transient error may fail |
| 735 | * on one read and work on another. Keep on retrying even when a |
| 736 | * read succeeds. |
| 737 | */ |
| 738 | for (int i = 0; i < RETRY_LIMIT; i++) { |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 739 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 740 | __bdk_dram_flush_to_mem(address); |
| 741 | BDK_DCACHE_INVALIDATE; |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 742 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 743 | uint64_t new = __bdk_dram_read64(address); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 744 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 745 | if (new != expected) { |
| 746 | refail++; |
| 747 | } |
| 748 | } |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 749 | } else |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 750 | refail = -1; |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 751 | |
| 752 | // this will increment the errors always, but maybe not print... |
| 753 | __bdk_dram_report_error(address, data, expected, burst, refail); |
| 754 | |
| 755 | return 1; |
| 756 | } |
| 757 | |
| 758 | /** |
| 759 | * retry_failure2 |
| 760 | * |
| 761 | * @param burst |
| 762 | * @param address1 |
| 763 | * @param address2 |
| 764 | */ |
| 765 | int __bdk_dram_retry_failure2(int burst, uint64_t address1, uint64_t data1, uint64_t address2, uint64_t data2) |
| 766 | { |
| 767 | int refail = 0; |
| 768 | |
| 769 | // bypass the retries if we are already over the limit... |
| 770 | if (bdk_atomic_get64(&dram_test_thread_errors) < MAX_ERRORS_TO_REPORT) { |
| 771 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 772 | for (int i = 0; i < RETRY_LIMIT; i++) { |
| 773 | __bdk_dram_flush_to_mem(address1); |
| 774 | __bdk_dram_flush_to_mem(address2); |
| 775 | BDK_DCACHE_INVALIDATE; |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 776 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 777 | uint64_t d1 = __bdk_dram_read64(address1); |
| 778 | uint64_t d2 = __bdk_dram_read64(address2); |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 779 | |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 780 | if (d1 != d2) { |
| 781 | refail++; |
| 782 | } |
| 783 | } |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 784 | } else |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 785 | refail = -1; |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 786 | |
| 787 | // this will increment the errors always, but maybe not print... |
| 788 | __bdk_dram_report_error2(address1, data1, address2, data2, burst, refail); |
| 789 | |
| 790 | return 1; |
| 791 | } |
| 792 | |
| 793 | /** |
| 794 | * Inject a DRAM error at a specific address in memory. The injection can either |
| 795 | * be a single bit inside the byte, or a double bit error in the ECC byte. Double |
| 796 | * bit errors may corrupt memory, causing software to crash. The corruption is |
| 797 | * written to memory and will continue to exist until the cache line is written |
| 798 | * again. After a call to this function, the BDK should report a ECC error. Double |
| 799 | * bit errors corrupt bits 0-1. |
| 800 | * |
| 801 | * @param address Physical address to corrupt. Any byte alignment is supported |
| 802 | * @param bit Bit to corrupt in the byte (0-7), or -1 to create a double bit fault in the ECC |
| 803 | * byte. |
| 804 | */ |
| 805 | void bdk_dram_test_inject_error(uint64_t address, int bit) |
| 806 | { |
| 807 | uint64_t aligned_address = address & -16; |
| 808 | int corrupt_bit = -1; |
| 809 | if (bit >= 0) |
| 810 | corrupt_bit = (address & 0xf) * 8 + bit; |
| 811 | |
| 812 | /* Extract the DRAM controller information */ |
| 813 | int node, lmc, dimm, prank, lrank, bank, row, col; |
| 814 | bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col); |
| 815 | |
| 816 | /* Read the current data */ |
| 817 | uint64_t data = __bdk_dram_read64(aligned_address); |
| 818 | |
| 819 | /* Program LMC to inject the error */ |
| 820 | if ((corrupt_bit >= 0) && (corrupt_bit < 64)) |
| 821 | BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 1ull << corrupt_bit); |
| 822 | else if (bit == -1) |
| 823 | BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 3); |
| 824 | else |
| 825 | BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 0); |
| 826 | if (corrupt_bit >= 64) |
| 827 | BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 1ull << (corrupt_bit - 64)); |
| 828 | else |
| 829 | BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 0); |
| 830 | BDK_CSR_MODIFY(c, node, BDK_LMCX_ECC_PARITY_TEST(lmc), |
| 831 | c.s.ecc_corrupt_idx = (address & 0x7f) >> 4; |
| 832 | c.s.ecc_corrupt_ena = 1); |
| 833 | BDK_CSR_READ(node, BDK_LMCX_ECC_PARITY_TEST(lmc)); |
| 834 | |
| 835 | /* Perform a write and push it to DRAM. This creates the error */ |
| 836 | __bdk_dram_write64(aligned_address, data); |
| 837 | __bdk_dram_flush_to_mem(aligned_address); |
| 838 | |
| 839 | /* Disable error injection */ |
| 840 | BDK_CSR_MODIFY(c, node, BDK_LMCX_ECC_PARITY_TEST(lmc), |
| 841 | c.s.ecc_corrupt_ena = 0); |
| 842 | BDK_CSR_READ(node, BDK_LMCX_ECC_PARITY_TEST(lmc)); |
| 843 | BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 0); |
| 844 | BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 0); |
| 845 | |
| 846 | /* Read back the data, which should now cause an error */ |
David Hendricks | 7d48ac5 | 2018-03-09 14:30:38 -0800 | [diff] [blame] | 847 | printf("Loading the injected error address 0x%llx, node=%d, lmc=%d, dimm=%d, rank=%d/%d, bank=%d, row=%d, col=%d\n", |
David Hendricks | 2004b93 | 2018-03-09 13:58:27 -0800 | [diff] [blame] | 848 | address, node, lmc, dimm, prank, lrank, bank, row, col); |
| 849 | __bdk_dram_read64(aligned_address); |
| 850 | } |