blob: 9c78667116646c941136c7e92fea51c6bc0097a9 [file] [log] [blame]
David Hendricks2004b932018-03-09 13:58:27 -08001/***********************license start***********************************
2* Copyright (c) 2003-2017 Cavium Inc. (support@cavium.com). All rights
3* reserved.
4*
5*
6* Redistribution and use in source and binary forms, with or without
7* modification, are permitted provided that the following conditions are
8* met:
9*
10* * Redistributions of source code must retain the above copyright
11* notice, this list of conditions and the following disclaimer.
12*
13* * Redistributions in binary form must reproduce the above
14* copyright notice, this list of conditions and the following
15* disclaimer in the documentation and/or other materials provided
16* with the distribution.
17*
18* * Neither the name of Cavium Inc. nor the names of
19* its contributors may be used to endorse or promote products
20* derived from this software without specific prior written
21* permission.
22*
23* This Software, including technical data, may be subject to U.S. export
24* control laws, including the U.S. Export Administration Act and its
25* associated regulations, and may be subject to export or import
26* regulations in other countries.
27*
28* TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
29* AND WITH ALL FAULTS AND CAVIUM INC. MAKES NO PROMISES, REPRESENTATIONS OR
30* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT
31* TO THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY
32* REPRESENTATION OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT
33* DEFECTS, AND CAVIUM SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES
34* OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR
35* PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT,
36* QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. THE ENTIRE RISK
37* ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
38***********************license end**************************************/
39#include <bdk.h>
40#include "libbdk-arch/bdk-csrs-gti.h"
41#include "libbdk-arch/bdk-csrs-ocx.h"
42
David Hendricks7d48ac52018-03-09 14:30:38 -080043#include <bdk-minimal.h> /* for printf --> printk */
44#include <libbdk-dram/bdk-dram-test.h>
45#include <libbdk-hal/bdk-atomic.h>
46#include <libbdk-hal/bdk-clock.h>
47#include <libbdk-hal/bdk-utils.h>
48#include <libbdk-os/bdk-init.h>
49#include <libbdk-os/bdk-thread.h>
50
/* This file is an optional BDK component. It only ends up in the image
   when something pulls it in through BDK_REQUIRE() */
BDK_REQUIRE_DEFINE(DRAM_TEST);

/* Number of DRAM errors that are printed before reporting goes quiet */
#define MAX_ERRORS_TO_REPORT 50
/* Number of times a failing location is re-read by the retry routines */
#define RETRY_LIMIT 1000
57
58typedef struct
59{
60 const char * name; /* Friendly name for the test */
61 __bdk_dram_test_t test_func; /* Function to call */
62 int bursts; /* Bursts parameter to pass to the test */
63 int max_cores; /* Maximum number of cores the test should be run on in parallel. Zero means all */
64} dram_test_info_t;
65
66static const dram_test_info_t TEST_INFO[] = {
67 /* Name, Test function, Bursts, Max Cores */
68 { "Data Bus", __bdk_dram_test_mem_data_bus, 8, 1},
69 { "Address Bus", __bdk_dram_test_mem_address_bus, 0, 1},
70 { "Marching Rows", __bdk_dram_test_mem_rows, 16, 0},
71 { "Random Data", __bdk_dram_test_mem_random, 32, 0},
72 { "Random XOR (32 Burst)", __bdk_dram_test_mem_xor, 32, 0},
73 { "Self Address", __bdk_dram_test_mem_self_addr, 1, 0},
74 { "March C- Solid Bits", __bdk_dram_test_mem_solid, 1, 0},
75 { "March C- Checkerboard", __bdk_dram_test_mem_checkerboard, 1, 0},
76 { "Walking Ones Left", __bdk_dram_test_mem_leftwalk1, 1, 0},
77 { "Walking Ones Right", __bdk_dram_test_mem_rightwalk1, 1, 0},
78 { "Walking Zeros Left", __bdk_dram_test_mem_leftwalk0, 1, 0},
79 { "Walking Zeros Right", __bdk_dram_test_mem_rightwalk0, 1, 0},
80 { "Random XOR (224 Burst)", __bdk_dram_test_mem_xor, 224, 0},
81 { "Fast Scan", __bdk_dram_test_fast_scan, 0, 0},
82 { NULL, NULL, 0, 0}
83};
84
85/* These variables count the number of ECC errors. They should only be accessed atomically */
86int64_t __bdk_dram_ecc_single_bit_errors[BDK_MAX_MEM_CHANS];
87int64_t __bdk_dram_ecc_double_bit_errors[BDK_MAX_MEM_CHANS];
88
89static int64_t dram_test_thread_done;
90static int64_t dram_test_thread_errors;
91static uint64_t dram_test_thread_start;
92static uint64_t dram_test_thread_end;
93static uint64_t dram_test_thread_size;
94
95/**
96 * Force the memory at the pointer location to be written to memory and evicted
97 * from L2. L1 will be unaffected.
98 *
99 * @param address Physical memory location
100 */
101void __bdk_dram_flush_to_mem(uint64_t address)
102{
103 BDK_MB;
Jacob Garber4926e982019-07-26 11:45:43 -0600104 char *ptr = bdk_phys_to_ptr(address);
David Hendricks2004b932018-03-09 13:58:27 -0800105 BDK_CACHE_WBI_L2(ptr);
106}
107
108/**
109 * Force a memory region to be written to DRAM and evicted from L2
110 *
111 * @param area Start of the region
112 * @param max_address
113 * End of the region (exclusive)
114 */
115void __bdk_dram_flush_to_mem_range(uint64_t area, uint64_t max_address)
116{
Jacob Garber4926e982019-07-26 11:45:43 -0600117 char *ptr = bdk_phys_to_ptr(area);
118 char *end = bdk_phys_to_ptr(max_address);
David Hendricks2004b932018-03-09 13:58:27 -0800119 BDK_MB;
120 while (ptr < end)
121 {
122 BDK_CACHE_WBI_L2(ptr);
123 ptr += 128;
124 }
125}
126
127/**
128 * Convert a test enumeration into a string
129 *
130 * @param test Test to convert
131 *
132 * @return String for display
133 */
134const char *bdk_dram_get_test_name(int test)
135{
136 if (test < (int)(sizeof(TEST_INFO) / sizeof(TEST_INFO[0])))
137 return TEST_INFO[test].name;
138 else
139 return NULL;
140}
141
142static bdk_dram_test_flags_t dram_test_flags; // FIXME: Don't use global
143/**
144 * This function is run as a thread to perform memory tests over multiple cores.
145 * Each thread gets a section of memory to work on, which is controlled by global
146 * variables at the beginning of this file.
147 *
148 * @param arg Number of the region we should check
Jonathan Neuschäfer45e6c822018-12-11 17:53:07 +0100149 * @param arg1 Pointer to the test_info structure
David Hendricks2004b932018-03-09 13:58:27 -0800150 */
151static void dram_test_thread(int arg, void *arg1)
152{
153 const dram_test_info_t *test_info = arg1;
154 const int bursts = test_info->bursts;
155 const int range_number = arg;
156
157 /* Figure out our work memory range.
158 *
159 * Note start_address and end_address just provide the physical offset
160 * portion of the address and do not have the node bits set. This is
161 * to simplify address checks and calculations. Later, when about to run
162 * the memory test, the routines adds in the node bits to form the final
163 * addresses.
164 */
165 uint64_t start_address = dram_test_thread_start + dram_test_thread_size * range_number;
166 uint64_t end_address = start_address + dram_test_thread_size;
167 if (end_address > dram_test_thread_end)
168 end_address = dram_test_thread_end;
169
170 bdk_node_t test_node = bdk_numa_local();
171 if (dram_test_flags & BDK_DRAM_TEST_USE_CCPI)
172 test_node ^= 1;
173 /* Insert the node part of the address */
174 start_address = bdk_numa_get_address(test_node, start_address);
175 end_address = bdk_numa_get_address(test_node, end_address);
176 /* Test the region */
David Hendricks7d48ac52018-03-09 14:30:38 -0800177 BDK_TRACE(DRAM_TEST, " Node %d, core %d, Testing [0x%011llx:0x%011llx]\n",
David Hendricks2004b932018-03-09 13:58:27 -0800178 bdk_numa_local(), bdk_get_core_num() & 127, start_address, end_address - 1);
179 test_info->test_func(start_address, end_address, bursts);
180
181 /* Report that we're done */
182 BDK_TRACE(DRAM_TEST, "Thread %d on node %d done with memory test\n", range_number, bdk_numa_local());
183 bdk_atomic_add64_nosync(&dram_test_thread_done, 1);
184}
185
186/**
187 * Run the memory test.
188 *
189 * @param test_info
190 * @param start_address
191 * Physical address to start at
192 * @param length Length of memory block
193 * @param flags Flags to control memory test options. Zero defaults to testing all
194 * node with statistics and progress output.
195 *
196 * @return Number of errors found. Zero is success. Negative means the test
197 * did not run due to some other failure.
198 */
199static int __bdk_dram_run_test(const dram_test_info_t *test_info, uint64_t start_address,
200 uint64_t length, bdk_dram_test_flags_t flags)
201{
202 /* Figure out the addess of the byte one off the top of memory */
203 uint64_t max_address = bdk_dram_get_size_mbytes(bdk_numa_local());
David Hendricks7d48ac52018-03-09 14:30:38 -0800204 BDK_TRACE(DRAM_TEST, "DRAM available per node: %llu MB\n", max_address);
David Hendricks2004b932018-03-09 13:58:27 -0800205 max_address <<= 20;
206
207 /* Make sure we have enough */
208 if (max_address < (16<<20))
209 {
210 bdk_error("DRAM size is too small\n");
211 return -1;
212 }
213
214 /* Make sure the amount is sane */
215 if (CAVIUM_IS_MODEL(CAVIUM_CN8XXX))
216 {
217 if (max_address > (1ull << 40)) /* 40 bits in CN8XXX */
218 max_address = 1ull << 40;
219 }
220 else
221 {
222 if (max_address > (1ull << 43)) /* 43 bits in CN9XXX */
223 max_address = 1ull << 43;
224 }
David Hendricks7d48ac52018-03-09 14:30:38 -0800225 BDK_TRACE(DRAM_TEST, "DRAM max address: 0x%011llx\n", max_address-1);
David Hendricks2004b932018-03-09 13:58:27 -0800226
227 /* Make sure the start address is lower than the top of memory */
228 if (start_address >= max_address)
229 {
David Hendricks7d48ac52018-03-09 14:30:38 -0800230 bdk_error("Start address is larger than the amount of memory: 0x%011llx versus 0x%011llx\n",
231 start_address, max_address);
David Hendricks2004b932018-03-09 13:58:27 -0800232 return -1;
233 }
234 if (length == (uint64_t)-1)
235 length = max_address - start_address;
236
237 /* Final range checks */
238 uint64_t end_address = start_address + length;
239 if (end_address > max_address)
240 {
241 end_address = max_address;
242 length = end_address - start_address;
243 }
244 if (length == 0)
245 return 0;
246
247 /* Ready to run the test. Figure out how many cores we need */
248 int max_cores = test_info->max_cores;
249 int total_cores_all_nodes = max_cores;
250
251 /* Figure out the number of cores available in the system */
252 if (max_cores == 0)
253 {
254 max_cores += bdk_get_num_running_cores(bdk_numa_local());
255 /* Calculate the total number of cores being used. The per node number
256 is confusing to people */
257 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
258 if (flags & (1 << node))
259 {
260 if (flags & BDK_DRAM_TEST_USE_CCPI)
261 total_cores_all_nodes += bdk_get_num_running_cores(node ^ 1);
262 else
263 total_cores_all_nodes += bdk_get_num_running_cores(node);
264 }
265 }
266 if (!(flags & BDK_DRAM_TEST_NO_BANNERS))
David Hendricks7d48ac52018-03-09 14:30:38 -0800267 printf("Starting Test \"%s\" for [0x%011llx:0x%011llx] using %d core(s)\n",
268 test_info->name, start_address, end_address - 1, total_cores_all_nodes);
David Hendricks2004b932018-03-09 13:58:27 -0800269
270 /* Remember the LMC perf counters for stats after the test */
271 uint64_t start_dram_dclk[BDK_NUMA_MAX_NODES][4];
272 uint64_t start_dram_ops[BDK_NUMA_MAX_NODES][4];
273 uint64_t stop_dram_dclk[BDK_NUMA_MAX_NODES][4];
274 uint64_t stop_dram_ops[BDK_NUMA_MAX_NODES][4];
275 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
276 {
277 if (flags & (1 << node))
278 {
279 const int num_dram_controllers = __bdk_dram_get_num_lmc(node);
280 for (int i = 0; i < num_dram_controllers; i++)
281 {
282 start_dram_dclk[node][i] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(i));
283 start_dram_ops[node][i] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(i));
284 }
285 }
286 }
287 /* Remember the CCPI link counters for stats after the test */
288 uint64_t start_ccpi_data[BDK_NUMA_MAX_NODES][3];
289 uint64_t start_ccpi_idle[BDK_NUMA_MAX_NODES][3];
290 uint64_t start_ccpi_err[BDK_NUMA_MAX_NODES][3];
291 uint64_t stop_ccpi_data[BDK_NUMA_MAX_NODES][3];
292 uint64_t stop_ccpi_idle[BDK_NUMA_MAX_NODES][3];
293 uint64_t stop_ccpi_err[BDK_NUMA_MAX_NODES][3];
294 if (!bdk_numa_is_only_one())
295 {
296 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
297 {
298 if (flags & (1 << node))
299 {
300 for (int link = 0; link < 3; link++)
301 {
302 start_ccpi_data[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_DATA_CNT(link));
303 start_ccpi_idle[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_IDLE_CNT(link));
304 start_ccpi_err[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_ERR_CNT(link));
305 }
306 }
307 }
308 }
309
310 /* WARNING: This code assumes the same memory range is being tested on
311 all nodes. The same number of cores are used on each node to test
312 its local memory */
313 uint64_t work_address = start_address;
314 dram_test_flags = flags;
315 bdk_atomic_set64(&dram_test_thread_errors, 0);
316 while ((work_address < end_address) && ((dram_test_thread_errors == 0) || (flags & BDK_DRAM_TEST_NO_STOP_ERROR)))
317 {
318 /* Check at most MAX_CHUNK_SIZE across each iteration. We only report
319 progress between chunks, so keep them reasonably small */
320 const uint64_t MAX_CHUNK_SIZE = 1ull << 28; /* 256MB */
321 uint64_t size = end_address - work_address;
322 if (size > MAX_CHUNK_SIZE)
323 size = MAX_CHUNK_SIZE;
324
325 /* Divide memory evenly between the cores. Round the size up so that
326 all memory is covered. The last core may have slightly less memory to
327 test */
328 uint64_t thread_size = (size + (max_cores - 1)) / max_cores;
329 thread_size += 127;
330 thread_size &= -128;
331 dram_test_thread_start = work_address;
332 dram_test_thread_end = work_address + size;
333 dram_test_thread_size = thread_size;
334 BDK_WMB;
335
336 /* Poke the watchdog */
337 BDK_CSR_WRITE(bdk_numa_local(), BDK_GTI_CWD_POKEX(0), 0);
338
David Hendricks7d48ac52018-03-09 14:30:38 -0800339 /* disable progress output when batch mode is ON */
David Hendricks2004b932018-03-09 13:58:27 -0800340 if (!(flags & BDK_DRAM_TEST_NO_PROGRESS)) {
341
342 /* Report progress percentage */
343 int percent_x10 = (work_address - start_address) * 1000 / (end_address - start_address);
David Hendricks7d48ac52018-03-09 14:30:38 -0800344 printf(" %3d.%d%% complete, testing [0x%011llx:0x%011llx]\r",
David Hendricks2004b932018-03-09 13:58:27 -0800345 percent_x10 / 10, percent_x10 % 10, work_address, work_address + size - 1);
346 fflush(stdout);
David Hendricks7d48ac52018-03-09 14:30:38 -0800347 }
David Hendricks2004b932018-03-09 13:58:27 -0800348
349 work_address += size;
350
351 /* Start threads for all the cores */
352 int total_count = 0;
353 bdk_atomic_set64(&dram_test_thread_done, 0);
354 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
355 {
356 if (flags & (1 << node))
357 {
358 const int num_cores = bdk_get_num_cores(node);
359 int per_node = 0;
360 for (int core = 0; core < num_cores; core++)
361 {
362 if (per_node >= max_cores)
363 break;
David Hendricks2004b932018-03-09 13:58:27 -0800364 BDK_TRACE(DRAM_TEST, "Starting thread %d on node %d for memory test\n", per_node, node);
David Hendricks7d48ac52018-03-09 14:30:38 -0800365 dram_test_thread(per_node, (void *)test_info);
David Hendricks2004b932018-03-09 13:58:27 -0800366 }
367 }
368 }
369
370#if 0
371 /* Wait for threads to finish */
372 while (bdk_atomic_get64(&dram_test_thread_done) < total_count)
373 bdk_thread_yield();
374#else
375#define TIMEOUT_SECS 30 // FIXME: long enough so multicore RXOR 224 should not print out
376 /* Wait for threads to finish, with progress */
377 int cur_count;
378 uint64_t cur_time;
Peter Lemenkov7bbe3bb2018-12-07 11:23:21 +0100379 uint64_t period = bdk_clock_get_rate(bdk_numa_local(), BDK_CLOCK_TIME) * TIMEOUT_SECS; // FIXME?
David Hendricks2004b932018-03-09 13:58:27 -0800380 uint64_t timeout = bdk_clock_get_count(BDK_CLOCK_TIME) + period;
381 do {
David Hendricks2004b932018-03-09 13:58:27 -0800382 cur_count = bdk_atomic_get64(&dram_test_thread_done);
383 cur_time = bdk_clock_get_count(BDK_CLOCK_TIME);
384 if (cur_time >= timeout) {
385 BDK_TRACE(DRAM_TEST, "N%d: Waiting for %d cores\n",
386 bdk_numa_local(), total_count - cur_count);
387 timeout = cur_time + period;
388 }
389 } while (cur_count < total_count);
390#endif
391 }
392
393 /* Get the DRAM perf counters */
394 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
395 {
396 if (flags & (1 << node))
397 {
398 const int num_dram_controllers = __bdk_dram_get_num_lmc(node);
399 for (int i = 0; i < num_dram_controllers; i++)
400 {
401 stop_dram_dclk[node][i] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(i));
402 stop_dram_ops[node][i] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(i));
403 }
404 }
405 }
406 /* Get the CCPI link counters */
407 if (!bdk_numa_is_only_one())
408 {
409 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
410 {
411 if (flags & (1 << node))
412 {
413 for (int link = 0; link < 3; link++)
414 {
415 stop_ccpi_data[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_DATA_CNT(link));
416 stop_ccpi_idle[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_IDLE_CNT(link));
417 stop_ccpi_err[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_ERR_CNT(link));
418 }
419 }
420 }
421 }
422
423 /* disable progress output when batch mode is ON */
424 if (!(flags & BDK_DRAM_TEST_NO_PROGRESS)) {
425
426 /* Report progress percentage as complete */
David Hendricks7d48ac52018-03-09 14:30:38 -0800427 printf(" %3d.%d%% complete, testing [0x%011llx:0x%011llx]\n",
David Hendricks2004b932018-03-09 13:58:27 -0800428 100, 0, start_address, end_address - 1);
429 fflush(stdout);
430 }
431
432 if (!(flags & BDK_DRAM_TEST_NO_STATS))
433 {
434 /* Display LMC load */
435 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
436 {
437 if (flags & (1 << node))
438 {
439 const int num_dram_controllers = __bdk_dram_get_num_lmc(node);
440 for (int i = 0; i < num_dram_controllers; i++)
441 {
442 uint64_t ops = stop_dram_ops[node][i] - start_dram_ops[node][i];
443 uint64_t dclk = stop_dram_dclk[node][i] - start_dram_dclk[node][i];
444 if (dclk == 0)
445 dclk = 1;
446 uint64_t percent_x10 = ops * 1000 / dclk;
David Hendricks7d48ac52018-03-09 14:30:38 -0800447 printf(" Node %d, LMC%d: ops %llu, cycles %llu, used %llu.%llu%%\n",
David Hendricks2004b932018-03-09 13:58:27 -0800448 node, i, ops, dclk, percent_x10 / 10, percent_x10 % 10);
449 }
450 }
451 }
452 if (flags & BDK_DRAM_TEST_USE_CCPI)
453 {
454 /* Display CCPI load */
455 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
456 {
457 if (flags & (1 << node))
458 {
459 for (int link = 0; link < 3; link++)
460 {
461 uint64_t busy = stop_ccpi_data[node][link] - start_ccpi_data[node][link];
462 busy += stop_ccpi_err[node][link] - start_ccpi_err[node][link];
463 uint64_t total = stop_ccpi_idle[node][link] - start_ccpi_idle[node][link];
464 total += busy;
465 if (total == 0)
466 continue;
467 uint64_t percent_x10 = busy * 1000 / total;
David Hendricks7d48ac52018-03-09 14:30:38 -0800468 printf(" Node %d, CCPI%d: busy %llu, total %llu, used %llu.%llu%%\n",
David Hendricks2004b932018-03-09 13:58:27 -0800469 node, link, busy, total, percent_x10 / 10, percent_x10 % 10);
470 }
471 }
472 }
473 }
474 }
475 return dram_test_thread_errors;
476}
477
478/**
479 * Perform a memory test.
480 *
481 * @param test Test type to run
482 * @param start_address
483 * Physical address to start at
484 * @param length Length of memory block
485 * @param flags Flags to control memory test options. Zero defaults to testing all
486 * node with statistics and progress output.
487 *
488 * @return Number of errors found. Zero is success. Negative means the test
489 * did not run due to some other failure.
490 */
491int bdk_dram_test(int test, uint64_t start_address, uint64_t length, bdk_dram_test_flags_t flags)
492{
493 /* These limits are arbitrary. They just make sure we aren't doing something
494 silly, like test a non cache line aligned memory region */
495 if (start_address & 0xffff)
496 {
497 bdk_error("DRAM test start address must be aligned on a 64KB boundary\n");
498 return -1;
499 }
500 if (length & 0xffff)
501 {
502 bdk_error("DRAM test length must be a multiple of 64KB\n");
503 return -1;
504 }
505
506 const char *name = bdk_dram_get_test_name(test);
507 if (name == NULL)
508 {
509 bdk_error("Invalid DRAM test number %d\n", test);
510 return -1;
511 }
512
513 /* If no nodes are selected assume the user meant all nodes */
514 if ((flags & (BDK_DRAM_TEST_NODE0 | BDK_DRAM_TEST_NODE1 | BDK_DRAM_TEST_NODE2 | BDK_DRAM_TEST_NODE3)) == 0)
515 flags |= BDK_DRAM_TEST_NODE0 | BDK_DRAM_TEST_NODE1 | BDK_DRAM_TEST_NODE2 | BDK_DRAM_TEST_NODE3;
516
517 /* Remove nodes from the flags that don't exist */
518 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
519 {
520 if (flags & BDK_DRAM_TEST_USE_CCPI)
521 {
522 if (!bdk_numa_exists(node ^ 1))
523 flags &= ~(1 << node);
524 }
525 else
526 {
527 if (!bdk_numa_exists(node))
528 flags &= ~(1 << node);
529 }
530 }
531
532
533 /* Make sure the start address is higher that the BDK's active range */
534 uint64_t top_of_bdk = bdk_dram_get_top_of_bdk();
535 if (start_address < top_of_bdk)
536 start_address = top_of_bdk;
537
538 /* Clear ECC error counters before starting the test */
539 for (int chan = 0; chan < BDK_MAX_MEM_CHANS; chan++) {
David Hendricks7d48ac52018-03-09 14:30:38 -0800540 bdk_atomic_set64(&__bdk_dram_ecc_single_bit_errors[chan], 0);
541 bdk_atomic_set64(&__bdk_dram_ecc_double_bit_errors[chan], 0);
David Hendricks2004b932018-03-09 13:58:27 -0800542 }
543
544 /* Make sure at least one core from each node is running */
David Hendricks7d48ac52018-03-09 14:30:38 -0800545 /* FIXME(dhendrix): we only care about core0 on node0 for now */
546#if 0
David Hendricks2004b932018-03-09 13:58:27 -0800547 for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
548 {
549 if (flags & (1<<node))
550 {
551 int use_node = (flags & BDK_DRAM_TEST_USE_CCPI) ? node ^ 1 : node;
552 if (bdk_get_running_coremask(use_node) == 0)
553 bdk_init_cores(use_node, 1);
554 }
555 }
David Hendricks7d48ac52018-03-09 14:30:38 -0800556#endif
David Hendricks2004b932018-03-09 13:58:27 -0800557
558 /* This returns any data compare errors found */
559 int errors = __bdk_dram_run_test(&TEST_INFO[test], start_address, length, flags);
560
David Hendricks2004b932018-03-09 13:58:27 -0800561 /* Check ECC error counters after the test */
562 int64_t ecc_single = 0;
563 int64_t ecc_double = 0;
564 int64_t ecc_single_errs[BDK_MAX_MEM_CHANS];
565 int64_t ecc_double_errs[BDK_MAX_MEM_CHANS];
566
567 for (int chan = 0; chan < BDK_MAX_MEM_CHANS; chan++) {
568 ecc_single += (ecc_single_errs[chan] = bdk_atomic_get64(&__bdk_dram_ecc_single_bit_errors[chan]));
569 ecc_double += (ecc_double_errs[chan] = bdk_atomic_get64(&__bdk_dram_ecc_double_bit_errors[chan]));
570 }
571
572 /* Always print any ECC errors */
573 if (ecc_single || ecc_double)
574 {
David Hendricks7d48ac52018-03-09 14:30:38 -0800575 printf("Test \"%s\": ECC errors, %lld/%lld/%lld/%lld corrected, %lld/%lld/%lld/%lld uncorrected\n",
576 name,
577 ecc_single_errs[0], ecc_single_errs[1], ecc_single_errs[2], ecc_single_errs[3],
578 ecc_double_errs[0], ecc_double_errs[1], ecc_double_errs[2], ecc_double_errs[3]);
David Hendricks2004b932018-03-09 13:58:27 -0800579 }
580 if (errors || ecc_double || ecc_single) {
David Hendricks7d48ac52018-03-09 14:30:38 -0800581 printf("Test \"%s\": FAIL: %lld single, %lld double, %d compare errors\n",
582 name, ecc_single, ecc_double, errors);
David Hendricks2004b932018-03-09 13:58:27 -0800583 }
584 else
585 BDK_TRACE(DRAM_TEST, "Test \"%s\": PASS\n", name);
586
587 return (errors + ecc_double + ecc_single);
588}
589
/**
 * Report a DRAM address in decoded format.
 *
 * @param address Physical address the error occurred at
 * @param buffer  Receives the formatted text
 * @param len     Size of buffer in bytes. Longer output is truncated by
 *                snprintf().
 */
static void __bdk_dram_report_address_decode(uint64_t address, char *buffer, int len)
{
    int node, lmc, dimm, prank, lrank, bank, row, col;

    bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col);

    /* The address is cast so the argument matches %llx no matter which
       integer type uint64_t is built on */
    snprintf(buffer, len, "[0x%011llx] (N%d,LMC%d,DIMM%d,Rank%d/%d,Bank%02d,Row 0x%05x,Col 0x%04x)",
        (unsigned long long)address, node, lmc, dimm, prank, lrank, bank, row, col);
}
605
/**
 * Report a DRAM address in a new decoded format.
 *
 * The text names the byte lane that differs: 0-7 when exactly one lane has
 * bad bits, 8 when no lane differs, and 9 when more than one lane differs.
 * With a single bad lane only that lane's bits are printed, otherwise the
 * full XOR value is printed (zero when nothing differs).
 *
 * @param address  Physical address the error occurred at
 * @param orig_xor XOR of data read vs expected data
 * @param buffer   Receives the formatted text
 * @param len      Size of buffer in bytes. Longer output is truncated by
 *                 snprintf().
 */
static void __bdk_dram_report_address_decode_new(uint64_t address, uint64_t orig_xor, char *buffer, int len)
{
    int node, lmc, dimm, prank, lrank, bank, row, col;

    int byte = 8; // means no byte-lanes in error, should not happen
    uint64_t print_bits = 0;
    uint64_t remaining = orig_xor;

    // find the byte-lane(s) with errors
    for (int lane = 0; lane < 8; lane++) {
        const uint64_t lane_bits = remaining & 0xffULL;
        remaining >>= 8;
        if (!lane_bits)
            continue;
        if (byte != 8) {
            byte = 9; // means more than 1 byte-lane was present
            print_bits = orig_xor; // print the full original
            break; // quit now
        }
        byte = lane; // keep checking
        print_bits = lane_bits;
    }

    bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col);

    /* Both 64 bit values are cast so the arguments match %llx no matter
       which integer type uint64_t is built on */
    snprintf(buffer, len, "N%d.LMC%d: CMP byte %d xor 0x%02llx (DIMM%d,Rank%d/%d,Bank%02d,Row 0x%05x,Col 0x%04x)[0x%011llx]",
        node, lmc, byte, (unsigned long long)print_bits, dimm, prank, lrank, bank, row, col,
        (unsigned long long)address);
}
642
643/**
644 * Report a DRAM error. Errors are not shown after MAX_ERRORS_TO_REPORT is
645 * exceeded. Used when a single address is involved in the failure.
646 *
647 * @param address Physical address the error occurred at
648 * @param data Data read from memory
649 * @param correct Correct data
650 * @param burst Which burst this is from, informational only
Peter Lemenkov7bbe3bb2018-12-07 11:23:21 +0100651 * @param fails -1 for no retries done, >= 0 number of failures during retries
David Hendricks2004b932018-03-09 13:58:27 -0800652 *
653 * @return Zero if a message was logged, non-zero if the error limit has been reached
654 */
655void __bdk_dram_report_error(uint64_t address, uint64_t data, uint64_t correct, int burst, int fails)
656{
657 char buffer[128];
658 char failbuf[32];
659 int64_t errors = bdk_atomic_fetch_and_add64(&dram_test_thread_errors, 1);
660 uint64_t xor = data ^ correct;
661
662 if (errors < MAX_ERRORS_TO_REPORT)
663 {
David Hendricks7d48ac52018-03-09 14:30:38 -0800664 if (fails < 0) {
665 snprintf(failbuf, sizeof(failbuf), " ");
666 } else {
David Hendricks2004b932018-03-09 13:58:27 -0800667 int percent_x10 = fails * 1000 / RETRY_LIMIT;
David Hendricks7d48ac52018-03-09 14:30:38 -0800668 snprintf(failbuf, sizeof(failbuf), ", retries failed %3d.%d%%",
David Hendricks2004b932018-03-09 13:58:27 -0800669 percent_x10 / 10, percent_x10 % 10);
David Hendricks7d48ac52018-03-09 14:30:38 -0800670 }
David Hendricks2004b932018-03-09 13:58:27 -0800671
David Hendricks7d48ac52018-03-09 14:30:38 -0800672 __bdk_dram_report_address_decode_new(address, xor, buffer, sizeof(buffer));
David Hendricks2004b932018-03-09 13:58:27 -0800673 bdk_error("%s%s\n", buffer, failbuf);
674
675 if (errors == MAX_ERRORS_TO_REPORT-1)
676 bdk_error("No further DRAM errors will be reported\n");
677 }
678 return;
679}
680
681/**
682 * Report a DRAM error. Errors are not shown after MAX_ERRORS_TO_REPORT is
683 * exceeded. Used when two addresses might be involved in the failure.
684 *
685 * @param address1 First address involved in the failure
686 * @param data1 Data from the first address
687 * @param address2 Second address involved in the failure
688 * @param data2 Data from second address
689 * @param burst Which burst this is from, informational only
Peter Lemenkov7bbe3bb2018-12-07 11:23:21 +0100690 * @param fails -1 for no retries done, >= 0 number of failures during retries
David Hendricks2004b932018-03-09 13:58:27 -0800691 *
692 * @return Zero if a message was logged, non-zero if the error limit has been reached
693 */
694void __bdk_dram_report_error2(uint64_t address1, uint64_t data1, uint64_t address2, uint64_t data2,
David Hendricks7d48ac52018-03-09 14:30:38 -0800695 int burst, int fails)
David Hendricks2004b932018-03-09 13:58:27 -0800696{
697 int64_t errors = bdk_atomic_fetch_and_add64(&dram_test_thread_errors, 1);
698 if (errors < MAX_ERRORS_TO_REPORT)
699 {
David Hendricks7d48ac52018-03-09 14:30:38 -0800700 char buffer1[80], buffer2[80];
701 char failbuf[32];
David Hendricks2004b932018-03-09 13:58:27 -0800702
David Hendricks7d48ac52018-03-09 14:30:38 -0800703 if (fails < 0) {
704 snprintf(failbuf, sizeof(failbuf), " ");
705 } else {
706 snprintf(failbuf, sizeof(failbuf), ", retried %d failed %d", RETRY_LIMIT, fails);
707 }
708 __bdk_dram_report_address_decode(address1, buffer1, sizeof(buffer1));
709 __bdk_dram_report_address_decode(address2, buffer2, sizeof(buffer2));
David Hendricks2004b932018-03-09 13:58:27 -0800710
David Hendricks7d48ac52018-03-09 14:30:38 -0800711 bdk_error("compare: data1: 0x%016llx, xor: 0x%016llx%s\n"
712 " %s\n %s\n",
713 data1, data1 ^ data2, failbuf,
714 buffer1, buffer2);
David Hendricks2004b932018-03-09 13:58:27 -0800715
716 if (errors == MAX_ERRORS_TO_REPORT-1)
717 bdk_error("No further DRAM errors will be reported\n");
718 }
719 return;
720}
721
722/* Report the circumstances of a failure and try re-reading the memory
723 * location to see if the error is transient or permanent.
724 *
725 * Note: re-reading requires using evicting addresses
726 */
727int __bdk_dram_retry_failure(int burst, uint64_t address, uint64_t data, uint64_t expected)
728{
729 int refail = 0;
730
731 // bypass the retries if we are already over the limit...
732 if (bdk_atomic_get64(&dram_test_thread_errors) < MAX_ERRORS_TO_REPORT) {
733
David Hendricks7d48ac52018-03-09 14:30:38 -0800734 /* Try re-reading the memory location. A transient error may fail
735 * on one read and work on another. Keep on retrying even when a
736 * read succeeds.
737 */
738 for (int i = 0; i < RETRY_LIMIT; i++) {
David Hendricks2004b932018-03-09 13:58:27 -0800739
David Hendricks7d48ac52018-03-09 14:30:38 -0800740 __bdk_dram_flush_to_mem(address);
741 BDK_DCACHE_INVALIDATE;
David Hendricks2004b932018-03-09 13:58:27 -0800742
David Hendricks7d48ac52018-03-09 14:30:38 -0800743 uint64_t new = __bdk_dram_read64(address);
David Hendricks2004b932018-03-09 13:58:27 -0800744
David Hendricks7d48ac52018-03-09 14:30:38 -0800745 if (new != expected) {
746 refail++;
747 }
748 }
David Hendricks2004b932018-03-09 13:58:27 -0800749 } else
David Hendricks7d48ac52018-03-09 14:30:38 -0800750 refail = -1;
David Hendricks2004b932018-03-09 13:58:27 -0800751
752 // this will increment the errors always, but maybe not print...
753 __bdk_dram_report_error(address, data, expected, burst, refail);
754
755 return 1;
756}
757
758/**
759 * retry_failure2
760 *
761 * @param burst
762 * @param address1
763 * @param address2
764 */
765int __bdk_dram_retry_failure2(int burst, uint64_t address1, uint64_t data1, uint64_t address2, uint64_t data2)
766{
767 int refail = 0;
768
769 // bypass the retries if we are already over the limit...
770 if (bdk_atomic_get64(&dram_test_thread_errors) < MAX_ERRORS_TO_REPORT) {
771
David Hendricks7d48ac52018-03-09 14:30:38 -0800772 for (int i = 0; i < RETRY_LIMIT; i++) {
773 __bdk_dram_flush_to_mem(address1);
774 __bdk_dram_flush_to_mem(address2);
775 BDK_DCACHE_INVALIDATE;
David Hendricks2004b932018-03-09 13:58:27 -0800776
David Hendricks7d48ac52018-03-09 14:30:38 -0800777 uint64_t d1 = __bdk_dram_read64(address1);
778 uint64_t d2 = __bdk_dram_read64(address2);
David Hendricks2004b932018-03-09 13:58:27 -0800779
David Hendricks7d48ac52018-03-09 14:30:38 -0800780 if (d1 != d2) {
781 refail++;
782 }
783 }
David Hendricks2004b932018-03-09 13:58:27 -0800784 } else
David Hendricks7d48ac52018-03-09 14:30:38 -0800785 refail = -1;
David Hendricks2004b932018-03-09 13:58:27 -0800786
787 // this will increment the errors always, but maybe not print...
788 __bdk_dram_report_error2(address1, data1, address2, data2, burst, refail);
789
790 return 1;
791}
792
793/**
794 * Inject a DRAM error at a specific address in memory. The injection can either
795 * be a single bit inside the byte, or a double bit error in the ECC byte. Double
796 * bit errors may corrupt memory, causing software to crash. The corruption is
797 * written to memory and will continue to exist until the cache line is written
798 * again. After a call to this function, the BDK should report a ECC error. Double
799 * bit errors corrupt bits 0-1.
800 *
801 * @param address Physical address to corrupt. Any byte alignment is supported
802 * @param bit Bit to corrupt in the byte (0-7), or -1 to create a double bit fault in the ECC
803 * byte.
804 */
805void bdk_dram_test_inject_error(uint64_t address, int bit)
806{
807 uint64_t aligned_address = address & -16;
808 int corrupt_bit = -1;
809 if (bit >= 0)
810 corrupt_bit = (address & 0xf) * 8 + bit;
811
812 /* Extract the DRAM controller information */
813 int node, lmc, dimm, prank, lrank, bank, row, col;
814 bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col);
815
816 /* Read the current data */
817 uint64_t data = __bdk_dram_read64(aligned_address);
818
819 /* Program LMC to inject the error */
820 if ((corrupt_bit >= 0) && (corrupt_bit < 64))
821 BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 1ull << corrupt_bit);
822 else if (bit == -1)
823 BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 3);
824 else
825 BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 0);
826 if (corrupt_bit >= 64)
827 BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 1ull << (corrupt_bit - 64));
828 else
829 BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 0);
830 BDK_CSR_MODIFY(c, node, BDK_LMCX_ECC_PARITY_TEST(lmc),
831 c.s.ecc_corrupt_idx = (address & 0x7f) >> 4;
832 c.s.ecc_corrupt_ena = 1);
833 BDK_CSR_READ(node, BDK_LMCX_ECC_PARITY_TEST(lmc));
834
835 /* Perform a write and push it to DRAM. This creates the error */
836 __bdk_dram_write64(aligned_address, data);
837 __bdk_dram_flush_to_mem(aligned_address);
838
839 /* Disable error injection */
840 BDK_CSR_MODIFY(c, node, BDK_LMCX_ECC_PARITY_TEST(lmc),
841 c.s.ecc_corrupt_ena = 0);
842 BDK_CSR_READ(node, BDK_LMCX_ECC_PARITY_TEST(lmc));
843 BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 0);
844 BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 0);
845
846 /* Read back the data, which should now cause an error */
David Hendricks7d48ac52018-03-09 14:30:38 -0800847 printf("Loading the injected error address 0x%llx, node=%d, lmc=%d, dimm=%d, rank=%d/%d, bank=%d, row=%d, col=%d\n",
David Hendricks2004b932018-03-09 13:58:27 -0800848 address, node, lmc, dimm, prank, lrank, bank, row, col);
849 __bdk_dram_read64(aligned_address);
850}