| /***********************license start*********************************** |
| * Copyright (c) 2003-2017 Cavium Inc. (support@cavium.com). All rights |
| * reserved. |
| * |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer in the documentation and/or other materials provided |
| * with the distribution. |
| * |
| * * Neither the name of Cavium Inc. nor the names of |
| * its contributors may be used to endorse or promote products |
| * derived from this software without specific prior written |
| * permission. |
| * |
| * This Software, including technical data, may be subject to U.S. export |
| * control laws, including the U.S. Export Administration Act and its |
| * associated regulations, and may be subject to export or import |
| * regulations in other countries. |
| * |
| * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS" |
| * AND WITH ALL FAULTS AND CAVIUM INC. MAKES NO PROMISES, REPRESENTATIONS OR |
| * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT |
| * TO THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY |
| * REPRESENTATION OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT |
| * DEFECTS, AND CAVIUM SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES |
| * OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR |
| * PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, |
| * QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. THE ENTIRE RISK |
| * ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE LIES WITH YOU. |
| ***********************license end**************************************/ |
| #include <bdk.h> |
| #include "dram-internal.h" |
| |
| #include <string.h> |
| #include <lame_string.h> /* for strtoul */ |
| #include <libbdk-hal/bdk-atomic.h> |
| #include <libbdk-hal/bdk-clock.h> |
| #include <libbdk-hal/bdk-rng.h> |
| #include <libbdk-os/bdk-init.h> |
| |
| // if enhanced verbosity levels are defined, use them |
| #if defined(VB_PRT) |
| #define ddr_print2(format, ...) VB_PRT(VBL_FAE, format, ##__VA_ARGS__) |
| #define ddr_print3(format, ...) VB_PRT(VBL_TME, format, ##__VA_ARGS__) |
| #define ddr_print4(format, ...) VB_PRT(VBL_DEV, format, ##__VA_ARGS__) |
| #define ddr_print5(format, ...) VB_PRT(VBL_DEV3, format, ##__VA_ARGS__) |
#else
#define ddr_print2 ddr_print
#define ddr_print3 ddr_print
#define ddr_print4 ddr_print
#define ddr_print5 ddr_print
#endif
| |
| static int64_t test_dram_byte_threads_done; |
| static uint64_t test_dram_byte_threads_errs; |
| static uint64_t test_dram_byte_lmc_errs[4]; |
| |
| #if 0 |
| /* |
| * Suggested testing patterns. |
| */ |
| static const uint64_t test_pattern_2[] = { |
| 0xFFFFFFFFFFFFFFFFULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0xFFFFFFFFFFFFFFFFULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0xFFFFFFFFFFFFFFFFULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0xFFFFFFFFFFFFFFFFULL, |
| 0x5555555555555555ULL, |
| 0xFFFFFFFFFFFFFFFFULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xFFFFFFFFFFFFFFFFULL, |
| 0x5555555555555555ULL, |
| }; |
| /* |
| * or possibly |
| */ |
| static const uint64_t test_pattern_3[] = { |
| 0xFDFDFDFDFDFDFDFDULL, |
| 0x8787878787878787ULL, |
| 0xFEFEFEFEFEFEFEFEULL, |
| 0xC3C3C3C3C3C3C3C3ULL, |
| 0x7F7F7F7F7F7F7F7FULL, |
| 0xE1E1E1E1E1E1E1E1ULL, |
| 0xBFBFBFBFBFBFBFBFULL, |
| 0xF0F0F0F0F0F0F0F0ULL, |
| 0xDFDFDFDFDFDFDFDFULL, |
| 0x7878787878787878ULL, |
| 0xEFEFEFEFEFEFEFEFULL, |
| 0x3C3C3C3C3C3C3C3CULL, |
| 0xF7F7F7F7F7F7F7F7ULL, |
| 0x1E1E1E1E1E1E1E1EULL, |
| 0xFBFBFBFBFBFBFBFBULL, |
| 0x0F0F0F0F0F0F0F0FULL, |
| }; |
| |
| static const uint64_t test_pattern_1[] = { |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| #if 0 // only need a cacheline size |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| 0xAAAAAAAAAAAAAAAAULL, |
| 0x5555555555555555ULL, |
| #endif |
| }; |
| |
| // setup default for test pattern array |
| static const uint64_t *dram_tune_test_pattern = test_pattern_1; |
| #endif |
| |
// Set this to 1 to shorten testing by exiting as soon as all byte lanes have errors.
// Leaving it at 0 forces the testing to cover the entire range on every iteration,
// hopefully ensuring an even load on the memory subsystem.
| #define EXIT_WHEN_ALL_LANES_HAVE_ERRORS 0 |
| |
#define DEFAULT_TEST_BURSTS 5 // FIXME: this is what works so far... (was 7)
| int dram_tune_use_bursts = DEFAULT_TEST_BURSTS; |
| |
// dram_tune_rank_offset is used to offset the second area used in test_dram_mem_xor.
//
// With a single-rank DIMM, the offset will be 256MB from the start of the first area,
// which is more than enough for the restricted looping/address range actually tested...
//
// With a 2-rank DIMM, the offset will be the size of a rank's address space, so the effect
// will be to have the first and second areas in different ranks on the same DIMM.
//
// So, we default this to single-rank, and it will be overridden when 2 ranks are detected.
//
| |
// FIXME: ASSUME that we have DIMMs no less than 4GB in size
| |
| // offset to first area that avoids any boot stuff in low range (below 256MB) |
| #define AREA_BASE_OFFSET (1ULL << 28) // bit 28 always ON |
| |
| // offset to duplicate area; may coincide with rank 1 base address for 2-rank 4GB DIMM |
| #define AREA_DUPE_OFFSET (1ULL << 31) // bit 31 always ON |
| |
| // defaults to DUPE, but will be set elsewhere to offset to next RANK if multi-rank DIMM |
| static uint64_t dram_tune_rank_offset = AREA_DUPE_OFFSET; // default |
| |
| // defaults to 0, but will be set elsewhere to the address offset to next DIMM if multi-slot |
| static uint64_t dram_tune_dimm_offset = 0; // default |
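
// Illustrative example (hypothetical config values, not taken from real
// hardware): auto_set_dll_offset() below recomputes the rank offset as
//     1ull << (28 + pbank_lsb - rank_ena + (num_lmcs/2))
// so an LMC0 with pbank_lsb = 4 and rank_ena = 1 on a 2-LMC node would give
// 1ull << 32 = 4GB, placing the duplicate area in rank 1 while the base
// area (AREA_BASE_OFFSET = 256MB) stays low in rank 0.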
| |
| |
| static int speed_bin_offset[3] = {25, 20, 15}; |
| static int speed_bin_winlen[3] = {70, 60, 60}; |
| |
| static int |
| get_speed_bin(bdk_node_t node, int lmc) |
| { |
| uint32_t mts_speed = (libdram_get_freq_from_pll(node, lmc) / 1000000) * 2; |
| int ret = 0; |
| |
| // FIXME: is this reasonable speed "binning"? |
| if (mts_speed >= 1700) { |
| if (mts_speed >= 2000) |
| ret = 2; |
| else |
| ret = 1; |
| } |
| |
| debug_print("N%d.LMC%d: %s: returning bin %d for MTS %d\n", |
| node, lmc, __func__, ret, mts_speed); |
| |
| return ret; |
| } |
| |
| static int is_low_risk_offset(int speed_bin, int offset) |
| { |
| return (_abs(offset) <= speed_bin_offset[speed_bin]); |
| } |
| static int is_low_risk_winlen(int speed_bin, int winlen) |
| { |
| return (winlen >= speed_bin_winlen[speed_bin]); |
| } |
| |
| #define ENABLE_PREFETCH 0 |
| #define ENABLE_WBIL2 1 |
| #define ENABLE_SBLKDTY 0 |
| |
| #define BDK_SYS_CVMCACHE_INV_L2 "#0,c11,c1,#1" // L2 Cache Invalidate |
| #define BDK_CACHE_INV_L2(address) { asm volatile ("sys " BDK_SYS_CVMCACHE_INV_L2 ", %0" : : "r" (address)); } |
| |
| int dram_tuning_mem_xor(bdk_node_t node, int lmc, uint64_t p, uint64_t bitmask, uint64_t *xor_data) |
| { |
| uint64_t p1, p2, d1, d2; |
| uint64_t v, v1; |
    uint64_t p2offset = 0x10000000; // FIXME? was: dram_tune_rank_offset
| uint64_t datamask; |
| uint64_t xor; |
| uint64_t i, j, k; |
| uint64_t ii; |
| int errors = 0; |
| //uint64_t index; |
| uint64_t pattern1 = bdk_rng_get_random64(); |
| uint64_t pattern2 = 0; |
| uint64_t bad_bits[2] = {0,0}; |
| |
| #if ENABLE_SBLKDTY |
| BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 0); |
| #endif |
| |
    // A byte lane may be cleared in the mask to indicate no testing on that lane.
| datamask = bitmask; |
| |
| // final address must include LMC and node |
| p |= (lmc<<7); /* Map address into proper interface */ |
| p = bdk_numa_get_address(node, p); /* Map to node */ |
| |
| /* Add offset to both test regions to not clobber boot stuff |
| * when running from L2 for NAND boot. |
| */ |
| p += AREA_BASE_OFFSET; // make sure base is out of the way of boot |
| |
| #define II_INC (1ULL << 29) |
| #define II_MAX (1ULL << 31) |
| #define K_INC (1ULL << 14) |
| #define K_MAX (1ULL << 20) |
| #define J_INC (1ULL << 9) |
| #define J_MAX (1ULL << 12) |
| #define I_INC (1ULL << 3) |
| #define I_MAX (1ULL << 7) |
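
// For reference, the strides above walk each area as:
//   ii: 4 blocks,     512MB apart (II_MAX/II_INC = 2^31/2^29)
//   k : 64 strides,    16KB apart (2^20/2^14)
//   j : 8 cachelines,  512B apart (2^12/2^9)
//   i : 16 dwords,       8B apart (2^7/2^3), i.e. one 128-byte cacheline
// so 4*64*8 = 2048 cachelines (256KB) are touched per area, sparsely
// spread across a 2GB span.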
| |
| debug_print("N%d.LMC%d: dram_tuning_mem_xor: phys_addr=0x%lx\n", |
| node, lmc, p); |
| |
| #if 0 |
| int ix; |
| // add this loop to fill memory with the test pattern first |
| // loops are ordered so that only entire cachelines are written |
| for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!! |
| for (k = 0; k < K_MAX; k += K_INC) { |
| for (j = 0; j < J_MAX; j += J_INC) { |
| p1 = p + ii + k + j; |
| p2 = p1 + p2offset; |
| for (i = 0, ix = 0; i < I_MAX; i += I_INC, ix++) { |
| |
| v = dram_tune_test_pattern[ix]; |
| v1 = v; // write the same thing to both areas |
| |
| __bdk_dram_write64(p1 + i, v); |
| __bdk_dram_write64(p2 + i, v1); |
| |
| } |
| #if ENABLE_WBIL2 |
| BDK_CACHE_WBI_L2(p1); |
| BDK_CACHE_WBI_L2(p2); |
| #endif |
| } |
| } |
| } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */ |
| #endif |
| |
| #if ENABLE_PREFETCH |
| BDK_PREFETCH(p , BDK_CACHE_LINE_SIZE); |
| BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE); |
| #endif |
| |
| // loops are ordered so that only a single 64-bit slot is written to each cacheline at one time, |
| // then the cachelines are forced out; this should maximize read/write traffic |
| for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!! |
| for (k = 0; k < K_MAX; k += K_INC) { |
| for (i = 0; i < I_MAX; i += I_INC) { |
| for (j = 0; j < J_MAX; j += J_INC) { |
| |
| p1 = p + ii + k + j; |
| p2 = p1 + p2offset; |
| |
| #if ENABLE_PREFETCH |
| if (j < (J_MAX - J_INC)) { |
| BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE); |
| BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE); |
| } |
| #endif |
| |
| v = pattern1 * (p1 + i); |
| v1 = v; // write the same thing to both areas |
| |
| __bdk_dram_write64(p1 + i, v); |
| __bdk_dram_write64(p2 + i, v1); |
| |
| #if ENABLE_WBIL2 |
| BDK_CACHE_WBI_L2(p1); |
| BDK_CACHE_WBI_L2(p2); |
| #endif |
| } |
| } |
| } |
| } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */ |
| |
| BDK_DCACHE_INVALIDATE; |
| |
| debug_print("N%d.LMC%d: dram_tuning_mem_xor: done INIT loop\n", |
| node, lmc); |
| |
| /* Make a series of passes over the memory areas. */ |
| |
    for (int burst = 0; burst < 1 /* FIXME: was dram_tune_use_bursts */; burst++)
| { |
| uint64_t this_pattern = bdk_rng_get_random64(); |
| pattern2 ^= this_pattern; |
| |
| /* XOR the data with a random value, applying the change to both |
| * memory areas. |
| */ |
| #if ENABLE_PREFETCH |
| BDK_PREFETCH(p , BDK_CACHE_LINE_SIZE); |
| BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE); |
| #endif |
| |
| for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!! |
| for (k = 0; k < K_MAX; k += K_INC) { |
| for (i = 0; i < I_MAX; i += I_INC) { // FIXME: rearranged, did not make much difference? |
| for (j = 0; j < J_MAX; j += J_INC) { |
| |
| p1 = p + ii + k + j; |
| p2 = p1 + p2offset; |
| |
| #if ENABLE_PREFETCH |
| if (j < (J_MAX - J_INC)) { |
| BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE); |
| BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE); |
| } |
| #endif |
| |
| v = __bdk_dram_read64(p1 + i) ^ this_pattern; |
| v1 = __bdk_dram_read64(p2 + i) ^ this_pattern; |
| |
| #if ENABLE_WBIL2 |
| BDK_CACHE_INV_L2(p1); |
| BDK_CACHE_INV_L2(p2); |
| #endif |
| |
| __bdk_dram_write64(p1 + i, v); |
| __bdk_dram_write64(p2 + i, v1); |
| |
| #if ENABLE_WBIL2 |
| BDK_CACHE_WBI_L2(p1); |
| BDK_CACHE_WBI_L2(p2); |
| #endif |
| } |
| } |
| } |
| } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */ |
| |
| BDK_DCACHE_INVALIDATE; |
| |
| debug_print("N%d.LMC%d: dram_tuning_mem_xor: done MODIFY loop\n", |
| node, lmc); |
| |
| #if ENABLE_PREFETCH |
| BDK_PREFETCH(p , BDK_CACHE_LINE_SIZE); |
| BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE); |
| #endif |
| |
| /* Look for differences in the areas. If there is a mismatch, reset |
| * both memory locations with the same pattern. Failing to do so |
| * means that on all subsequent passes the pair of locations remain |
| * out of sync giving spurious errors. |
| */ |
        // FIXME: change the loop order so that an entire cacheline is compared at one time.
        // This is so that a read error that occurs *anywhere* on the cacheline will be caught,
        // rather than comparing only 1 cacheline slot at a time, where an error on a different
        // slot would be missed that time around.
| |
| for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!! |
| for (k = 0; k < K_MAX; k += K_INC) { |
| for (j = 0; j < J_MAX; j += J_INC) { |
| |
| p1 = p + ii + k + j; |
| p2 = p1 + p2offset; |
| |
| #if ENABLE_PREFETCH |
| if (j < (J_MAX - J_INC)) { |
| BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE); |
| BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE); |
| } |
| #endif |
| |
| // process entire cachelines in the innermost loop |
| for (i = 0; i < I_MAX; i += I_INC) { |
| |
| v = ((p1 + i) * pattern1) ^ pattern2; // FIXME: this should predict what we find...??? |
| d1 = __bdk_dram_read64(p1 + i); |
| d2 = __bdk_dram_read64(p2 + i); |
| |
| xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits only in active byte lanes |
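                    // Example: a bit flip in byte lane 2 of d1 sets bits in
                    // <23:16> of (d1 ^ v); OR-ing in (d2 ^ v) folds in any
                    // mismatch from the duplicate area, and datamask drops
                    // lanes not under test. The walk below then converts the
                    // per-bit differences into a per-lane error mask (bit b
                    // of 'errors' set => byte lane b failed).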
| |
| if (!xor) |
| continue; |
| |
| // accumulate bad bits |
| bad_bits[0] |= xor; |
| //bad_bits[1] |= ~mpr_data1 & 0xffUL; // cannot do ECC here |
| |
| int bybit = 1; |
| uint64_t bymsk = 0xffULL; // start in byte lane 0 |
| while (xor != 0) { |
| debug_print("ERROR(%03d): [0x%016lX] [0x%016lX] expected 0x%016lX d1 %016lX d2 %016lX\n", |
| burst, p1, p2, v, d1, d2); |
| if (xor & bymsk) { // error(s) in this lane |
| errors |= bybit; // set the byte error bit |
| xor &= ~bymsk; // clear byte lane in error bits |
| datamask &= ~bymsk; // clear the byte lane in the mask |
| #if EXIT_WHEN_ALL_LANES_HAVE_ERRORS |
| if (datamask == 0) { // nothing left to do |
| return errors; // completely done when errors found in all byte lanes in datamask |
| } |
| #endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */ |
| } |
| bymsk <<= 8; // move mask into next byte lane |
| bybit <<= 1; // move bit into next byte position |
| } |
| } |
| #if ENABLE_WBIL2 |
| BDK_CACHE_WBI_L2(p1); |
| BDK_CACHE_WBI_L2(p2); |
| #endif |
| } |
| } |
| } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */ |
| |
| debug_print("N%d.LMC%d: dram_tuning_mem_xor: done TEST loop\n", |
| node, lmc); |
| |
| } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */ |
| |
| if (xor_data != NULL) { // send the bad bits back... |
| xor_data[0] = bad_bits[0]; |
| xor_data[1] = bad_bits[1]; // let it be zeroed |
| } |
| |
| #if ENABLE_SBLKDTY |
| BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 1); |
| #endif |
| |
| return errors; |
| } |
| |
| #undef II_INC |
| #undef II_MAX |
| |
| #define EXTRACT(v, lsb, width) (((v) >> (lsb)) & ((1ull << (width)) - 1)) |
| #define LMCNO(address, xbits) (EXTRACT(address, 7, xbits) ^ EXTRACT(address, 20, xbits) ^ EXTRACT(address, 12, xbits)) |
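
// Sanity example for LMCNO(): with 2 LMCs (xbits = 1) the owning LMC is
// addr<7> ^ addr<20> ^ addr<12>, so address 0x000 maps to LMC0 and 0x080
// to LMC1; with 4 LMCs (xbits = 2) it becomes
// addr<8:7> ^ addr<21:20> ^ addr<13:12>.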
| |
| // cores to use |
| #define DEFAULT_USE_CORES 44 // FIXME: was (1 << CORE_BITS) |
| int dram_tune_use_cores = DEFAULT_USE_CORES; // max cores to use, override available |
| int dram_tune_max_cores; // max cores available on a node |
| #define CORE_SHIFT 22 // FIXME: offset into rank_address passed to test_dram_byte |
| |
| typedef void (*__dram_tuning_thread_t)(int arg, void *arg1); |
| |
| typedef struct |
| { |
| bdk_node_t node; |
| int64_t num_lmcs; |
| uint64_t byte_mask; |
| } test_dram_byte_info_t; |
| |
| static int dram_tune_use_xor2 = 1; // FIXME: do NOT default to original mem_xor (LMC-based) code |
| |
| static int |
| run_dram_tuning_threads(bdk_node_t node, int num_lmcs, uint64_t bytemask) |
| { |
| test_dram_byte_info_t test_dram_byte_info; |
| test_dram_byte_info_t *test_info = &test_dram_byte_info; |
| int total_count = 0; |
| |
| test_info->node = node; |
| test_info->num_lmcs = num_lmcs; |
| test_info->byte_mask = bytemask; |
| |
| // init some global data |
| bdk_atomic_set64(&test_dram_byte_threads_done, 0); |
| bdk_atomic_set64((int64_t *)&test_dram_byte_threads_errs, 0); |
| bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[0], 0); |
| bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[1], 0); |
| bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[2], 0); |
| bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[3], 0); |
| |
| /* Start threads for cores on the node */ |
| if (bdk_numa_exists(node)) { |
| /* FIXME(dhendrix): We shouldn't hit this. */ |
| die("bdk_numa_exists() is non-zero\n"); |
| } |
| |
| #if 0 |
| /* Wait for threads to finish */ |
| while (bdk_atomic_get64(&test_dram_byte_threads_done) < total_count) |
| bdk_thread_yield(); |
| #else |
| #define TIMEOUT_SECS 5 // FIXME: long enough so a pass for a given setting will not print |
| /* Wait for threads to finish, with progress */ |
| int cur_count; |
| uint64_t cur_time; |
| uint64_t period = bdk_clock_get_rate(bdk_numa_local(), BDK_CLOCK_TIME) * TIMEOUT_SECS; // FIXME? |
| uint64_t timeout = bdk_clock_get_count(BDK_CLOCK_TIME) + period; |
| do { |
| // bdk_thread_yield(); /* FIXME(dhendrix): don't yield... */ |
| cur_count = bdk_atomic_get64(&test_dram_byte_threads_done); |
| cur_time = bdk_clock_get_count(BDK_CLOCK_TIME); |
| if (cur_time >= timeout) { |
| printf("Waiting for %d cores\n", total_count - cur_count); |
| timeout = cur_time + period; |
| } |
| } while (cur_count < total_count); |
| #endif |
| |
| // NOTE: this is the summary of errors across all LMCs |
| return (int)bdk_atomic_get64((int64_t *)&test_dram_byte_threads_errs); |
| } |
| |
| /* These variables count the number of ECC errors. They should only be accessed atomically */ |
| /* FIXME(dhendrix): redundant declaration in original BDK sources */ |
| //extern int64_t __bdk_dram_ecc_single_bit_errors[]; |
| extern int64_t __bdk_dram_ecc_double_bit_errors[]; |
| |
| #define DEFAULT_SAMPLE_GRAN 3 // sample for errors every N offset values |
| #define MIN_BYTE_OFFSET -63 |
| #define MAX_BYTE_OFFSET +63 |
| int dram_tune_use_gran = DEFAULT_SAMPLE_GRAN; |
| |
| static int |
| auto_set_dll_offset(bdk_node_t node, int dll_offset_mode, |
| int num_lmcs, int ddr_interface_64b, |
| int do_tune) |
| { |
| int byte_offset; |
| //unsigned short result[9]; |
| int byte; |
| int byte_delay_start[4][9]; |
| int byte_delay_count[4][9]; |
| uint64_t byte_delay_windows [4][9]; |
| int byte_delay_best_start[4][9]; |
| int byte_delay_best_count[4][9]; |
| //int this_rodt; |
| uint64_t ops_sum[4], dclk_sum[4]; |
| uint64_t start_dram_dclk[4], stop_dram_dclk[4]; |
| uint64_t start_dram_ops[4], stop_dram_ops[4]; |
| int errors, tot_errors; |
| int lmc; |
| const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write"; /* FIXME(dhendrix): const */ |
| int mode_is_read = (dll_offset_mode == 2); |
| const char *mode_blk = (dll_offset_mode == 2) ? " " : ""; /* FIXME(dhendrix): const */ |
| int start_offset, end_offset, incr_offset; |
| |
| int speed_bin = get_speed_bin(node, 0); // FIXME: just get from LMC0? |
| int needs_review_count = 0; |
| |
| if (dram_tune_use_gran != DEFAULT_SAMPLE_GRAN) { |
| ddr_print2("N%d: Changing sample granularity from %d to %d\n", |
| node, DEFAULT_SAMPLE_GRAN, dram_tune_use_gran); |
| } |
| // ensure sample is taken at 0 |
| start_offset = MIN_BYTE_OFFSET - (MIN_BYTE_OFFSET % dram_tune_use_gran); |
| end_offset = MAX_BYTE_OFFSET - (MAX_BYTE_OFFSET % dram_tune_use_gran); |
| incr_offset = dram_tune_use_gran; |
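
    // Example of the rounding above (C truncating %): with the default
    // granularity of 3, -63 % 3 == 0 and samples run -63,-60,...,0,...,+63.
    // With a hypothetical granularity of 4, -63 % 4 == -3 and 63 % 4 == 3,
    // giving start_offset = -60 and end_offset = +60; every sample is a
    // multiple of 4, so offset 0 is still visited.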
| |
| memset(ops_sum, 0, sizeof(ops_sum)); |
| memset(dclk_sum, 0, sizeof(dclk_sum)); |
| memset(byte_delay_start, 0, sizeof(byte_delay_start)); |
| memset(byte_delay_count, 0, sizeof(byte_delay_count)); |
| memset(byte_delay_windows, 0, sizeof(byte_delay_windows)); |
| memset(byte_delay_best_start, 0, sizeof(byte_delay_best_start)); |
| memset(byte_delay_best_count, 0, sizeof(byte_delay_best_count)); |
| |
| // FIXME? consult LMC0 only |
| BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(0)); |
| if (lmcx_config.s.rank_ena) { // replace the default offset when there is more than 1 rank... |
| dram_tune_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2)); |
| /* FIXME(dhendrix): %lx --> %llx */ |
| ddr_print2("N%d: Tuning multiple ranks per DIMM (rank offset 0x%llx).\n", node, dram_tune_rank_offset); |
| } |
| if (lmcx_config.s.init_status & 0x0c) { // bit 2 or 3 set indicates 2 DIMMs |
| dram_tune_dimm_offset = 1ull << (28 + lmcx_config.s.pbank_lsb + (num_lmcs/2)); |
| /* FIXME(dhendrix): %lx --> %llx */ |
| ddr_print2("N%d: Tuning multiple DIMMs per channel (DIMM offset 0x%llx)\n", node, dram_tune_dimm_offset); |
| } |
| |
| // FIXME? do this for LMC0 only |
| //BDK_CSR_INIT(comp_ctl2, node, BDK_LMCX_COMP_CTL2(0)); |
| //this_rodt = comp_ctl2.s.rodt_ctl; |
| |
| // construct the bytemask |
| int bytes_todo = (ddr_interface_64b) ? 0xff : 0x0f; |
| uint64_t bytemask = 0; |
| for (byte = 0; byte < 8; ++byte) { |
| if (bytes_todo & (1 << byte)) { |
            bytemask |= 0xfful << (8*byte); // set that byte's bits in the bytemask
| } |
| } /* for (byte = 0; byte < 8; ++byte) */ |
| |
| // now loop through selected legal values for the DLL byte offset... |
| |
| for (byte_offset = start_offset; byte_offset <= end_offset; byte_offset += incr_offset) { |
| |
| // do the setup on active LMCs |
| for (lmc = 0; lmc < num_lmcs; lmc++) { |
| change_dll_offset_enable(node, lmc, 0); |
| |
| // set all byte lanes at once |
| load_dll_offset(node, lmc, dll_offset_mode, byte_offset, 10 /* All bytes at once */); |
| // but then clear the ECC byte lane so it should be neutral for the test... |
| load_dll_offset(node, lmc, dll_offset_mode, 0, 8); |
| |
| change_dll_offset_enable(node, lmc, 1); |
| |
| // record start cycle CSRs here for utilization measure |
| start_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc)); |
| start_dram_ops[lmc] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc)); |
| } /* for (lmc = 0; lmc < num_lmcs; lmc++) */ |
| |
| bdk_watchdog_poke(); |
| |
| // run the test(s) |
| // only 1 call should be enough, let the bursts, etc, control the load... |
| run_dram_tuning_threads(node, num_lmcs, bytemask); |
| |
| for (lmc = 0; lmc < num_lmcs; lmc++) { |
| // record stop cycle CSRs here for utilization measure |
| stop_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc)); |
| stop_dram_ops[lmc] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc)); |
| |
| // accumulate... |
| ops_sum[lmc] += stop_dram_ops[lmc] - start_dram_ops[lmc]; |
| dclk_sum[lmc] += stop_dram_dclk[lmc] - start_dram_dclk[lmc]; |
| |
| errors = test_dram_byte_lmc_errs[lmc]; |
| |
| // check errors by byte, but not ECC |
| for (byte = 0; byte < 8; ++byte) { |
| if (!(bytes_todo & (1 << byte))) // is this byte lane to be done |
| continue; // no |
| |
| byte_delay_windows[lmc][byte] <<= 1; // always put in a zero |
| if (errors & (1 << byte)) { // yes, an error in this byte lane |
| byte_delay_count[lmc][byte] = 0; // stop now always |
| } else { // no error in this byte lane |
| if (byte_delay_count[lmc][byte] == 0) { // first success, set run start |
| byte_delay_start[lmc][byte] = byte_offset; |
| } |
| byte_delay_count[lmc][byte] += incr_offset; // bump run length |
| |
| if (byte_delay_count[lmc][byte] > byte_delay_best_count[lmc][byte]) { |
| byte_delay_best_count[lmc][byte] = byte_delay_count[lmc][byte]; |
| byte_delay_best_start[lmc][byte] = byte_delay_start[lmc][byte]; |
| } |
| byte_delay_windows[lmc][byte] |= 1ULL; // for pass, put in a 1 |
| } |
| } /* for (byte = 0; byte < 8; ++byte) */ |
| |
| // only print when there are errors and verbose... |
| if (errors) { |
| debug_print("DLL %s Offset Test %3d: errors 0x%x\n", |
| mode_str, byte_offset, errors); |
| } |
| } /* for (lmc = 0; lmc < num_lmcs; lmc++) */ |
| |
    } /* for (byte_offset = start_offset; byte_offset <= end_offset; byte_offset += incr_offset) */
| |
| // done with testing, load up and/or print out the offsets we found... |
| |
| // only when margining... |
| if (!do_tune) { |
| printf(" \n"); |
| printf("-------------------------------------\n"); |
| #if 0 |
| uint32_t mts_speed = (libdram_get_freq_from_pll(node, 0) * 2) / 1000000; // FIXME: sample LMC0 |
| printf("N%d: Starting %s Timing Margining for %d MT/s.\n", node, mode_str, mts_speed); |
| #else |
| printf("N%d: Starting %s Timing Margining.\n", node, mode_str); |
| #endif |
| printf(" \n"); |
| } /* if (!do_tune) */ |
| |
| for (lmc = 0; lmc < num_lmcs; lmc++) { |
| #if 1 |
| // FIXME FIXME |
| // FIXME: this just makes ECC always show 0 |
| byte_delay_best_start[lmc][8] = start_offset; |
| byte_delay_best_count[lmc][8] = end_offset - start_offset + incr_offset; |
| #endif |
| |
| // disable offsets while we load... |
| change_dll_offset_enable(node, lmc, 0); |
| |
| // only when margining... |
| if (!do_tune) { |
| // print the heading |
| printf(" \n"); |
| printf("N%d.LMC%d: %s Timing Margin %s : ", node, lmc, mode_str, mode_blk); |
| printf(" ECC/8 "); |
| for (byte = 7; byte >= 0; byte--) { |
| printf(" Byte %d ", byte); |
| } |
| printf("\n"); |
| } /* if (!do_tune) */ |
| |
| // print and load the offset values |
| // print the windows bit arrays |
| // only when margining... |
| if (!do_tune) { |
| printf("N%d.LMC%d: DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk); |
| } else { |
| ddr_print("N%d.LMC%d: SW DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk); |
| } |
| for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order |
| |
| int count = byte_delay_best_count[lmc][byte]; |
| if (count == 0) |
| count = incr_offset; // should make non-tested ECC byte come out 0 |
| |
| byte_offset = byte_delay_best_start[lmc][byte] + |
| ((count - incr_offset) / 2); // adj by incr |
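            // Example: a best run starting at -30 with count 60 and
            // incr_offset 3 spans offsets -30..+27, so the midpoint chosen
            // here is -30 + (60 - 3)/2 = -30 + 28 = -2 (integer truncation).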
| |
| if (!do_tune) { // do counting and special flag if margining |
| int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) && |
| !is_low_risk_offset(speed_bin, byte_offset); |
| |
| printf("%10d%c", byte_offset, (will_need_review) ? '<' :' '); |
| |
| if (will_need_review) |
| needs_review_count++; |
| } else { // if just tuning, make the printout less lengthy |
| ddr_print("%5d ", byte_offset); |
| } |
| |
| // FIXME? should we be able to override this? |
| if (mode_is_read) // for READ offsets, always store what we found |
| load_dll_offset(node, lmc, dll_offset_mode, byte_offset, byte); |
| else // for WRITE offsets, always store 0 |
| load_dll_offset(node, lmc, dll_offset_mode, 0, byte); |
| |
| } |
| if (!do_tune) { |
| printf("\n"); |
| } else { |
| ddr_print("\n"); |
| } |
| |
| |
| // re-enable the offsets now that we are done loading |
| change_dll_offset_enable(node, lmc, 1); |
| |
| // only when margining... |
| if (!do_tune) { |
| // print the window sizes |
| printf("N%d.LMC%d: DLL %s Window Length %s : ", node, lmc, mode_str, mode_blk); |
| for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order |
| int count = byte_delay_best_count[lmc][byte]; |
| if (count == 0) |
| count = incr_offset; // should make non-tested ECC byte come out 0 |
| |
| // do this again since the "needs review" test is an AND... |
| byte_offset = byte_delay_best_start[lmc][byte] + |
| ((count - incr_offset) / 2); // adj by incr |
| |
| int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) && |
| !is_low_risk_offset(speed_bin, byte_offset); |
| |
| printf("%10d%c", count - incr_offset, (will_need_review) ? '<' :' '); |
| } |
| printf("\n"); |
| |
| // print the window extents |
| printf("N%d.LMC%d: DLL %s Window Bounds %s : ", node, lmc, mode_str, mode_blk); |
| for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order |
| int start = byte_delay_best_start[lmc][byte]; |
| int count = byte_delay_best_count[lmc][byte]; |
| if (count == 0) |
| count = incr_offset; // should make non-tested ECC byte come out 0 |
| printf(" %3d to%3d ", start, |
| start + count - incr_offset); |
| } |
| printf("\n"); |
| #if 0 |
| // FIXME: should have a way to force these out... |
| // print the windows bit arrays |
| printf("N%d.LMC%d: DLL %s Window Bitmap%s : ", node, lmc, mode_str, mode_blk); |
| for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order |
| printf("%010lx ", byte_delay_windows[lmc][byte]); |
| } |
| printf("\n"); |
| #endif |
| } /* if (!do_tune) */ |
| } /* for (lmc = 0; lmc < num_lmcs; lmc++) */ |
| |
| // only when margining... |
| if (!do_tune) { |
| // print the Summary line(s) here |
| printf(" \n"); |
| printf("N%d: %s Timing Margining Summary : %s ", node, mode_str, |
| (needs_review_count > 0) ? "Needs Review" : "Low Risk"); |
| if (needs_review_count > 0) |
| printf("(%d)", needs_review_count); |
| printf("\n"); |
| |
| // FIXME??? want to print here: "N0: %s Offsets have been applied already" |
| |
| printf("-------------------------------------\n"); |
| printf(" \n"); |
| } /* if (!do_tune) */ |
| |
| // FIXME: we probably want this only when doing verbose... |
| // finally, print the utilizations all together |
| for (lmc = 0; lmc < num_lmcs; lmc++) { |
| uint64_t percent_x10 = ops_sum[lmc] * 1000 / dclk_sum[lmc]; |
| /* FIXME(dhendrix): %lu --> %llu */ |
| ddr_print2("N%d.LMC%d: ops %llu, cycles %llu, used %llu.%llu%%\n", |
| node, lmc, ops_sum[lmc], dclk_sum[lmc], percent_x10 / 10, percent_x10 % 10); |
| } /* for (lmc = 0; lmc < num_lmcs; lmc++) */ |
| |
| // FIXME: only when verbose, or only when there are errors? |
| // run the test one last time |
| // print whether there are errors or not, but only when verbose... |
| bdk_watchdog_poke(); |
| debug_print("N%d: %s: Start running test one last time\n", node, __func__); |
| tot_errors = run_dram_tuning_threads(node, num_lmcs, bytemask); |
| debug_print("N%d: %s: Finished running test one last time\n", node, __func__); |
| if (tot_errors) |
| ddr_print2("%s Timing Final Test: errors 0x%x\n", mode_str, tot_errors); |
| |
    return (do_tune) ? tot_errors : (needs_review_count > 0);
| } |
| |
| #define USE_L2_WAYS_LIMIT 0 // non-zero to enable L2 ways limiting |
| |
| /* |
| * Automatically adjust the DLL offset for the data bytes |
| */ |
| int perform_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int do_tune) |
| { |
| int ddr_interface_64b; |
| int save_ecc_ena[4]; |
| bdk_lmcx_config_t lmc_config; |
| int lmc, num_lmcs = __bdk_dram_get_num_lmc(node); |
| const char *s; |
| #if USE_L2_WAYS_LIMIT |
| int ways, ways_print = 0; |
| #endif |
| #if 0 |
| int dram_tune_use_rodt = -1, save_rodt[4]; |
| bdk_lmcx_comp_ctl2_t comp_ctl2; |
| #endif |
| int loops = 1, loop; |
| uint64_t orig_coremask; |
| int errs = 0; |
| |
| // enable any non-running cores on this node |
| orig_coremask = bdk_get_running_coremask(node); |
| /* FIXME(dhendrix): %lx --> %llx */ |
| ddr_print4("N%d: %s: Starting cores (mask was 0x%llx)\n", |
| node, __func__, orig_coremask); |
| /* FIXME(dhendrix): don't call bdk_init_cores(). */ |
| // bdk_init_cores(node, ~0ULL & ~orig_coremask); |
| dram_tune_max_cores = bdk_get_num_running_cores(node); |
| |
| // but use only a certain number of cores, at most what is available |
| if ((s = getenv("ddr_tune_use_cores")) != NULL) { |
| dram_tune_use_cores = strtoul(s, NULL, 0); |
| if (dram_tune_use_cores <= 0) // allow 0 or negative to mean all |
| dram_tune_use_cores = dram_tune_max_cores; |
| } |
| if (dram_tune_use_cores > dram_tune_max_cores) |
| dram_tune_use_cores = dram_tune_max_cores; |
| |
| // see if we want to do the tuning more than once per LMC... |
| if ((s = getenv("ddr_tune_use_loops"))) { |
| loops = strtoul(s, NULL, 0); |
| } |
| |
| // see if we want to change the granularity of the byte_offset sampling |
| if ((s = getenv("ddr_tune_use_gran"))) { |
| dram_tune_use_gran = strtoul(s, NULL, 0); |
| } |
| |
| // allow override of the test repeats (bursts) per thread create |
| if ((s = getenv("ddr_tune_use_bursts")) != NULL) { |
| dram_tune_use_bursts = strtoul(s, NULL, 10); |
| } |
| |
| #if 0 |
| // allow override of Read ODT setting just during the tuning run(s) |
| if ((s = getenv("ddr_tune_use_rodt")) != NULL) { |
| int temp = strtoul(s, NULL, 10); |
| // validity check |
| if (temp >= 0 && temp <= 7) |
| dram_tune_use_rodt = temp; |
| } |
| #endif |
| |
| #if 0 |
| // allow override of the test pattern |
| // FIXME: a bit simplistic... |
| if ((s = getenv("ddr_tune_use_pattern")) != NULL) { |
| int patno = strtoul(s, NULL, 10); |
| if (patno == 2) |
| dram_tune_test_pattern = test_pattern_2; |
| else if (patno == 3) |
| dram_tune_test_pattern = test_pattern_3; |
| else // all other values use default |
| dram_tune_test_pattern = test_pattern_1; |
| } |
| #endif |
| |
| // allow override of the test mem_xor algorithm |
| if ((s = getenv("ddr_tune_use_xor2")) != NULL) { |
| dram_tune_use_xor2 = !!strtoul(s, NULL, 10); |
| } |
| |
| // print current working values |
| ddr_print2("N%d: Tuning will use %d cores of max %d cores, and use %d repeats.\n", |
| node, dram_tune_use_cores, dram_tune_max_cores, |
| dram_tune_use_bursts); |
| |
| #if USE_L2_WAYS_LIMIT |
| // see if L2 ways are limited |
| if ((s = lookup_env_parameter("limit_l2_ways")) != NULL) { |
| ways = strtoul(s, NULL, 10); |
| ways_print = 1; |
| } else { |
| ways = bdk_l2c_get_num_assoc(node); |
| } |
| #endif |
| |
| #if 0 |
| // if RODT is to be overridden during tuning, note change |
| if (dram_tune_use_rodt >= 0) { |
| ddr_print("N%d: using RODT %d for tuning.\n", |
| node, dram_tune_use_rodt); |
| } |
| #endif |
| |
| // FIXME? get flag from LMC0 only |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0)); |
| ddr_interface_64b = !lmc_config.s.mode32b; |
| |
| // do setup for each active LMC |
| debug_print("N%d: %s: starting LMCs setup.\n", node, __func__); |
| for (lmc = 0; lmc < num_lmcs; lmc++) { |
| |
| #if 0 |
| // if RODT change, save old and set new here... |
| if (dram_tune_use_rodt >= 0) { |
| comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc)); |
| save_rodt[lmc] = comp_ctl2.s.rodt_ctl; |
| comp_ctl2.s.rodt_ctl = dram_tune_use_rodt; |
| DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u); |
| BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc)); |
| } |
| #endif |
| /* Disable ECC for DRAM tests */ |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); |
| save_ecc_ena[lmc] = lmc_config.s.ecc_ena; |
| lmc_config.s.ecc_ena = 0; |
| DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u); |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); |
| |
| } /* for (lmc = 0; lmc < num_lmcs; lmc++) */ |
| |
| #if USE_L2_WAYS_LIMIT |
| /* Disable l2 sets for DRAM testing */ |
| limit_l2_ways(node, 0, ways_print); |
| #endif |
| |
| // testing is done on all LMCs simultaneously |
| // FIXME: for now, loop here to show what happens multiple times |
| for (loop = 0; loop < loops; loop++) { |
| /* Perform DLL offset tuning */ |
| errs = auto_set_dll_offset(node, dll_offset_mode, num_lmcs, ddr_interface_64b, do_tune); |
| } |
| |
| #if USE_L2_WAYS_LIMIT |
| /* Restore the l2 set configuration */ |
| limit_l2_ways(node, ways, ways_print); |
| #endif |
| |
| // perform cleanup on all active LMCs |
| debug_print("N%d: %s: starting LMCs cleanup.\n", node, __func__); |
| for (lmc = 0; lmc < num_lmcs; lmc++) { |
| |
| /* Restore ECC for DRAM tests */ |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); |
| lmc_config.s.ecc_ena = save_ecc_ena[lmc]; |
| DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u); |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); |
| #if 0 |
| // if RODT change, restore old here... |
| if (dram_tune_use_rodt >= 0) { |
| comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc)); |
| comp_ctl2.s.rodt_ctl = save_rodt[lmc]; |
| DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u); |
| BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc)); |
| } |
| #endif |
| // finally, see if there are any read offset overrides after tuning |
| // FIXME: provide a way to do write offsets also?? |
| if (dll_offset_mode == 2) { |
| for (int by = 0; by < 9; by++) { |
| if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) { |
| int dllro = strtoul(s, NULL, 10); |
| change_dll_offset_enable(node, lmc, 0); |
| load_dll_offset(node, lmc, /* read */2, dllro, by); |
| change_dll_offset_enable(node, lmc, 1); |
| } |
| } |
| } |
| } /* for (lmc = 0; lmc < num_lmcs; lmc++) */ |
| |
| // finish up... |
| |
| #if 0 |
| // if RODT was overridden during tuning, note restore |
| if (dram_tune_use_rodt >= 0) { |
| ddr_print("N%d: restoring RODT %d after tuning.\n", |
| node, save_rodt[0]); // FIXME? use LMC0 |
| } |
| #endif |
| |
| // put any cores on this node, that were not running at the start, back into reset |
| /* FIXME(dhendrix): don't reset cores... */ |
| // uint64_t reset_coremask = bdk_get_running_coremask(node) & ~orig_coremask; |
| uint64_t reset_coremask = 0; |
| if (reset_coremask) { |
| /* FIXME(dhendrix): %lx --> %llx */ |
| ddr_print4("N%d: %s: Stopping cores 0x%llx\n", node, __func__, |
| reset_coremask); |
| bdk_reset_cores(node, reset_coremask); |
| } else { |
| /* FIXME(dhendrix): %lx --> %llx */ |
| ddr_print4("N%d: %s: leaving cores set to 0x%llx\n", node, __func__, |
| orig_coremask); |
| } |
| |
| return errs; |
| |
| } /* perform_dll_offset_tuning */ |
| |
| ///////////////////////////////////////////////////////////////////////////////////////////// |
| |
| ///// HW-assist byte DLL offset tuning ////// |
| |
| #if 1 |
| // setup defaults for byte test pattern array |
| // take these first two from the HRM section 6.9.13 |
| static const uint64_t byte_pattern_0[] = { |
| 0xFFAAFFFFFF55FFFFULL, // GP0 |
| 0x55555555AAAAAAAAULL, // GP1 |
| 0xAA55AAAAULL, // GP2 |
| }; |
| static const uint64_t byte_pattern_1[] = { |
| 0xFBF7EFDFBF7FFEFDULL, // GP0 |
| 0x0F1E3C78F0E1C387ULL, // GP1 |
| 0xF0E1BF7FULL, // GP2 |
| }; |
| // this is from Andrew via LFSR with PRBS=0xFFFFAAAA |
| static const uint64_t byte_pattern_2[] = { |
| 0xEE55AADDEE55AADDULL, // GP0 |
| 0x55AADDEE55AADDEEULL, // GP1 |
| 0x55EEULL, // GP2 |
| }; |
| // this is from Mike via LFSR with PRBS=0x4A519909 |
| static const uint64_t byte_pattern_3[] = { |
| 0x0088CCEE0088CCEEULL, // GP0 |
| 0xBB552211BB552211ULL, // GP1 |
| 0xBB00ULL, // GP2 |
| }; |
| |
| static const uint64_t *byte_patterns[] = { |
| byte_pattern_0, byte_pattern_1, byte_pattern_2, byte_pattern_3 // FIXME: use all we have |
| }; |
| #define NUM_BYTE_PATTERNS ((int)(sizeof(byte_patterns)/sizeof(uint64_t *))) |
| |
#define DEFAULT_BYTE_BURSTS 32 // FIXME: this is what the longest test usually has
| int dram_tune_byte_bursts = DEFAULT_BYTE_BURSTS; |
| #endif |
| |
| static void |
| setup_hw_pattern(bdk_node_t node, int lmc, const uint64_t *pattern_p) |
| { |
| /* |
| 3) Setup GENERAL_PURPOSE[0-2] registers with the data pattern of choice. |
| a. GENERAL_PURPOSE0[DATA<63:0>] – sets the initial lower (rising edge) 64 bits of data. |
| b. GENERAL_PURPOSE1[DATA<63:0>] – sets the initial upper (falling edge) 64 bits of data. |
| c. GENERAL_PURPOSE2[DATA<15:0>] – sets the initial lower (rising edge <7:0>) and upper |
| (falling edge <15:8>) ECC data. |
| */ |
| DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE0(lmc), pattern_p[0]); |
| DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE1(lmc), pattern_p[1]); |
| DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE2(lmc), pattern_p[2]); |
| } |
| |
| #define DEFAULT_PRBS 0xFFFFAAAAUL /* FIXME: maybe try 0x4A519909UL */ |
| |
| static void |
| setup_lfsr_pattern(bdk_node_t node, int lmc, uint64_t data) |
| { |
| uint32_t prbs; |
| const char *s; |
| |
| if ((s = getenv("ddr_lfsr_prbs"))) { |
| prbs = strtoul(s, NULL, 0); |
| } else |
| prbs = DEFAULT_PRBS; // FIXME: from data arg? |
| |
| /* |
| 2) DBTRAIN_CTL[LFSR_PATTERN_SEL] = 1 |
| here data comes from the LFSR generating a PRBS pattern |
| CHAR_CTL.EN = 0 |
| CHAR_CTL.SEL = 0; // for PRBS |
| CHAR_CTL.DR = 1; |
| CHAR_CTL.PRBS = setup for whatever type of PRBS to send |
| CHAR_CTL.SKEW_ON = 1; |
| */ |
| BDK_CSR_INIT(char_ctl, node, BDK_LMCX_CHAR_CTL(lmc)); |
| char_ctl.s.en = 0; |
| char_ctl.s.sel = 0; |
| char_ctl.s.dr = 1; |
| char_ctl.s.prbs = prbs; |
| char_ctl.s.skew_on = 1; |
| DRAM_CSR_WRITE(node, BDK_LMCX_CHAR_CTL(lmc), char_ctl.u); |
| } |
| |
| /* FIXME(dhendrix): made static to avoid need for prototype */ |
| static int |
| choose_best_hw_patterns(bdk_node_t node, int lmc, int mode) |
| { |
| int new_mode = mode; |
| const char *s; |
| |
| switch (mode) { |
| case DBTRAIN_TEST: // always choose LFSR if chip supports it |
| if (! CAVIUM_IS_MODEL(CAVIUM_CN88XX)) { |
| int lfsr_enable = 1; |
| if ((s = getenv("ddr_allow_lfsr"))) { // override? |
| lfsr_enable = !!strtoul(s, NULL, 0); |
| } |
| if (lfsr_enable) |
| new_mode = DBTRAIN_LFSR; |
| } |
| break; |
| case DBTRAIN_DBI: // possibly can allow LFSR use? |
| break; |
| case DBTRAIN_LFSR: // forced already |
| if (CAVIUM_IS_MODEL(CAVIUM_CN88XX)) { |
| ddr_print("ERROR: illegal HW assist mode %d\n", mode); |
| new_mode = DBTRAIN_TEST; |
| } |
| break; |
| default: |
| ddr_print("ERROR: unknown HW assist mode %d\n", mode); |
| } |
| |
| if (new_mode != mode) |
| VB_PRT(VBL_DEV2, "choose_best_hw_patterns: changing mode %d to %d\n", mode, new_mode); |
| |
| return new_mode; |
| } |
| |
| int |
| run_best_hw_patterns(bdk_node_t node, int lmc, uint64_t phys_addr, |
| int mode, uint64_t *xor_data) |
| { |
| int pattern; |
| const uint64_t *pattern_p; |
| int errs, errors = 0; |
| |
| // FIXME? always choose LFSR if chip supports it??? |
| mode = choose_best_hw_patterns(node, lmc, mode); |
| |
| if (mode == DBTRAIN_LFSR) { |
| setup_lfsr_pattern(node, lmc, 0); |
| errors = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data); |
| VB_PRT(VBL_DEV2, "%s: LFSR at A:0x%012llx errors 0x%x\n", |
| __func__, phys_addr, errors); |
| } else { |
| for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) { |
| pattern_p = byte_patterns[pattern]; |
| setup_hw_pattern(node, lmc, pattern_p); |
| |
| errs = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data); |
| |
| VB_PRT(VBL_DEV2, "%s: PATTERN %d at A:0x%012llx errors 0x%x\n", |
| __func__, pattern, phys_addr, errs); |
| |
| errors |= errs; |
| } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */ |
| } |
| return errors; |
| } |
| |
| static void |
| hw_assist_test_dll_offset(bdk_node_t node, int dll_offset_mode, |
| int lmc, int bytelane) |
| { |
| int byte_offset, new_best_offset[9]; |
| int rank_delay_start[4][9]; |
| int rank_delay_count[4][9]; |
| int rank_delay_best_start[4][9]; |
| int rank_delay_best_count[4][9]; |
| int errors[4]; |
| int num_lmcs = __bdk_dram_get_num_lmc(node); |
| int rank_mask, rankx, active_ranks; |
| int pattern; |
| const uint64_t *pattern_p; |
| int byte; |
| const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write"; |
| int pat_best_offset[9]; |
| uint64_t phys_addr; |
| int pat_beg, pat_end; |
| int rank_beg, rank_end; |
| int byte_lo, byte_hi; |
| uint64_t hw_rank_offset; |
| // FIXME? always choose LFSR if chip supports it??? |
| int mode = choose_best_hw_patterns(node, lmc, DBTRAIN_TEST); |
| |
| if (bytelane == 0x0A) { // all bytelanes |
| byte_lo = 0; |
| byte_hi = 8; |
| } else { // just 1 |
| byte_lo = byte_hi = bytelane; |
| } |
| |
| BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(lmc)); |
| rank_mask = lmcx_config.s.init_status; |
| // this should be correct for 1 or 2 ranks, 1 or 2 DIMMs |
| hw_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2)); |
| |
| debug_print("N%d: %s: starting LMC%d with rank offset 0x%lx\n", |
| node, __func__, lmc, hw_rank_offset); |
| |
| // start of pattern loop |
| // we do the set of tests for each pattern supplied... |
| |
| memset(new_best_offset, 0, sizeof(new_best_offset)); |
| for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) { |
| |
| memset(pat_best_offset, 0, sizeof(pat_best_offset)); |
| |
| if (mode == DBTRAIN_TEST) { |
| pattern_p = byte_patterns[pattern]; |
| setup_hw_pattern(node, lmc, pattern_p); |
| } else { |
| setup_lfsr_pattern(node, lmc, 0); |
| } |
| |
| // now loop through all legal values for the DLL byte offset... |
| |
| #define BYTE_OFFSET_INCR 3 // FIXME: make this tunable? |
| |
| memset(rank_delay_count, 0, sizeof(rank_delay_count)); |
| memset(rank_delay_start, 0, sizeof(rank_delay_start)); |
| memset(rank_delay_best_count, 0, sizeof(rank_delay_best_count)); |
| memset(rank_delay_best_start, 0, sizeof(rank_delay_best_start)); |
| |
| for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) { |
| |
| // do the setup on the active LMC |
| // set the bytelanes DLL offsets |
| change_dll_offset_enable(node, lmc, 0); |
| load_dll_offset(node, lmc, dll_offset_mode, byte_offset, bytelane); // FIXME? bytelane? |
| change_dll_offset_enable(node, lmc, 1); |
| |
| bdk_watchdog_poke(); |
| |
| // run the test on each rank |
| // only 1 call per rank should be enough, let the bursts, loops, etc, control the load... |
| |
| active_ranks = 0; |
| |
| for (rankx = 0; rankx < 4; rankx++) { |
| if (!(rank_mask & (1 << rankx))) |
| continue; |
| |
| phys_addr = hw_rank_offset * active_ranks; |
| // FIXME: now done by test_dram_byte_hw() |
| //phys_addr |= (lmc << 7); |
| //phys_addr = bdk_numa_get_address(node, phys_addr); // map to node |
| |
| active_ranks++; |
| |
            // NOTE: the return value is now a bitmask of the erroring bytelanes...
| errors[rankx] = test_dram_byte_hw(node, lmc, phys_addr, mode, NULL); |
| |
| for (byte = byte_lo; byte <= byte_hi; byte++) { // do bytelane(s) |
| |
| // check errors |
| if (errors[rankx] & (1 << byte)) { // yes, an error in the byte lane in this rank |
| |
| ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: Address 0x%012llx errors 0x%x\n", |
| node, lmc, rankx, bytelane, mode_str, |
| byte_offset, phys_addr, errors[rankx]); |
| |
| if (rank_delay_count[rankx][byte] > 0) { // had started run |
| ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: stopping a run here\n", |
| node, lmc, rankx, bytelane, mode_str, byte_offset); |
| rank_delay_count[rankx][byte] = 0; // stop now |
| } |
| // FIXME: else had not started run - nothing else to do? |
| } else { // no error in the byte lane |
| if (rank_delay_count[rankx][byte] == 0) { // first success, set run start |
| ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: starting a run here\n", |
| node, lmc, rankx, bytelane, mode_str, byte_offset); |
| rank_delay_start[rankx][byte] = byte_offset; |
| } |
| rank_delay_count[rankx][byte] += BYTE_OFFSET_INCR; // bump run length |
| |
| // is this now the biggest window? |
| if (rank_delay_count[rankx][byte] > rank_delay_best_count[rankx][byte]) { |
| rank_delay_best_count[rankx][byte] = rank_delay_count[rankx][byte]; |
| rank_delay_best_start[rankx][byte] = rank_delay_start[rankx][byte]; |
| debug_print("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: updating best to %d/%d\n", |
| node, lmc, rankx, bytelane, mode_str, byte_offset, |
| rank_delay_best_start[rankx][byte], rank_delay_best_count[rankx][byte]); |
| } |
| } |
| } /* for (byte = byte_lo; byte <= byte_hi; byte++) */ |
| } /* for (rankx = 0; rankx < 4; rankx++) */ |
| |
| } /* for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) */ |
| |
        // now choose the best byte_offsets for this pattern according to the best windows of the tested ranks
        // calculate the offset as the midpoint of the common (intersected) rank window
| for (byte = byte_lo; byte <= byte_hi; byte++) { |
| |
| pat_beg = -999; |
| pat_end = 999; |
| |
| for (rankx = 0; rankx < 4; rankx++) { |
| if (!(rank_mask & (1 << rankx))) |
| continue; |
| |
| rank_beg = rank_delay_best_start[rankx][byte]; |
| pat_beg = max(pat_beg, rank_beg); |
| rank_end = rank_beg + rank_delay_best_count[rankx][byte] - BYTE_OFFSET_INCR; |
| pat_end = min(pat_end, rank_end); |
| |
| ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test: Rank Window %3d:%3d\n", |
| node, lmc, rankx, bytelane, mode_str, rank_beg, rank_end); |
| |
| } /* for (rankx = 0; rankx < 4; rankx++) */ |
| |
| pat_best_offset[byte] = (pat_end + pat_beg) / 2; |
| ddr_print4("N%d.LMC%d: Bytelane %d DLL %s Offset Test: Pattern %d Average %3d\n", |
| node, lmc, byte, mode_str, pattern, pat_best_offset[byte]); |
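
            // Example: ranks 0/1 with best windows [-30,+12] and [-21,+24]
            // intersect to pat_beg = -21, pat_end = +12, so the average
            // above is (12 + -21)/2 = -4 (C truncates toward zero), i.e.
            // the midpoint of the window that all tested ranks passed.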
| |
| #if 0 |
| // FIXME: next print the window counts |
| sprintf(sbuffer, "N%d.LMC%d Pattern %d: DLL %s Offset Count ", |
| node, lmc, pattern, mode_str); |
| printf("%-45s : ", sbuffer); |
| printf(" %3d", byte_delay_best_count); |
| printf("\n"); |
| #endif |
| |
| new_best_offset[byte] += pat_best_offset[byte]; // sum the pattern averages |
| } /* for (byte = byte_lo; byte <= byte_hi; byte++) */ |
| } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */ |
| // end of pattern loop |
| |
| ddr_print("N%d.LMC%d: HW DLL %s Offset Amount : ", |
| node, lmc, mode_str); |
| |
    for (byte = byte_hi; byte >= byte_lo; --byte) { // print in descending byte index order
| new_best_offset[byte] = divide_nint(new_best_offset[byte], NUM_BYTE_PATTERNS); // create the new average NINT |
| |
| // print the best offsets from all patterns |
| |
| if (bytelane == 0x0A) // print just the offset of all the bytes |
| ddr_print("%5d ", new_best_offset[byte]); |
| else |
| ddr_print("(byte %d) %5d ", byte, new_best_offset[byte]); |
| |
| |
| #if 1 |
| // done with testing, load up the best offsets we found... |
| change_dll_offset_enable(node, lmc, 0); // disable offsets while we load... |
| load_dll_offset(node, lmc, dll_offset_mode, new_best_offset[byte], byte); |
| change_dll_offset_enable(node, lmc, 1); // re-enable the offsets now that we are done loading |
| #endif |
| } /* for (byte = byte_hi; byte >= byte_lo; --byte) */ |
| |
| ddr_print("\n"); |
| |
| #if 0 |
| // run the test one last time |
| // print whether there are errors or not, but only when verbose... |
| tot_errors = run_test_dram_byte_threads(node, num_lmcs, bytemask); |
| printf("N%d.LMC%d: Bytelane %d DLL %s Offset Final Test: errors 0x%x\n", |
| node, lmc, bytelane, mode_str, tot_errors); |
| #endif |
| } |
| |
| /* |
| * Automatically adjust the DLL offset for the selected bytelane using hardware-assist |
| */ |
| int perform_HW_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int bytelane) |
| { |
| int save_ecc_ena[4]; |
| bdk_lmcx_config_t lmc_config; |
| int lmc, num_lmcs = __bdk_dram_get_num_lmc(node); |
| const char *s; |
| //bdk_lmcx_comp_ctl2_t comp_ctl2; |
| int loops = 1, loop; |
| |
| // see if we want to do the tuning more than once per LMC... |
| if ((s = getenv("ddr_tune_ecc_loops"))) { |
| loops = strtoul(s, NULL, 0); |
| } |
| |
| // allow override of the test repeats (bursts) |
| if ((s = getenv("ddr_tune_byte_bursts")) != NULL) { |
| dram_tune_byte_bursts = strtoul(s, NULL, 10); |
| } |
| |
| // print current working values |
| ddr_print2("N%d: H/W Tuning for bytelane %d will use %d loops, %d bursts, and %d patterns.\n", |
| node, bytelane, loops, dram_tune_byte_bursts, |
| NUM_BYTE_PATTERNS); |
| |
| // FIXME? get flag from LMC0 only |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0)); |
| |
| // do once for each active LMC |
| |
| for (lmc = 0; lmc < num_lmcs; lmc++) { |
| |
| ddr_print4("N%d: H/W Tuning: starting LMC%d bytelane %d tune.\n", node, lmc, bytelane); |
| |
| /* Enable ECC for the HW tests */ |
| // NOTE: we do enable ECC, but the HW tests used will not generate "visible" errors |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); |
| save_ecc_ena[lmc] = lmc_config.s.ecc_ena; |
| lmc_config.s.ecc_ena = 1; |
| DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u); |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); |
| |
| // testing is done on a single LMC at a time |
| // FIXME: for now, loop here to show what happens multiple times |
| for (loop = 0; loop < loops; loop++) { |
| /* Perform DLL offset tuning */ |
| //auto_set_dll_offset(node, 1 /* 1=write */, lmc, bytelane); |
| hw_assist_test_dll_offset(node, 2 /* 2=read */, lmc, bytelane); |
| } |
| |
| // perform cleanup on active LMC |
| ddr_print4("N%d: H/W Tuning: finishing LMC%d bytelane %d tune.\n", node, lmc, bytelane); |
| |
| /* Restore ECC for DRAM tests */ |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); |
| lmc_config.s.ecc_ena = save_ecc_ena[lmc]; |
| DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u); |
| lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); |
| |
| // finally, see if there are any read offset overrides after tuning |
| for (int by = 0; by < 9; by++) { |
| if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) { |
| int dllro = strtoul(s, NULL, 10); |
| change_dll_offset_enable(node, lmc, 0); |
| load_dll_offset(node, lmc, 2 /* 2=read */, dllro, by); |
| change_dll_offset_enable(node, lmc, 1); |
| } |
| } |
| |
| } /* for (lmc = 0; lmc < num_lmcs; lmc++) */ |
| |
| // finish up... |
| |
| return 0; |
| |
| } /* perform_HW_dll_offset_tuning */ |