/*
 * This file is part of the coreboot project.
 *
 * Copyright (C) 2017-2018 Arthur Heymans <arthur@aheymans.xyz>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */
16
17#include <arch/io.h>
18#include <console/console.h>
19#include <stdint.h>
20#include <string.h>
21#include <types.h>
22#include "x4x.h"
23#include "iomap.h"
24
25static void print_dll_setting(const struct dll_setting *dll_setting,
26 u8 default_verbose)
27{
28 u8 debug_level = default_verbose ? BIOS_DEBUG : RAM_DEBUG;
29
30 printk(debug_level, "%d.%d.%d.%d:%d.%d\n", dll_setting->coarse,
31 dll_setting->clk_delay, dll_setting->tap,
32 dll_setting->pi, dll_setting->db_en,
33 dll_setting->db_sel);
34}
35
36struct db_limit {
37 u8 tap0;
38 u8 tap1;
39 u8 pi0;
40 u8 pi1;
41};
42
43static void set_db(const struct sysinfo *s, struct dll_setting *dq_dqs_setting)
44{
45 struct db_limit limit;
46
47 switch (s->selected_timings.mem_clk) {
48 default:
49 case MEM_CLOCK_800MHz:
50 limit.tap0 = 3;
51 limit.tap1 = 10;
52 limit.pi0 = 2;
53 limit.pi1 = 3;
54 break;
55 case MEM_CLOCK_1066MHz:
56 limit.tap0 = 2;
57 limit.tap1 = 8;
58 limit.pi0 = 6;
59 limit.pi1 = 7;
60 break;
61 case MEM_CLOCK_1333MHz:
62 limit.tap0 = 3;
63 limit.tap1 = 11;
64 /* TO CHECK: Might be reverse since this makes little sense */
65 limit.pi0 = 6;
66 limit.pi1 = 4;
67 break;
68 }
69
70 if (dq_dqs_setting->tap < limit.tap0) {
71 dq_dqs_setting->db_en = 1;
72 dq_dqs_setting->db_sel = 1;
73 } else if ((dq_dqs_setting->tap == limit.tap0)
74 && (dq_dqs_setting->pi < limit.pi0)) {
75 dq_dqs_setting->db_en = 1;
76 dq_dqs_setting->db_sel = 1;
77 } else if (dq_dqs_setting->tap < limit.tap1) {
78 dq_dqs_setting->db_en = 0;
79 dq_dqs_setting->db_sel = 0;
80 } else if ((dq_dqs_setting->tap == limit.tap1)
81 && (dq_dqs_setting->pi < limit.pi1)) {
82 dq_dqs_setting->db_en = 0;
83 dq_dqs_setting->db_sel = 0;
84 } else {
85 dq_dqs_setting->db_en = 1;
86 dq_dqs_setting->db_sel = 0;
87 }
88}
89
90const static u8 max_tap[3] = {12, 10, 13};
91
92static int increment_dq_dqs(const struct sysinfo *s,
93 struct dll_setting *dq_dqs_setting)
94{
95 u8 max_tap_val = max_tap[s->selected_timings.mem_clk
96 - MEM_CLOCK_800MHz];
97
98 if (dq_dqs_setting->pi < 6) {
99 dq_dqs_setting->pi += 1;
100 } else if (dq_dqs_setting->tap < max_tap_val) {
101 dq_dqs_setting->pi = 0;
102 dq_dqs_setting->tap += 1;
103 } else if (dq_dqs_setting->clk_delay < 2) {
104 dq_dqs_setting->pi = 0;
105 dq_dqs_setting->tap = 0;
106 dq_dqs_setting->clk_delay += 1;
107 } else if (dq_dqs_setting->coarse < 1) {
108 dq_dqs_setting->pi = 0;
109 dq_dqs_setting->tap = 0;
110 dq_dqs_setting->clk_delay -= 1;
111 dq_dqs_setting->coarse += 1;
112 } else {
113 return CB_ERR;
114 }
115 set_db(s, dq_dqs_setting);
116 return CB_SUCCESS;
117}
118
119#define WT_PATTERN_SIZE 80
120
121static const u32 write_training_schedule[WT_PATTERN_SIZE] = {
122 0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
123 0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
124 0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
125 0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
126 0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
127 0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
128 0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
129 0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
130 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
131 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
132 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
133 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
134 0x03030303, 0x04040404, 0x09090909, 0x10101010,
135 0x21212121, 0x40404040, 0x81818181, 0x00000000,
136 0x03030303, 0x04040404, 0x09090909, 0x10101010,
137 0x21212121, 0x40404040, 0x81818181, 0x00000000,
138 0xfdfdfdfd, 0xfafafafa, 0xf7f7f7f7, 0xeeeeeeee,
139 0xdfdfdfdf, 0xbebebebe, 0x7f7f7f7f, 0xfefefefe,
140 0xfdfdfdfd, 0xfafafafa, 0xf7f7f7f7, 0xeeeeeeee,
141 0xdfdfdfdf, 0xbebebebe, 0x7f7f7f7f, 0xfefefefe,
142};
143
/*
 * Result a limit search is looking for. The numeric values matter: they are
 * compared directly against a single per-lane error bit of a test result.
 */
enum training_modes {
	SUCCEEDING,	/* 0: error bit clear */
	FAILING		/* 1: error bit set */
};
148
149static u8 test_dq_aligned(const struct sysinfo *s,
150 const u8 channel)
151{
152 u32 address;
153 int rank, lane;
154 u8 count, count1;
155 u8 data[8];
156 u8 lane_error = 0;
157
158 FOR_EACH_POPULATED_RANK_IN_CHANNEL(s->dimms, channel, rank) {
159 address = test_address(channel, rank);
160 for (count = 0; count < WT_PATTERN_SIZE; count++) {
161 for (count1 = 0; count1 < WT_PATTERN_SIZE; count1++) {
162 if ((count1 % 16) == 0)
163 MCHBAR32(0xf90) = 1;
164 const u32 pattern =
165 write_training_schedule[count1];
166 write32((u32 *)address + 8 * count1, pattern);
167 write32((u32 *)address + 8 * count1 + 4,
168 pattern);
169 }
170
171 const u32 good = write_training_schedule[count];
172 write32(&data[0], read32((u32 *)address + 8 * count));
173 write32(&data[4],
174 read32((u32 *)address + 8 * count + 4));
175 FOR_EACH_BYTELANE(lane) {
176 u8 expected = (good >> ((lane % 4) * 8)) & 0xff;
177 if (data[lane] != expected)
178 lane_error |= 1 << lane;
179 }
180 }
181 }
182 return lane_error;
183}
184
185#define CONSISTENCY 10
186
187/*
188 * This function finds either failing or succeeding writes by increasing DQ.
189 * When it has found a failing or succeeding setting it will increase DQ
190 * another 10 times to make sure the result is consistent.
191 * This is probably done because lanes cannot be trained independent from
192 * each other.
193 */
194static int find_dq_limit(const struct sysinfo *s, const u8 channel,
195 struct dll_setting dq_setting[TOTAL_BYTELANES],
196 u8 dq_lim[TOTAL_BYTELANES],
197 const enum training_modes expected_result)
198{
199 int status = CB_SUCCESS;
200 int lane;
201 u8 test_result;
202 u8 pass_count[TOTAL_BYTELANES];
203 u8 succes_mask = 0xff;
204
205 printk(RAM_DEBUG, "Looking for %s writes on channel %d\n",
206 expected_result == FAILING ? "failing" : "succeeding", channel);
207 memset(pass_count, 0, sizeof(pass_count));
208
209 while(succes_mask) {
210 test_result = test_dq_aligned(s, channel);
211 FOR_EACH_BYTELANE(lane) {
212 if (((test_result >> lane) & 1) != expected_result) {
213 status = increment_dq_dqs(s, &dq_setting[lane]);
214 dqset(channel, lane, &dq_setting[lane]);
215 dq_lim[lane]++;
216 } else if (pass_count[lane] < CONSISTENCY) {
217 status = increment_dq_dqs(s, &dq_setting[lane]);
218 dqset(channel, lane, &dq_setting[lane]);
219 dq_lim[lane]++;
220 pass_count[lane]++;
221 } else if (pass_count[lane] == CONSISTENCY) {
222 succes_mask &= ~(1 << lane);
223 }
224 if (status == CB_ERR) {
225 printk(BIOS_CRIT, "Could not find a case of %s "
226 "writes on CH%d, lane %d\n",
227 expected_result == FAILING ? "failing"
228 : "succeeding", channel, lane);
229 return CB_ERR;
230 }
231 }
232 }
233 return CB_SUCCESS;
234}
235
236/*
237 * This attempts to find the ideal delay for DQ to account for the skew between
238 * the DQ and the DQS signal.
239 * The training works this way:
240 * - start from the DQS delay values (DQ is always later than DQS)
241 * - increment the DQ delay until a succeeding write is found on all bytelayes,
242 * on all ranks on a channel and save these values
243 * - again increment the DQ delay until write start to fail on all bytelanes and
244 * save that value
245 * - use the mean between the saved succeeding and failing value
246 * - note: bytelanes cannot be trained independently, so the delays need to be
247 * adjusted and tested for all of them at the same time
248 */
249int do_write_training(struct sysinfo *s)
250{
251 int i;
252 u8 channel, lane;
253 u8 dq_lower[TOTAL_BYTELANES];
254 u8 dq_upper[TOTAL_BYTELANES];
255 struct dll_setting dq_setting[TOTAL_BYTELANES];
256 u8 dq_average;
257 u32 dq_absolute;
258
259 printk(BIOS_DEBUG, "Starting DQ write training\n");
260
261 FOR_EACH_POPULATED_CHANNEL(s->dimms, channel) {
262 printk(BIOS_DEBUG, "Doing DQ write training on CH%d\n", channel);
263
264 dq_average = 0;
265 dq_absolute = 0;
266 /* Start all lanes at DQS values */
267 FOR_EACH_BYTELANE(lane) {
268 dqset(channel, lane, &s->dqs_settings[channel][lane]);
269 s->dq_settings[channel][lane] = s->dqs_settings[channel][lane];
270 }
271 memset(dq_lower, 0, sizeof(dq_lower));
272 /* Start from DQS settings */
273 memcpy(dq_setting, s->dqs_settings[channel], sizeof(dq_setting));
274
275 if (find_dq_limit(s, channel, dq_setting, dq_lower,
276 SUCCEEDING)) {
277 printk(BIOS_CRIT,
278 "Could not find working lower limit DQ setting\n");
279 return CB_ERR;
280 }
281
282 memcpy(dq_upper, dq_lower, sizeof(dq_lower));
283
284 if (find_dq_limit(s, channel, dq_setting, dq_upper,
285 FAILING)) {
286 printk(BIOS_WARNING,
287 "Could not find failing upper limit DQ setting\n");
288 return CB_ERR;
289 }
290
291 FOR_EACH_BYTELANE(lane) {
292 dq_lower[lane] -= CONSISTENCY - 1;
293 dq_upper[lane] -= CONSISTENCY - 1;
294 u8 dq_center = (dq_upper[lane] + dq_lower[lane]) / 2;
295
296 printk(RAM_DEBUG, "Centered value for DQ DLL:"
297 " ch%d, lane %d, #steps = %d\n",
298 channel, lane, dq_center);
299 for (i = 0; i < dq_center; i++) {
300 /* Should never happen */
301 if (increment_dq_dqs(s, &s->dq_settings[channel][lane])
302 == CB_ERR)
303 printk(BIOS_ERR,
304 "Huh? write training overflowed!!\n");
305 }
306 }
307
308 /* Reset DQ DLL settings and increment with centered value*/
309 printk(BIOS_DEBUG, "Final DQ timings on CH%d\n", channel);
310 FOR_EACH_BYTELANE(lane) {
311 printk(BIOS_DEBUG, "\tlane%d: ", lane);
312 print_dll_setting(&s->dq_settings[channel][lane], 1);
313 dqset(channel, lane, &s->dq_settings[channel][lane]);
314 }
315 }
316 printk(BIOS_DEBUG, "Done DQ write training\n");
317 return CB_SUCCESS;
318}
319
320#define RT_PATTERN_SIZE 40
321
322static const u32 read_training_schedule[RT_PATTERN_SIZE] = {
323 0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
324 0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
325 0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
326 0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
327 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
328 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
329 0x03030303, 0x04040404, 0x09090909, 0x10101010,
330 0x21212121, 0x40404040, 0x81818181, 0x00000000,
331 0xfdfdfdfd, 0xfafafafa, 0xf7f7f7f7, 0xeeeeeeee,
332 0xdfdfdfdf, 0xbebebebe, 0x7f7f7f7f, 0xfefefefe
333};
334
335static int rt_increment_dqs(struct rt_dqs_setting *setting)
336{
337 if (setting->pi < 7) {
338 setting->pi++;
339 } else if (setting->tap < 14) {
340 setting->pi = 0;
341 setting->tap++;
342 } else {
343 return CB_ERR;
344 }
345 return CB_SUCCESS;
346}
347
348static u8 test_dqs_aligned(const struct sysinfo *s, const u8 channel)
349{
350 int i, rank, lane;
351 volatile u8 data[8];
352 u32 address;
353 u8 bytelane_error = 0;
354
355 FOR_EACH_POPULATED_RANK_IN_CHANNEL(s->dimms, channel, rank) {
356 address = test_address(channel, rank);
357 for (i = 0; i < RT_PATTERN_SIZE; i++) {
358 const u32 good = read_training_schedule[i];
359 write32(&data[0], read32((u32 *)address + i * 8));
360 write32(&data[4], read32((u32 *)address + i * 8 + 4));
361
362 FOR_EACH_BYTELANE(lane) {
363 if (data[lane] != (good & 0xff))
364 bytelane_error |= 1 << lane;
365 }
366 }
367 }
368 return bytelane_error;
369}
370
371static int rt_find_dqs_limit(struct sysinfo *s, u8 channel,
372 struct rt_dqs_setting dqs_setting[TOTAL_BYTELANES],
373 u8 dqs_lim[TOTAL_BYTELANES],
374 const enum training_modes expected_result)
375{
376 int lane;
377 u8 test_result;
378 int status = CB_SUCCESS;
379
380 FOR_EACH_BYTELANE(lane)
381 rt_set_dqs(channel, lane, 0, &dqs_setting[lane]);
382
383 while(status == CB_SUCCESS) {
384 test_result = test_dqs_aligned(s, channel);
385 if (test_result == (expected_result == SUCCEEDING ? 0 : 0xff))
386 return CB_SUCCESS;
387 FOR_EACH_BYTELANE(lane) {
388 if (((test_result >> lane) & 1) != expected_result) {
389 status = rt_increment_dqs(&dqs_setting[lane]);
390 dqs_lim[lane]++;
391 rt_set_dqs(channel, lane, 0, &dqs_setting[lane]);
392 }
393 }
394 }
395
396 if (expected_result == SUCCEEDING) {
397 printk(BIOS_CRIT,
398 "Could not find RT DQS setting\n");
399 return CB_ERR;
400 } else {
401 printk(RAM_DEBUG,
402 "Read succeeded over all DQS"
403 " settings, continuing\n");
404 return CB_SUCCESS;
405 }
406}
407
408#define RT_LOOPS 3
409
410/*
411 * This attempts to find the ideal delay for DQS on reads (rx).
412 * The training works this way:
413 * - start from the lowest possible delay (0) on all bytelanes
414 * - increment the DQS rx delays until a succeeding write is found on all
415 * bytelayes, on all ranks on a channel and save these values
416 * - again increment the DQS rx delay until write start to fail on all bytelanes
417 * and save that value
418 * - use the mean between the saved succeeding and failing value
419 * - note0: bytelanes cannot be trained independently, so the delays need to be
420 * adjusted and tested for all of them at the same time
421 * - note1: this memory controller appears to have per rank registers for these
422 * DQS rx delays, but only the one rank 0 seems to be used for all of them
423 */
424int do_read_training(struct sysinfo *s)
425{
426 int loop, channel, i, lane, rank;
427 u32 address, content;
428 u8 dqs_lower[TOTAL_BYTELANES];
429 u8 dqs_upper[TOTAL_BYTELANES];
430 struct rt_dqs_setting dqs_setting[TOTAL_BYTELANES];
431 u16 saved_dqs_center[TOTAL_CHANNELS][TOTAL_BYTELANES];
432
433 memset(saved_dqs_center, 0 , sizeof(saved_dqs_center));
434
435 printk(BIOS_DEBUG, "Starting DQS read training\n");
436
437 for (loop = 0; loop < RT_LOOPS; loop++) {
438 FOR_EACH_POPULATED_CHANNEL(s->dimms, channel) {
439 printk(RAM_DEBUG, "Doing DQS read training on CH%d\n",
440 channel);
441
442 /* Write pattern to strobe address */
443 FOR_EACH_POPULATED_RANK_IN_CHANNEL(s->dimms, channel, rank) {
444 address = test_address(channel, rank);
445 for (i = 0; i < RT_PATTERN_SIZE; i++) {
446 content = read_training_schedule[i];
447 write32((u32 *)address + 8 * i, content);
448 write32((u32 *)address + 8 * i + 4, content);
449 }
450 }
451
452 memset(dqs_lower, 0, sizeof(dqs_lower));
453 memset(&dqs_setting, 0, sizeof(dqs_setting));
454 if (rt_find_dqs_limit(s, channel, dqs_setting, dqs_lower,
455 SUCCEEDING)) {
456 printk(BIOS_CRIT,
457 "Could not find working lower limit DQS setting\n");
458 return CB_ERR;
459 }
460
461 FOR_EACH_BYTELANE(lane)
462 dqs_upper[lane] = dqs_lower[lane];
463
464 if (rt_find_dqs_limit(s, channel, dqs_setting, dqs_upper,
465 FAILING)) {
466 printk(BIOS_CRIT,
467 "Could not find failing upper limit DQ setting\n");
468 return CB_ERR;
469 }
470
471 printk(RAM_DEBUG, "Centered values, loop %d:\n", loop);
472 FOR_EACH_BYTELANE(lane) {
473 u8 center = (dqs_lower[lane] + dqs_upper[lane]) / 2;
474 printk(RAM_DEBUG, "\t lane%d: #%d\n", lane, center);
475 saved_dqs_center[channel][lane] += center;
476 }
477 } /* END FOR_EACH_POPULATED_CHANNEL */
478 } /* end RT_LOOPS */
479
480 memset(s->rt_dqs, 0, sizeof(s->rt_dqs));
481
482 FOR_EACH_POPULATED_CHANNEL(s->dimms, channel) {
483 printk(BIOS_DEBUG, "Final timings on CH%d:\n", channel);
484 FOR_EACH_BYTELANE(lane) {
485 saved_dqs_center[channel][lane] /= RT_LOOPS;
486 while (saved_dqs_center[channel][lane]--) {
487 if(rt_increment_dqs(&s->rt_dqs[channel][lane])
488 == CB_ERR)
489 /* Should never happen */
490 printk(BIOS_ERR,
491 "Huh? read training overflowed!!\n");
492 }
493 FOR_EACH_POPULATED_RANK_IN_CHANNEL(s->dimms, channel, rank)
494 rt_set_dqs(channel, lane, rank,
495 &s->rt_dqs[channel][lane]);
496 printk(BIOS_DEBUG, "\tlane%d: %d.%d\n",
497 lane, s->rt_dqs[channel][lane].tap,
498 s->rt_dqs[channel][lane].pi);
499 }
500 }
501 printk(BIOS_DEBUG, "Done DQS read training\n");
502 return CB_SUCCESS;
503}