Test-Retest Reliability

Simulating consistency, practice effects, and what correlation does (and doesn’t) tell you

Published March 25, 2026

What is Test-Retest Reliability?

Test-retest reliability refers to the consistency of a measure when the same individuals are tested more than once under identical (or near-identical) conditions. It answers a deceptively simple question:

If I measure the same thing twice, do I get the same answer?

A high test-retest correlation tells you the instrument is consistent — not that the construct itself is stable, or that the scores are accurate. This distinction matters more than it might seem, and we’ll return to it at the end.


Interactive Sandbox

Use the sliders below to explore how sample size, true correlation, and practice effects interact. All plots and statistics update live in your browser — no server needed.

Scatter plot

Legend
  • 🔵 Blue filled circles — original retest scores
  • 🔴 Red open circles — retest + practice effect (visible when practice effect ≠ 0)
  • Dashed line — perfect agreement (test = retest)

Statistics summary

Things to try with the sliders
  • Drag n from 10 → 300 — watch the sample r stabilise toward the true ρ. Small samples are noisy!
  • Set ρ = 0, practice effect = 0 — the t-test should be non-significant and r ≈ 0.
  • Set practice effect = 5 — the mean difference jumps, the t-test turns significant, but r doesn’t move at all.
  • Change the seed — same parameters, different sample. How much does r jump around at n = 20 vs n = 200?
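The sandbox itself runs in JavaScript, but the simulation behind the sliders can be sketched in a few lines. The following Python sketch (assuming bivariate-normal scores with mean 100 and SD 15 — illustrative choices, not necessarily the sandbox's exact defaults) mirrors the same logic: draw correlated test/retest pairs, add a constant practice effect, then compute the sample \(r\) and a paired t-test.

```python
import numpy as np
from scipy import stats

def simulate_retest(n=50, rho=0.8, practice=0.0, seed=1,
                    mean=100.0, sd=15.0):
    """Draw n (test, retest) pairs with true correlation rho,
    then add a constant practice effect to the retest scores."""
    rng = np.random.default_rng(seed)
    cov = (sd ** 2) * np.array([[1.0, rho], [rho, 1.0]])
    test, retest = rng.multivariate_normal([mean, mean], cov, size=n).T
    retest = retest + practice          # simulate a practice effect
    r, _ = stats.pearsonr(test, retest)
    t, p = stats.ttest_rel(test, retest)
    return r, t, p

# Same pattern as the slider experiment: with a practice effect of 5,
# the t-test fires while r stays close to the true rho.
r, t, p = simulate_retest(n=200, rho=0.8, practice=5.0, seed=42)
```

Re-running this with different seeds at `n=20` versus `n=200` reproduces the "r jumps around" behaviour from the last bullet above.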

Explanation

Why does correlation ignore practice effects?

Pearson’s \(r\) measures the linear relationship between deviations from the mean. When you add a constant \(\Delta\) to every retest score:

\[r_{X,\, Y+\Delta} = \frac{\text{Cov}(X,\, Y+\Delta)}{\text{SD}(X)\cdot\text{SD}(Y+\Delta)} = \frac{\text{Cov}(X,Y)}{\text{SD}(X)\cdot\text{SD}(Y)} = r_{X,Y}\]

The constant cancels out entirely. The paired t-test, by contrast, works on the raw differences \(d_i = X_i - Y_i\), so it picks up the shift immediately.
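The identity is easy to confirm numerically. This small sketch (with arbitrary illustrative parameters) checks that adding a constant \(\Delta\) leaves both the covariance and the SD untouched, while the paired differences shift by exactly \(-\Delta\):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(100, 15, size=500)          # test scores
y = x + rng.normal(0, 8, size=500)         # retest scores
delta = 5.0                                # constant practice effect

cov_plain = np.cov(x, y)[0, 1]
cov_shift = np.cov(x, y + delta)[0, 1]     # Cov(X, Y + Δ) = Cov(X, Y)
sd_plain = np.std(y, ddof=1)
sd_shift = np.std(y + delta, ddof=1)       # SD(Y + Δ) = SD(Y)

d_plain = x - y
d_shift = x - (y + delta)                  # differences move by exactly -Δ
```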

This is why reliability researchers often complement \(r\) with:

  • Bland-Altman plots — visualise agreement and systematic bias simultaneously
  • Intraclass Correlation Coefficient (ICC) — penalises both poor correlation and mean-level differences
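To see the ICC's penalty in action, here is a hand-rolled sketch of ICC(A,1) — two-way random effects, absolute agreement, single measurement, i.e. Shrout–Fleiss ICC(2,1) — built from the standard ANOVA mean squares rather than any library function. The data-generating parameters are illustrative. A constant shift leaves Pearson's \(r\) untouched but drags the ICC down:

```python
import numpy as np

def icc_a1(x, y):
    """ICC(A,1): two-way random effects, absolute agreement,
    single measurement (Shrout & Fleiss ICC(2,1))."""
    data = np.column_stack([x, y])
    n, k = data.shape
    grand = data.mean()
    row_means = data.mean(axis=1)           # one mean per subject
    col_means = data.mean(axis=0)           # one mean per occasion
    msr = k * np.sum((row_means - grand) ** 2) / (n - 1)
    msc = n * np.sum((col_means - grand) ** 2) / (k - 1)
    sse = np.sum((data - row_means[:, None]
                  - col_means[None, :] + grand) ** 2)
    mse = sse / ((n - 1) * (k - 1))
    return (msr - mse) / (msr + (k - 1) * mse + k * (msc - mse) / n)

rng = np.random.default_rng(1)
test = rng.normal(100, 15, size=200)
retest_clean = test + rng.normal(0, 5, size=200)
retest_shift = retest_clean + 10.0          # same scores + practice effect

r = np.corrcoef(test, retest_shift)[0, 1]   # identical to r without the shift
icc = icc_a1(test, retest_shift)            # lower than without the shift
```

The Bland-Altman bias is even simpler to recover from the same data: `(retest_shift - test).mean()` estimates the systematic shift directly.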

Key Takeaway

Don’t confuse reliability with stability

A high test-retest correlation tells you that individuals who score high on the first occasion tend to score high on the second — it reflects consistency of rank ordering.

It does not tell you:

  • That the absolute scores are accurate
  • That the construct hasn’t changed over time
  • That systematic biases (like a practice effect) are absent

For a fuller picture, pair the correlation with a check on mean-level differences — which is exactly what Bland-Altman plots and ICC are designed for.


Session Info

sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Ventura 13.0

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Rome
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] htmlwidgets_1.6.4 compiler_4.5.2    fastmap_1.2.0     cli_3.6.5        
 [5] tools_4.5.2       htmltools_0.5.9   otel_0.2.0        rstudioapi_0.18.0
 [9] yaml_2.3.12       rmarkdown_2.30    knitr_1.51        jsonlite_2.0.0   
[13] xfun_0.56         digest_0.6.39     rlang_1.1.7       evaluate_1.0.5