From b8e45d261519ddeffbff19488b588a3c7180a073 Mon Sep 17 00:00:00 2001
From: thomassargent30 <ts43@nyu.edu>
Date: Fri, 8 May 2026 21:54:07 -0400
Subject: [PATCH 01/25] =?UTF-8?q?Tom's=C2=A0May=208=20edits=20of=20new=20l?=
 =?UTF-8?q?ecture?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lectures/_static/quant-econ.bib   | 131 +++++
 lectures/_toc.yml                 |   1 +
 lectures/rational_learning_re.bib | 160 ++++++
 lectures/rational_learning_re.md  | 915 ++++++++++++++++++++++++++++++
 4 files changed, 1207 insertions(+)
 create mode 100644 lectures/rational_learning_re.bib
 create mode 100644 lectures/rational_learning_re.md

diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 2701fe981..2dbdd0e89 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -3859,3 +3859,134 @@ @article{grossman1976
   pages   = {573--585},
   year    = {1976}
 }
+
+@incollection{BrayKreps1987,
+  author    = {Bray, Margaret M. and Kreps, David M.},
+  title     = {Rational Learning and Rational Expectations},
+  booktitle = {Arrow and the Ascent of Modern Economic Theory},
+  editor    = {Feiwel, George R.},
+  publisher = {New York University Press},
+  address   = {New York},
+  year      = {1987},
+  pages     = {597--625}
+}
+
+@article{Bray1982,
+  author  = {Bray, Margaret M.},
+  title   = {Learning, Estimation, and the Stability of Rational Expectations},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {26},
+  number  = {2},
+  pages   = {318--339}
+}
+
+@article{BraySavin1984,
+  author  = {Bray, Margaret M. and Savin, N. E.},
+  title   = {Rational Expectations Equilibria, Learning and Model Specification},
+  journal = {Econometrica},
+  year    = {1986},
+  volume  = {54},
+  number  = {5},
+  pages   = {1129--1160}
+}
+
+@article{Radner1979,
+  author  = {Radner, Roy},
+  title   = {Rational Expectations Equilibrium: Generic Existence and the Information Revealed by Prices},
+  journal = {Econometrica},
+  year    = {1979},
+  volume  = {47},
+  number  = {3},
+  pages   = {655--678}
+}
+
+@article{Jordan1982,
+  author  = {Jordan, James S.},
+  title   = {The Generic Existence of Rational Expectations Equilibrium in the Higher Dimensional Case},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {26},
+  number  = {2},
+  pages   = {224--243}
+}
+
+@article{Jordan1982b,
+  author  = {Jordan, James S.},
+  title   = {Admissible Market Data Structures: A Complete Characterization},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {28},
+  number  = {1},
+  pages   = {19--31}
+}
+
+@article{Admati1985,
+  author  = {Admati, Anat R.},
+  title   = {A Noisy Rational Expectations Equilibrium for Multi-Asset Securities Markets},
+  journal = {Econometrica},
+  year    = {1985},
+  volume  = {53},
+  number  = {3},
+  pages   = {629--658}
+}
+
+@article{GrossmanStiglitz1980,
+  author  = {Grossman, Sanford J. and Stiglitz, Joseph E.},
+  title   = {On the Impossibility of Informationally Efficient Markets},
+  journal = {American Economic Review},
+  year    = {1980},
+  volume  = {70},
+  number  = {3},
+  pages   = {393--408}
+}
+
+@article{BlumeEasley1982,
+  author  = {Blume, Lawrence E. and Easley, David},
+  title   = {Learning to be Rational},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {26},
+  number  = {2},
+  pages   = {340--351}
+}
+
+@article{Frydman1982,
+  author  = {Frydman, Roman},
+  title   = {Towards an Understanding of Market Processes: Individual Expectations, Learning, and Convergence to Rational Expectations Equilibrium},
+  journal = {American Economic Review},
+  year    = {1982},
+  volume  = {72},
+  number  = {4},
+  pages   = {652--668}
+}
+
+@article{Lewis1981,
+  author  = {Grossman, Sanford J.},
+  title   = {An Introduction to the Theory of Rational Expectations under Asymmetric Information},
+  journal = {Review of Economic Studies},
+  year    = {1981},
+  volume  = {48},
+  number  = {4},
+  pages   = {541--559}
+}
+
+@article{Townsend1983b,
+  author  = {Townsend, Robert M.},
+  title   = {Forecasting the Forecasts of Others},
+  journal = {Journal of Political Economy},
+  year    = {1983},
+  volume  = {91},
+  number  = {4},
+  pages   = {546--588}
+}
+
+@article{Kobayashi1977,
+  author  = {Kreps, David M.},
+  title   = {A Note on Fulfilled Expectations Equilibria},
+  journal = {Journal of Economic Theory},
+  year    = {1977},
+  volume  = {14},
+  number  = {1},
+  pages   = {32--43}
+}
diff --git a/lectures/_toc.yml b/lectures/_toc.yml
index 078ba02e6..7341e6771 100644
--- a/lectures/_toc.yml
+++ b/lectures/_toc.yml
@@ -141,6 +141,7 @@ parts:
   - file: ge_arrow
   - file: harrison_kreps
   - file: morris_learn
+  - file: rational_learning_re
   - file: affine_risk_prices
   - file: ross_recovery
   - file: misspecified_recovery
diff --git a/lectures/rational_learning_re.bib b/lectures/rational_learning_re.bib
new file mode 100644
index 000000000..585c27c1b
--- /dev/null
+++ b/lectures/rational_learning_re.bib
@@ -0,0 +1,160 @@
+% BibTeX references for rational_learning_re.md
+% Standalone copy of the references; most entries are also added to _static/quant-econ.bib
+
+@incollection{BrayKreps1987,
+  author    = {Bray, Margaret M. and Kreps, David M.},
+  title     = {Rational Learning and Rational Expectations},
+  booktitle = {Arrow and the Ascent of Modern Economic Theory},
+  editor    = {Feiwel, George R.},
+  publisher = {New York University Press},
+  address   = {New York},
+  year      = {1987},
+  pages     = {597--625},
+  note      = {Chapter 19}
+}
+
+@article{Bray1982,
+  author  = {Bray, Margaret M.},
+  title   = {Learning, Estimation, and the Stability of Rational Expectations},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {26},
+  number  = {2},
+  pages   = {318--339},
+  doi     = {10.1016/0022-0531(82)90028-X}
+}
+
+@article{BraySavin1984,
+  author  = {Bray, Margaret M. and Savin, N. E.},
+  title   = {Rational Expectations Equilibria, Learning and Model Specification},
+  journal = {Econometrica},
+  year    = {1986},
+  volume  = {54},
+  number  = {5},
+  pages   = {1129--1160},
+  doi     = {10.2307/1912325}
+}
+
+@article{Radner1979,
+  author  = {Radner, Roy},
+  title   = {Rational Expectations Equilibrium: Generic Existence and the Information Revealed by Prices},
+  journal = {Econometrica},
+  year    = {1979},
+  volume  = {47},
+  number  = {3},
+  pages   = {655--678},
+  doi     = {10.2307/1910414}
+}
+
+@article{Jordan1982,
+  author  = {Jordan, James S.},
+  title   = {The Generic Existence of Rational Expectations Equilibrium in the Higher Dimensional Case},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {26},
+  number  = {2},
+  pages   = {224--243},
+  doi     = {10.1016/0022-0531(82)90021-7}
+}
+
+@article{Jordan1982b,
+  author  = {Jordan, James S.},
+  title   = {Admissible Market Data Structures: A Complete Characterization},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {28},
+  number  = {1},
+  pages   = {19--31},
+  doi     = {10.1016/0022-0531(82)90089-8}
+}
+
+@article{Admati1985,
+  author  = {Admati, Anat R.},
+  title   = {A Noisy Rational Expectations Equilibrium for Multi-Asset Securities Markets},
+  journal = {Econometrica},
+  year    = {1985},
+  volume  = {53},
+  number  = {3},
+  pages   = {629--658},
+  doi     = {10.2307/1911659}
+}
+
+@article{GrossmanStiglitz1980,
+  author  = {Grossman, Sanford J. and Stiglitz, Joseph E.},
+  title   = {On the Impossibility of Informationally Efficient Markets},
+  journal = {American Economic Review},
+  year    = {1980},
+  volume  = {70},
+  number  = {3},
+  pages   = {393--408}
+}
+
+@unpublished{GrossmanSonnenschein1982,
+  author  = {Grossman, Sanford J. and Sonnenschein, Hugo},
+  title   = {Notes on Expectations Equilibria in Bayesian Settings},
+  note    = {Working Paper},
+  year    = {1982}
+}
+
+@article{BlumeEasley1982,
+  author  = {Blume, Lawrence E. and Easley, David},
+  title   = {Learning to be Rational},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {26},
+  number  = {2},
+  pages   = {340--351},
+  doi     = {10.1016/0022-0531(82)90022-9}
+}
+
+@article{Frydman1982,
+  author  = {Frydman, Roman},
+  title   = {Towards an Understanding of Market Processes: Individual Expectations, Learning, and Convergence to Rational Expectations Equilibrium},
+  journal = {American Economic Review},
+  year    = {1982},
+  volume  = {72},
+  number  = {4},
+  pages   = {652--668}
+}
+
+@article{Lewis1981,
+  author  = {Grossman, Sanford J.},
+  title   = {An Introduction to the Theory of Rational Expectations under Asymmetric Information},
+  journal = {Review of Economic Studies},
+  year    = {1981},
+  volume  = {48},
+  number  = {4},
+  pages   = {541--559},
+  doi     = {10.2307/2297169}
+}
+
+@article{Townsend1983,
+  author  = {Townsend, Robert M.},
+  title   = {Forecasting the Forecasts of Others},
+  journal = {Journal of Political Economy},
+  year    = {1983},
+  volume  = {91},
+  number  = {4},
+  pages   = {546--588},
+  doi     = {10.1086/261170}
+}
+
+@techreport{ArrowGreen1973,
+  author  = {Arrow, Kenneth J. and Green, Jerry R.},
+  title   = {Notes on Expectations Equilibria in Bayesian Settings},
+  type    = {Working Paper},
+  year    = {1973},
+  number  = {33},
+  institution = {Institute for Mathematical Studies in the Social Sciences, Stanford University}
+}
+
+@article{Kobayashi1977,
+  author  = {Kreps, David M.},
+  title   = {A Note on Fulfilled Expectations Equilibria},
+  journal = {Journal of Economic Theory},
+  year    = {1977},
+  volume  = {14},
+  number  = {1},
+  pages   = {32--43},
+  doi     = {10.1016/0022-0531(77)90098-0}
+}
diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
new file mode 100644
index 000000000..8f3cba541
--- /dev/null
+++ b/lectures/rational_learning_re.md
@@ -0,0 +1,915 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.17.1
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+(rational_learning_re)=
+```{raw} jupyter
+<div id="qe-notebook-header" align="right" style="text-align:right;">
+        <a href="https://quantecon.org/" title="quantecon.org">
+                <img style="width:250px;display:inline;" width="250px" src="https://assets.quantecon.org/img/qe-menubar-logo.svg" alt="QuantEcon">
+        </a>
+</div>
+```
+
+# Rational Learning and Rational Expectations
+
+```{contents} Contents
+:depth: 2
+```
+
+## Overview
+
+This lecture explores a classic question in economic theory: can agents **learn** their way to a rational expectations equilibrium?
+
+{cite}`BrayKreps1987` examine this question in a rigorously specified model.
+
+In a rational expectations equilibrium, agents use market prices to make inferences about other agents' private information.
+Each agent knows the **statistical relationship** between prices and the underlying payoff-relevant variables — and that relationship is **correct** given the equilibrium.
+
+But this raises a deep question: where does that knowledge come from?
+
+The **rational learning** approach studied by Bray and Kreps asks whether agents who start with uncertainty about the equilibrium price function can, over time, learn it from observations of past prices.
+
+The key findings are:
+
+* In a benchmark example, a rational (Bayesian) uninformed agent **does learn** the equilibrium price function as data accumulate.
+* The beliefs of the uninformed agent converge (weakly) to a point mass at the true equilibrium parameter.
+* In more general economies, this convergence can fail — especially when **multiple equilibria** exist or when the uninformed agent's model is **misspecified**.
+
+This lecture presents the Bray–Kreps framework, works through their benchmark example in detail, and provides Python code to simulate Bayesian learning dynamics.
+
+```{note}
+This lecture draws on {cite}`BrayKreps1987`, Chapter 19 in *Arrow and the Ascent of Modern Economic Theory* (1987), which synthesizes earlier work by {cite}`Bray1982`, {cite}`BraySavin1984`, and the rational expectations literature of {cite}`Radner1979`, {cite}`grossman1976`, and {cite}`Jordan1982`.
+```
+
+Let's start with the necessary imports.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import norm
+from scipy.integrate import quad
+import warnings
+warnings.filterwarnings('ignore')
+```
+
+## The Economy
+
+### Agents and Assets
+
+The economy has two types of agents and two assets:
+
+* A **safe asset** with net return normalized to zero.
+* A **risky asset** traded at date $t$ at spot price $p_t$.
+
+At each date $t = 0, 1, 2, \ldots$ the risky asset yields a gross return $r_t$ that is drawn IID from a distribution with mean zero and variance $\sigma^2$.
+
+There are two agents:
+
+* **Agent $I$ (Informed)**: observes $r_t$ before trading at date $t$.
+* **Agent $U$ (Uninformed)**: cannot observe $r_t$ directly, but can observe the price $p_t$.
+
+### Preferences
+
+Both agents have von Neumann–Morgenstern utility with coefficient of absolute risk tolerance equal to $2$.
+Agent $n \in \{I, U\}$ chooses holdings $x^n$ of the risky asset to maximize
+
+$$
+-e^{-\frac{1}{2}(r_{t+1} + b \cdot y^n_t) x^n}
+$$
+
+where $y^n_t$ is agent $n$'s information at date $t$, and $b$ is agent $n$'s holding of the safe asset (treated as a budget constraint).
+
+At date $t$:
+* Agent $I$ observes $(r_t, p_t)$, so $y^I_t = (r_t, p_t)$.
+* Agent $U$ observes $p_t$ only, so $y^U_t = p_t$.
+
+The total supply of the risky asset is normalized to $2$, so market clearing requires $x^I_t + x^U_t = 2$.
+
+### Rational Expectations Equilibrium
+
+Following {cite}`Radner1979`, a **rational expectations equilibrium** (REE) is a price function $p(\cdot)$ such that:
+
+1. Each agent maximizes expected utility given their information (which includes $p_t$).
+2. Markets clear at each date.
+3. The price function is **correct** in the sense that agents' beliefs about the relationship between $p_t$ and $r_t$ are consistent with the actual relationship generated by the equilibrium.
+
+In this environment, {cite}`grossman1976` showed that under certain conditions a **fully revealing** equilibrium exists in which $p_t$ perfectly reveals $r_t$ to the uninformed agent.
+
+The unique rational expectations equilibrium has the linear price function
+
+$$
+p_t = a + b r_t
+$$ (eq:req_price)
+
+where the coefficients $a$ and $b$ are determined by market clearing and the agents' optimization.
+
+**Equilibrium coefficient values** (derived from market clearing with risk tolerance $= 2$ and supply normalized to $2$):
+
+$$
+a = 0, \qquad b = 1
+$$
+
+so that $p_t = r_t$ — the price fully reveals the fundamental.
+
+More generally, with parameters $(\theta^I, \theta^U)$ denoting risk tolerances and $\sigma^2$ the variance of $r_t$:
+
+$$
+b = \frac{\theta^I + \theta^U}{\sigma^2 (\theta^U)^{-1} + \sigma^2 (\theta^I)^{-1}} = \frac{2\sigma^2}{\sigma^2} = 2
+$$
+
+Bray and Kreps work with a parametrization in which $b$ takes the value
+
+$$
+b = \frac{\theta^U(a_{t-1} + b_{t-1} p_{t-1} - p_{t-1})}{\sigma^2}
+$$
+
+and for concreteness set $\theta^I + \theta^U = \sigma^2 = 1$ so that the equilibrium value is $b^* = 2\sigma^2/(\theta^I + \theta^U) = 2$.
+
+For the numerical example below we follow Bray–Kreps directly and use:
+
+$$
+p_t = a + b r_t, \quad \text{with } a = 0, \; b^* = 2
+$$ (eq:bk_price)
+
+## The Learning Model
+
+### Setup
+
+Agent $U$ **does not know** the equilibrium price function.
+Specifically, $U$ does not know $b^*$.
+
+However, $U$ does know:
+* The distribution of $r_t$: $r_t \sim \mathcal{N}(0, \sigma^2)$ IID.
+* That the price function is **linear**: $p_t = a + b r_t$ for some unknown $b$.
+* The value of $a = 0$.
+
+So $U$'s task is to learn the single parameter $b$ from observations of prices and (eventually) returns.
+
+### Observing the Signal
+
+At date $t$, agent $U$ observes $p_t$.
+The signal $U$ extracts is the return implied by the price:
+
+$$
+\hat{r}_t = \frac{p_t}{b_{t-1}}
+$$
+
+where $b_{t-1}$ is $U$'s current estimate of $b^*$.
+
+After date $t$ trading and before date $t+1$, $U$ observes $r_t$ (the actual return is revealed, say through dividend payments).
+
+### Bayesian Updating
+
+Agent $U$ begins with a **prior** distribution on $b$:
+
+$$
+b \sim \mathcal{N}(\mu_0, v_0)
+$$
+
+Given the data $(r_1, p_1), \ldots, (r_t, p_t)$ observed through date $t$, agent $U$'s posterior on $b$ is
+
+$$
+b \mid \text{data} \sim \mathcal{N}(\mu_t, v_t)
+$$
+
+The posterior is updated using Bayes' rule.
+Since $p_t = b \cdot r_t$ (with $a = 0$), each pair $(r_s, p_s)$ provides the observation $p_s = b \cdot r_s$, i.e., a noisy linear measurement of $b$.
+
+For a Gaussian prior and Gaussian likelihood, the posterior updates as:
+
+$$
+v_t^{-1} = v_0^{-1} + \frac{1}{\sigma^2} \sum_{s=1}^{t} r_s^2
+$$ (eq:posterior_precision)
+
+$$
+\mu_t = v_t \left( v_0^{-1} \mu_0 + \frac{1}{\sigma^2} \sum_{s=1}^{t} r_s p_s \right)
+$$ (eq:posterior_mean)
+
+```{note}
+Equations {eq}`eq:posterior_precision` and {eq}`eq:posterior_mean` follow from the standard Gaussian linear regression posterior.
+Each observation $(r_s, p_s)$ with $p_s = b r_s + 0$ is treated as a noisy signal of $b$ with signal-to-noise ratio $r_s^2 / \sigma^2$.
+```
+
+### The Key Convergence Result
+
+{cite}`BrayKreps1987` prove the following in their Proposition 1:
+
+**Proposition (Bray–Kreps):** *For any prior $(\mu_0, v_0)$ with $0 < v_0 < \infty$, as $t \to \infty$:*
+
+$$
+\mu_t \xrightarrow{a.s.} b^*, \qquad v_t \xrightarrow{a.s.} 0
+$$
+
+*That is, agent $U$'s posterior distribution on $b$ converges almost surely to a point mass at the true equilibrium value $b^*$.*
+
+The intuition is straightforward:
+
+* Each period adds a new observation $(r_t, p_t)$ with information content proportional to $r_t^2$.
+* Since $r_t$ is IID with $E[r_t^2] = \sigma^2 > 0$, the cumulative information $\sum_{s=1}^t r_s^2 \to \infty$ by the law of large numbers.
+* Therefore the posterior precision $v_t^{-1} \to \infty$, which means $v_t \to 0$.
+* Since the observations are generated by the true $b^*$, the posterior mean $\mu_t$ converges to $b^*$.
+
+The proof follows from standard results on Bayesian consistency for correctly specified models.
+
+## Simulating Bayesian Learning
+
+We now implement the Bayesian learning dynamics and verify convergence numerically.
+
+### Parameters
+
+```{code-cell} ipython3
+# True equilibrium parameters
+a_true = 0.0
+b_true = 2.0        # true b* in the REE
+
+# Distribution of fundamentals
+sigma2 = 1.0        # variance of r_t
+
+# Prior on b
+mu_0  = 0.5         # prior mean (deliberately far from the true value 2.0)
+v_0   = 2.0         # prior variance (diffuse)
+
+# Simulation settings
+T     = 300         # time periods
+N     = 200         # number of Monte Carlo paths
+
+np.random.seed(42)  # seed the global NumPy RNG so the draws are reproducible
+```
+
+### Bayesian Updating Function
+
+```{code-cell} ipython3
+def simulate_bayesian_learning(b_true, sigma2, mu_0, v_0, T, N):
+    """
+    Simulate Bayesian learning of the REE slope parameter b*.
+
+    Parameters
+    ----------
+    b_true  : true equilibrium slope
+    sigma2  : variance of fundamentals r_t; also the noise variance in the update
+    mu_0    : prior mean on b
+    v_0     : prior variance on b
+    T       : number of time periods
+    N       : number of Monte Carlo paths
+
+    Returns
+    -------
+    mu_paths : array (N, T); [i, t] is the posterior mean after observation t
+    v_paths  : array (N, T); [i, t] is the posterior variance after observation t
+    """
+    # Draw fundamentals r_t for all paths (uses the global NumPy RNG state)
+    r = np.random.normal(0, np.sqrt(sigma2), size=(N, T))
+
+    # Equilibrium prices: p_t = b_true * r_t
+    p = b_true * r
+
+    # Arrays to store posterior parameters
+    mu_paths = np.empty((N, T))
+    v_paths  = np.empty((N, T))
+
+    for i in range(N):
+        # Initialize the running precision and precision-weighted sum from the prior
+        precision = 1.0 / v_0
+        weighted_sum = mu_0 / v_0
+
+        for t in range(T):
+            # Each observation: p_s = b * r_s  =>  b = p_s / r_s (when r_s != 0)
+            # Likelihood contribution: precision += r_s^2 / sigma2
+            #                          weighted_sum += r_s * p_s / sigma2
+            precision    += r[i, t]**2 / sigma2
+            weighted_sum += r[i, t] * p[i, t] / sigma2
+
+            v_t  = 1.0 / precision
+            mu_t = v_t * weighted_sum
+
+            mu_paths[i, t] = mu_t
+            v_paths[i, t]  = v_t
+
+    return mu_paths, v_paths
+```
+
+### Running the Simulation
+
+```{code-cell} ipython3
+mu_paths, v_paths = simulate_bayesian_learning(
+    b_true, sigma2, mu_0, v_0, T, N
+)
+```
+
+### Plotting Results
+
+```{code-cell} ipython3
+fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+
+t_range = np.arange(1, T + 1)
+
+# --- Left panel: posterior means ---
+ax = axes[0]
+for i in range(min(30, N)):
+    ax.plot(t_range, mu_paths[i, :], color='steelblue', alpha=0.25, lw=0.8)
+
+ax.plot(t_range, np.mean(mu_paths, axis=0), color='navy', lw=2,
+        label='cross-path average')
+ax.axhline(b_true, color='red', ls='--', lw=1.5, label=f'$b^* = {b_true}$')
+ax.axhline(mu_0,   color='gray', ls=':',  lw=1.2, label=f'prior mean $= {mu_0}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('posterior mean $\\mu_t$')
+ax.set_title('Posterior Mean Converges to $b^*$')
+ax.legend()
+
+# --- Right panel: posterior variances ---
+ax = axes[1]
+for i in range(min(30, N)):
+    ax.plot(t_range, v_paths[i, :], color='darkorange', alpha=0.25, lw=0.8)
+
+ax.plot(t_range, np.mean(v_paths, axis=0), color='saddlebrown', lw=2,
+        label='cross-path average')
+
+# Theoretical rate: v_t ≈ sigma2 / (t * sigma2) = 1/t  for large t
+ax.plot(t_range, 1.0 / t_range, color='black', ls='--', lw=1.5,
+        label='$1/t$ (theory)')
+ax.set_xlabel('$t$')
+ax.set_ylabel('posterior variance $v_t$')
+ax.set_title('Posterior Variance Shrinks to 0')
+ax.legend()
+
+plt.tight_layout()
+plt.show()
+```
+
+The left panel shows that, even though the prior mean $\mu_0 = 0.5$ is far from the truth, agent $U$'s posterior mean converges to the true equilibrium value $b^* = 2$.
+The right panel confirms that the posterior variance vanishes at rate $1/t$, consistent with the formula in {eq}`eq:posterior_precision`.
+
+## The Demand and Equilibrium
+
+To connect the learning story to market equilibrium, we can track how agent $U$'s **equilibrium demand** for the risky asset evolves.
+
+Given $U$'s current beliefs about $b$ (summarized by $\mu_t$), $U$ estimates $r_t \approx p_t / \mu_t$ and formulates demand:
+
+$$
+x^U_t(\mu_t) = \frac{\theta^U}{\sigma^2} \cdot \left(\frac{p_t}{\mu_t} - p_t\right)
+$$
+
+As $\mu_t \to b^*$, this demand function converges to the demand implied by the rational expectations equilibrium.
+
+The following code computes the demand trajectories.
+
+```{code-cell} ipython3
+def compute_demand(mu_t, p_t, sigma2=1.0, theta_U=0.5):
+    """
+    Compute agent U's demand for the risky asset given beliefs mu_t.
+
+    x^U = (theta_U / sigma2) * (r_hat - p_t)
+    where r_hat = p_t / mu_t is U's signal extraction.
+    """
+    r_hat = p_t / mu_t
+    return (theta_U / sigma2) * (r_hat - p_t)
+
+# Single representative path
+i_rep = 0
+r_rep = np.random.normal(0, np.sqrt(sigma2), T)
+p_rep = b_true * r_rep
+
+demand_path = np.array([
+    compute_demand(mu_paths[i_rep, t], p_rep[t])
+    for t in range(T)
+])
+
+# REE demand (what U would demand knowing b*)
+demand_ree = np.array([
+    compute_demand(b_true, p_rep[t])
+    for t in range(T)
+])
+
+fig, ax = plt.subplots(figsize=(10, 5))
+ax.plot(t_range, demand_path, color='steelblue', alpha=0.7,
+        label="$x^U_t$ (learning)")
+ax.plot(t_range, demand_ree, color='red', ls='--', lw=1.5,
+        label="$x^U_t$ (REE)")
+ax.set_xlabel('$t$')
+ax.set_ylabel("agent $U$'s demand $x^U_t$")
+ax.set_title("Demand Converges to REE Demand as $\\mu_t \\to b^*$")
+ax.legend()
+plt.tight_layout()
+plt.show()
+```
+
+## When Does Learning Fail?
+
+The convergence result above relies on several assumptions that may fail in richer environments.
+
+{cite}`BrayKreps1987` identify several obstacles:
+
+### 1. Multiple Equilibria
+
+When there are multiple rational expectations equilibria, the uninformed agent's beliefs may converge to the **wrong** equilibrium — one that is not the equilibrium that actually prevails.
+
+In the example with two potential equilibrium parameters $b_1^*$ and $b_2^*$, the agent's posterior mean can converge to either one depending on the history.
+
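+The following code illustrates this case: for each path nature draws the true slope from $\{b_1^*, b_2^*\}$ with equal probability, and $U$ starts from a diffuse Gaussian prior centered between the two values.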
+
+```{code-cell} ipython3
+def simulate_two_equilibria(b_values, sigma2, T, N, seed=0):
+    """
+    Simulate learning when each path's true slope is drawn at random from
+    b_values and U starts from a prior centered at the mean of b_values.
+    """
+    rng = np.random.default_rng(seed)
+    b_true_draw = rng.choice(b_values, size=N)
+
+    mu_paths_all = np.empty((N, T))
+
+    for i in range(N):
+        b_i = b_true_draw[i]
+        r = rng.normal(0, np.sqrt(sigma2), T)
+        p = b_i * r
+
+        # Diffuse prior (variance 4) centered between the two equilibria
+        mu_prior    = np.mean(b_values)
+        prec_prior  = 1.0 / 4.0
+        w_sum       = mu_prior * prec_prior
+        prec        = prec_prior
+
+        for t in range(T):
+            prec  += r[t]**2 / sigma2
+            w_sum += r[t] * p[t] / sigma2
+            mu_paths_all[i, t] = w_sum / prec
+
+    return mu_paths_all, b_true_draw  # (N, T) posterior means, (N,) true slopes
+
+b_values = [1.0, 3.0]
+mu_two, b_drawn = simulate_two_equilibria(b_values, sigma2=1.0, T=200, N=300)
+
+fig, ax = plt.subplots(figsize=(10, 5))
+
+colors = {b_values[0]: 'steelblue', b_values[1]: 'darkorange'}
+for i in range(len(b_drawn)):
+    c = colors[b_drawn[i]]
+    ax.plot(np.arange(1, 201), mu_two[i, :], color=c, alpha=0.15, lw=0.6)
+
+for bv, c in colors.items():
+    ax.axhline(bv, color=c, ls='--', lw=2, label=f'$b^* = {bv}$')
+
+ax.set_xlabel('$t$')
+ax.set_ylabel('posterior mean $\\mu_t$')
+ax.set_title('Learning with Two Possible Equilibria\n'
+             '(blue paths: true $b^*=1$; orange paths: true $b^*=3$)')
+ax.legend()
+plt.tight_layout()
+plt.show()
+```
+
+As expected, agent $U$ learns the **correct** equilibrium as long as the model is correctly specified and the true equilibrium generates the data.
+
+The more subtle failure mode — identified by Bray and Kreps — arises when agents' learning rules themselves **change the equilibrium**, creating a feedback loop that may or may not converge.
+
+### 2. Self-Referential Learning Dynamics
+
+In the fully general setting, the price at date $t$ depends on $U$'s current beliefs $\mu_t$.
+But $\mu_t$ is updated based on past prices.
+This creates a **self-referential** system: beliefs drive prices, and prices update beliefs.
+
+{cite}`BrayKreps1987` show (their Proposition 2 and Section 5) that this feedback can lead to **non-stationary** dynamics and that convergence to the rational expectations equilibrium requires additional conditions — essentially that the economy "settles down" to a stationary relationship before agents learn the parameters of that relationship.
+
+The following code illustrates these self-referential dynamics in a stylized reduced-form specification.
+
+```{code-cell} ipython3
+def simulate_self_referential(b_true, sigma2, mu_0, v_0, T, N,
+                              alpha_demand=0.5):
+    """
+    Simulate self-referential learning: the date-t price depends on the
+    belief mu_t that U holds *before* the date-t observation is used.
+
+    p_t = b_true * r_t + alpha_demand * (mu_t - b_true) * r_t
+
+    Returns (mu_paths_sr, p_paths_sr), each of shape (N, T): the posterior
+    mean after each update and the realized price at each date.
+    """
+    rng = np.random.default_rng(10)  # fixed seed, independent of the global RNG
+    r_all = rng.normal(0, np.sqrt(sigma2), (N, T))
+
+    mu_paths_sr = np.empty((N, T))
+    p_paths_sr  = np.empty((N, T))
+
+    for i in range(N):
+        prec  = 1.0 / v_0
+        w_sum = mu_0 / v_0
+        mu_t  = mu_0
+
+        for t in range(T):
+            r_t = r_all[i, t]
+            # Price is partly driven by current (pre-update) beliefs
+            p_t = b_true * r_t + alpha_demand * (mu_t - b_true) * r_t
+
+            # Update beliefs with this price
+            prec  += r_t**2 / sigma2
+            w_sum += r_t * p_t / sigma2
+            mu_t   = w_sum / prec
+
+            mu_paths_sr[i, t] = mu_t
+            p_paths_sr[i, t]  = p_t
+
+    return mu_paths_sr, p_paths_sr
+
+mu_sr, p_sr = simulate_self_referential(
+    b_true, sigma2, mu_0, v_0, T=200, N=100, alpha_demand=0.3
+)
+
+fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+
+ax = axes[0]
+for i in range(30):
+    ax.plot(np.arange(1, 201), mu_sr[i, :], color='steelblue', alpha=0.3, lw=0.8)
+ax.plot(np.arange(1, 201), np.mean(mu_sr, axis=0), color='navy', lw=2,
+        label='average $\\mu_t$')
+ax.axhline(b_true, color='red', ls='--', lw=1.5, label=f'$b^* = {b_true}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\mu_t$')
+ax.set_title('Self-Referential Learning: Posterior Means')
+ax.legend()
+
+ax = axes[1]
+for i in range(30):
+    ax.plot(np.arange(1, 201), p_sr[i, :], color='darkorange', alpha=0.2, lw=0.6)
+ax.plot(np.arange(1, 201), np.mean(np.abs(p_sr), axis=0), color='saddlebrown', lw=2,
+        label='average $|p_t|$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$p_t$')
+ax.set_title('Self-Referential Learning: Price Paths')
+ax.legend()
+
+plt.tight_layout()
+plt.show()
+```
+
+## Convergence of Beliefs: The General Result
+
+Section 3 of {cite}`BrayKreps1987` proves the following general convergence theorem.
+
+Let $\Omega$ be the set of states of nature, $G$ a probability measure, and $H_t$ the sigma-field generated by the history up to date $t$.
+
+Suppose agent $U$ maintains a prior $P$ over the probability space $(\Omega, G)$.
+
+**Proposition 2 (Bray–Kreps, Rational Learning of Beliefs):**
+
+*Under regularity conditions, for any event $A$:*
+
+$$
+P(A \mid H_t) \xrightarrow{a.s.} \mathbf{1}_A
+$$
+
+*That is, conditional probabilities converge almost surely to the truth.*
+
+This is the **Bayesian consistency** result: a rational Bayesian agent who assigns positive prior probability to the truth will eventually learn it.
+
+The key caveat: the agent must assign **positive prior probability** to the true data-generating process.
+If the agent's model is misspecified — if the true equilibrium is outside the support of the agent's prior — convergence to the truth is not guaranteed.
+
+A corollary to this general result is that for the specific model described above, the uninformed agent's posterior on $b$ converges to the truth as long as the prior assigns positive density to a neighborhood of $b^*$.
+
+## Convergence to a Stationary Rational Expectations Equilibrium
+
+Section 4 of {cite}`BrayKreps1987` specializes the convergence results to the context of rational expectations equilibria in markets.
+
+The main result (Proposition 3) states that even in large general-equilibrium economies with $N$ agents and $M$ assets, agents' beliefs converge weakly to a stationary rational expectations equilibrium — provided:
+
+1. Agents form **rational (Bayesian) forecasts** given their information.
+2. The equilibrium is **unique** (no multiplicity problem).
+3. The model is **correctly specified** in the sense that the true equilibrium lies in the support of agents' priors.
+
+The formal statement requires some notation.
+
+Let $\theta$ be the vector of unknown parameters of the economy (e.g., preferences, endowments), and let $\phi$ be the state space.
+Denote by $F_t(\cdot; \theta)$ the agents' conditional distribution function for $\theta$ at date $t$.
+
+**Theorem (Convergence to REE):**
+*If conditions (1)–(3) hold, then $F_t(\cdot; \theta)$ converges weakly (P–a.s.) to a point mass at the true $\theta^*$, and equilibrium prices and allocations converge to those of the REE.*
+
+The proof proceeds in four steps:
+
+* **Step 1A**: The conditional probability $P(A \mid H_t)$ forms a martingale with respect to $H_t$ (by the law of iterated expectations).
+* **Step 1B**: The martingale converges a.s. by Doob's martingale convergence theorem.
+* **Step 2**: The equilibrium price function — which maps $(p, \theta)$ space to prices — is continuous (under a linear model assumption).
+* **Step 3–4**: By combining Step 1 and Step 2, the joint distribution of prices and beliefs converges.
+
+## Obstacles to Convergence
+
+While the positive convergence results are elegant, {cite}`BrayKreps1987` are careful to document when learning **fails** to produce convergence to REE.
+
+### Obstacle 1: Multiple Equilibria
+
+When the economy admits multiple rational expectations equilibria, agents learning within one equilibrium may receive price signals that are informative about the *current* equilibrium but not necessarily about which equilibrium will prevail in the long run.
+
+A concrete example: suppose there are two spot market equilibria for some payoff-relevant variable $\theta$: one equilibrium at $\theta_1$ and another at $\theta_2$.
+The informed agents choose randomly among these each period (since they are indifferent).
+The uninformed agent's posterior mean can never converge to a single value — it will bounce between neighborhoods of $\theta_1$ and $\theta_2$.
+
+### Obstacle 2: Non-Stationarity of Beliefs
+
+Even if the economy has a unique REE, if agents' beliefs are updating over time, the **realized** price process is non-stationary.
+In that case, past data provides **biased** information about the future.
+
+This is a **philosophical problem** with the idea of learning in equilibrium: one cannot use data generated by a learning process (in which prices depend on beliefs that are changing) to learn the *stationary* equilibrium relationship.
+
+### Obstacle 3: Misspecified Models
+
+If $U$'s prior assigns zero probability to a neighborhood of $b^*$ — that is, if $U$'s model is misspecified — then convergence to $b^*$ is impossible, because Bayes' rule never moves posterior mass onto a set that has zero prior probability.
+
+{cite}`BrayKreps1987` note (p. 622) that this is a subtle but important caveat: convergence is guaranteed only when it is not the case that the "true $\theta$ may lie outside the set of states $\Omega$" to which the agent's prior assigns positive probability.
+
+## Exercises
+
+```{exercise}
+:label: rle_ex1
+
+**Posterior Precision Growth**
+
+In the Bayesian learning model above, the posterior precision is
+
+$$
+v_t^{-1} = v_0^{-1} + \frac{1}{\sigma^2} \sum_{s=1}^{t} r_s^2
+$$
+
+(a) Show that $v_t \to 0$ almost surely as $t \to \infty$, using the law of large numbers.
+
+(b) What is the approximate rate of decay of $v_t$? That is, what does $t \cdot v_t$ converge to?
+
+(c) Write Python code to verify your answer for $\sigma^2 = 1$ and a single simulated path of $T = 500$ periods.
+```
+
+```{solution-start} rle_ex1
+:class: dropdown
+```
+
+**(a)** By the strong law of large numbers, since $r_s \sim \mathcal{N}(0, \sigma^2)$ IID with $E[r_s^2] = \sigma^2$:
+
+$$
+\frac{1}{t} \sum_{s=1}^t r_s^2 \xrightarrow{a.s.} \sigma^2 > 0
+$$
+
+Therefore
+
+$$
+\frac{1}{t} v_t^{-1} = \frac{v_0^{-1}}{t} + \frac{1}{\sigma^2} \cdot \frac{1}{t} \sum_{s=1}^t r_s^2 \xrightarrow{a.s.} \sigma^2 / \sigma^2 = 1
+$$
+
+So $v_t^{-1} \sim t$ and $v_t \to 0$ almost surely.
+
+**(b)** From part (a), $v_t^{-1} / t \to 1$ almost surely, and therefore $t \cdot v_t = \left( v_t^{-1} / t \right)^{-1} \to 1$ almost surely.
+Notice that the limit does not depend on $\sigma^2$: the factor $1/\sigma^2$ in the precision recursion exactly offsets $E[r_s^2] = \sigma^2$, so that $v_t \approx 1/t$ for large $t$.
+
+So $t \cdot v_t \to 1$ for every $\sigma^2 > 0$, and in particular for $\sigma^2 = 1$.
+
+**(c)**
+
+```{code-cell} ipython3
+sigma2_ex = 1.0
+T_ex = 500
+v0_ex = 2.0
+mu0_ex = 0.0
+
+np.random.seed(7)
+r_ex = np.random.normal(0, np.sqrt(sigma2_ex), T_ex)
+
+precisions = np.empty(T_ex)
+prec = 1.0 / v0_ex
+for t in range(T_ex):
+    prec += r_ex[t]**2 / sigma2_ex
+    precisions[t] = prec
+
+v_t_ex = 1.0 / precisions
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+axes[0].plot(np.arange(1, T_ex + 1), v_t_ex, label='$v_t$')
+axes[0].plot(np.arange(1, T_ex + 1), 1.0 / np.arange(1, T_ex + 1),
+             '--', label='$1/t$')
+axes[0].set_xlabel('$t$'); axes[0].set_ylabel('$v_t$')
+axes[0].set_title('Posterior Variance Decay')
+axes[0].legend()
+
+axes[1].plot(np.arange(1, T_ex + 1),
+             np.arange(1, T_ex + 1) * v_t_ex, label='$t \\cdot v_t$')
+axes[1].axhline(1.0, color='red', ls='--', label='limit = 1')
+axes[1].set_xlabel('$t$'); axes[1].set_ylabel('$t \\cdot v_t$')
+axes[1].set_title('Normalized Variance Converges to 1')
+axes[1].legend()
+
+plt.tight_layout()
+plt.show()
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: rle_ex2
+
+**Effect of Prior Misspecification**
+
+Suppose agent $U$ starts with a prior mean $\mu_0$ far from the true value $b^* = 2$.
+
+(a) Simulate 100 paths of $T = 400$ periods for each of $\mu_0 \in \{-3, 0, 1, 3, 5\}$ and plot the average posterior mean across paths for each $\mu_0$.
+
+(b) Does the prior mean affect the **rate** at which the posterior mean converges to $b^*$?
+
+(c) Does the prior **variance** $v_0$ affect the rate? Verify by comparing $v_0 \in \{0.1, 1.0, 10.0\}$ with fixed $\mu_0 = 0$.
+```
+
+```{solution-start} rle_ex2
+:class: dropdown
+```
+
+```{code-cell} ipython3
+b_true_ex = 2.0
+sigma2_ex = 1.0
+T_ex = 400
+N_ex = 100
+t_range_ex = np.arange(1, T_ex + 1)
+
+# (a) and (b): different prior means
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+ax = axes[0]
+for mu0 in [-3, 0, 1, 3, 5]:
+    mu_p, _ = simulate_bayesian_learning(
+        b_true_ex, sigma2_ex, mu0, v_0=1.0, T=T_ex, N=N_ex
+    )
+    ax.plot(t_range_ex, np.mean(mu_p, axis=0), label=f'$\\mu_0 = {mu0}$')
+
+ax.axhline(b_true_ex, color='black', ls='--', lw=1.5, label=f'$b^* = {b_true_ex}$')
+ax.set_xlabel('$t$'); ax.set_ylabel('$E[\\mu_t]$')
+ax.set_title('Effect of Prior Mean on Convergence')
+ax.legend(fontsize=8)
+
+# (c): different prior variances
+ax = axes[1]
+for v0 in [0.1, 1.0, 10.0]:
+    mu_p, _ = simulate_bayesian_learning(
+        b_true_ex, sigma2_ex, mu_0=0.0, v_0=v0, T=T_ex, N=N_ex
+    )
+    ax.plot(t_range_ex, np.mean(mu_p, axis=0), label=f'$v_0 = {v0}$')
+
+ax.axhline(b_true_ex, color='black', ls='--', lw=1.5, label=f'$b^* = {b_true_ex}$')
+ax.set_xlabel('$t$'); ax.set_ylabel('$E[\\mu_t]$')
+ax.set_title('Effect of Prior Variance on Convergence')
+ax.legend()
+
+plt.tight_layout()
+plt.show()
+
+print("Observations:")
+print("(b) Prior mean affects the initial level but not the long-run rate.")
+print("    All paths converge to b* = 2 at the same asymptotic rate.")
+print("(c) A tighter prior (small v_0) slows initial adaptation but all")
+print("    converge; a diffuse prior adapts quickly early on.")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: rle_ex3
+
+**Convergence with Non-Standard Fundamentals**
+
+The convergence proof relies on $E[r_t^2] = \sigma^2 > 0$.
+
+(a) Suppose $r_t$ follows a **mixture distribution**: with probability $0.5$ it equals $0$, and with probability $0.5$ it is drawn from $\mathcal{N}(0, 2\sigma^2)$.
+Show that $E[r_t^2] = \sigma^2 > 0$ still holds, so convergence is guaranteed.
+
+(b) Simulate $T = 500$ periods with $\sigma^2 = 1$ and $b^* = 2$ using this mixture distribution for $r_t$.
+Plot the posterior mean and variance over time for 50 paths.
+
+(c) Compare the speed of convergence to the Gaussian case.
+Why does the mixture distribution slow convergence even though $E[r_t^2]$ is the same?
+```
+
+```{solution-start} rle_ex3
+:class: dropdown
+```
+
+**(a)** Let $Z \sim \mathcal{N}(0, 2\sigma^2)$.
+Then
+
+$$
+E[r_t^2] = 0.5 \cdot 0^2 + 0.5 \cdot E[Z^2] = 0.5 \cdot 2\sigma^2 = \sigma^2
+$$
+
+So $E[r_t^2] = \sigma^2 > 0$ and the strong law of large numbers guarantees $\sum_{s=1}^t r_s^2 / t \to \sigma^2$, ensuring convergence.
+
+**(b) and (c)**
+
+```{code-cell} ipython3
+def simulate_learning_mixture(b_true, sigma2, mu_0, v_0, T, N):
+    """
+    Simulate N paths of T-period Bayesian learning when p_t = b_true * r_t and r_t is a mixture:
+    r_t = 0 with prob 0.5, else N(0, 2*sigma2). Returns (mu_paths, v_paths), each of shape (N, T).
+    """
+    rng = np.random.default_rng(42)  # fixed seed: every call reproduces the same draws
+
+    mu_paths = np.empty((N, T))
+    v_paths  = np.empty((N, T))
+
+    for i in range(N):
+        prec  = 1.0 / v_0  # running precision, initialised at 1 / v_0
+        w_sum = mu_0 / v_0  # running precision-weighted sum, initialised at mu_0 / v_0
+
+        for t in range(T):
+            # Mixture draw: a uniform decides between the point mass at 0 and the normal component
+            if rng.random() < 0.5:
+                r_t = 0.0
+            else:
+                r_t = rng.normal(0, np.sqrt(2 * sigma2))
+
+            p_t = b_true * r_t  # p_t is exactly 0 whenever r_t = 0
+
+            prec  += r_t**2 / sigma2  # adds nothing in periods with r_t = 0
+            w_sum += r_t * p_t / sigma2
+
+            v_t   = 1.0 / prec
+            mu_t  = v_t * w_sum
+
+            mu_paths[i, t] = mu_t
+            v_paths[i, t]  = v_t
+
+    return mu_paths, v_paths
+
+sigma2_ex = 1.0
+T_ex = 500
+N_ex = 50
+
+# Gaussian case
+mu_gauss, v_gauss = simulate_bayesian_learning(
+    b_true=2.0, sigma2=sigma2_ex, mu_0=0.5, v_0=2.0, T=T_ex, N=N_ex
+)
+
+# Mixture case
+mu_mix, v_mix = simulate_learning_mixture(
+    b_true=2.0, sigma2=sigma2_ex, mu_0=0.5, v_0=2.0, T=T_ex, N=N_ex
+)
+
+t_range_ex = np.arange(1, T_ex + 1)
+
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+ax = axes[0]
+ax.plot(t_range_ex, np.mean(mu_gauss, axis=0), label='Gaussian $r_t$',
+        color='steelblue', lw=2)
+ax.plot(t_range_ex, np.mean(mu_mix,   axis=0), label='Mixture $r_t$',
+        color='darkorange', lw=2)
+ax.axhline(2.0, color='red', ls='--', lw=1.5, label='$b^* = 2$')
+ax.set_xlabel('$t$'); ax.set_ylabel('$E[\\mu_t]$')
+ax.set_title('Posterior Mean: Gaussian vs Mixture')
+ax.legend()
+
+ax = axes[1]
+ax.plot(t_range_ex, np.mean(v_gauss, axis=0), label='Gaussian $r_t$',
+        color='steelblue', lw=2)
+ax.plot(t_range_ex, np.mean(v_mix,   axis=0), label='Mixture $r_t$',
+        color='darkorange', lw=2)
+ax.set_xlabel('$t$'); ax.set_ylabel('$E[v_t]$')
+ax.set_title('Posterior Variance: Gaussian vs Mixture')
+ax.legend()
+
+plt.tight_layout()
+plt.show()
+
+print("The mixture distribution slows convergence because periods with r_t = 0")
+print("provide NO information about b* (the observation p_t = 0 is uninformative).")
+print("Even though E[r_t^2] = sigma^2, the variance of r_t^2 is larger under the")
+print("mixture, leading to noisier information accumulation.")
+```
+
+```{solution-end}
+```
+
+## Summary
+
+This lecture has covered the following key ideas from {cite}`BrayKreps1987`:
+
+1. **Rational expectations equilibria** require agents to know the statistical relationship between prices and fundamentals — but this knowledge is typically assumed, not derived.
+
+2. **Rational learning** asks whether Bayesian agents can *learn* the REE from data.
+   In a benchmark linear model, the answer is yes: the uninformed agent's posterior on the slope parameter $b^*$ converges almost surely to the truth.
+
+3. The convergence relies on **Bayesian consistency** — the uninformed agent accumulates sufficient information to identify $b^*$ from observed prices and returns.
+
+4. Convergence can **fail** when:
+   - There are **multiple equilibria** and agents' learning rules interact with equilibrium selection.
+   - The agent's **model is misspecified** (prior assigns zero weight to the truth).
+   - The learning process generates **non-stationary** prices that contaminate inference.
+
+5. A **general convergence theorem** guarantees that under correct specification and unique equilibria, Bayesian posteriors converge weakly to a point mass at the truth.
+
+The broader message of Bray and Kreps is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle — and the conditions under which learning succeeds are more restrictive than they might appear.

From bb0669a78158838afa6f4bc2aa2cef3ebcc27129 Mon Sep 17 00:00:00 2001
From: thomassargent30 <ts43@nyu.edu>
Date: Sat, 9 May 2026 15:49:58 -0400
Subject: [PATCH 02/25] Tom's April 9 edits of new and old lectures

---
 lectures/_static/quant-econ.bib   |   47 ++
 lectures/_toc.yml                 |    1 +
 lectures/imp_sample.md            |    3 +-
 lectures/ls_learning.bib          |   60 ++
 lectures/ls_learning.md           | 1305 +++++++++++++++++++++++++++++
 lectures/rational_learning_re.bib |   12 +
 lectures/rational_learning_re.md  |  166 +++-
 7 files changed, 1566 insertions(+), 28 deletions(-)
 create mode 100644 lectures/ls_learning.bib
 create mode 100644 lectures/ls_learning.md

diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 2dbdd0e89..414cde9ae 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -2841,6 +2841,53 @@ @article{MarcetMarimon1994
   year    = {2019}
 }
 
+@article{MarcetSargent1989jet,
+  author    = {Marcet, Albert and Sargent, Thomas J.},
+  title     = {Convergence of Least Squares Learning Mechanisms in
+               Self-Referential Linear Stochastic Models},
+  journal   = {Journal of Economic Theory},
+  year      = {1989},
+  volume    = {48},
+  number    = {2},
+  pages     = {337--368},
+  publisher = {Elsevier},
+  doi       = {10.1016/0022-0531(89)90032-X}
+}
+
+@article{Ljung1977,
+  author  = {Ljung, Lennart},
+  title   = {Analysis of Recursive Stochastic Algorithms},
+  journal = {IEEE Transactions on Automatic Control},
+  year    = {1977},
+  volume  = {22},
+  number  = {4},
+  pages   = {551--575},
+  doi     = {10.1109/TAC.1977.1101561}
+}
+
+@article{Evans1985,
+  author  = {Evans, George W.},
+  title   = {Expectational Stability and the Multiple Equilibria Problem
+             in Linear Rational Expectations Models},
+  journal = {Quarterly Journal of Economics},
+  year    = {1985},
+  volume  = {100},
+  number  = {4},
+  pages   = {1217--1233},
+  doi     = {10.2307/1885377}
+}
+
+@article{FourgeaudGourieroux1986,
+  author  = {Fourgeaud, Claude and Gourieroux, Christian and Pradel, Jacqueline},
+  title   = {Learning Procedures and Convergence to Rationality},
+  journal = {Econometrica},
+  year    = {1986},
+  volume  = {54},
+  number  = {4},
+  pages   = {845--868},
+  doi     = {10.2307/1912836}
+}
+
 @article{MarcetSargent1989,
   author    = {Marcet, Albert and Sargent, Thomas J},
   journal   = {Journal of Political Economy},
diff --git a/lectures/_toc.yml b/lectures/_toc.yml
index 7341e6771..74a694c97 100644
--- a/lectures/_toc.yml
+++ b/lectures/_toc.yml
@@ -142,6 +142,7 @@ parts:
   - file: harrison_kreps
   - file: morris_learn
   - file: rational_learning_re
+  - file: ls_learning
   - file: affine_risk_prices
   - file: ross_recovery
   - file: misspecified_recovery
diff --git a/lectures/imp_sample.md b/lectures/imp_sample.md
index 09ae67576..2ce766683 100644
--- a/lectures/imp_sample.md
+++ b/lectures/imp_sample.md
@@ -264,13 +264,12 @@ We next study the bias and efficiency of the Monte Carlo and importance sampling
 The code  below produces distributions of estimates using both Monte Carlo and importance sampling methods.
 
 ```{code-cell} ipython3
-@jit(parallel=True)
 def simulate(p_a, p_b, q_a, q_b, N_simu, T=1):
 
     μ_L_p = np.empty(N_simu)
     μ_L_q = np.empty(N_simu)
 
-    for i in prange(N_simu):
+    for i in range(N_simu):
         μ_L_p[i] = estimate(p_a, p_b, p_a, p_b, T=T)
         μ_L_q[i] = estimate(p_a, p_b, q_a, q_b, T=T)
 
diff --git a/lectures/ls_learning.bib b/lectures/ls_learning.bib
new file mode 100644
index 000000000..671215e99
--- /dev/null
+++ b/lectures/ls_learning.bib
@@ -0,0 +1,60 @@
+% BibTeX references for ls_learning.md
+% These entries have been added to _static/quant-econ.bib.
+% This file is kept as a local record.
+%
+% References already in quant-econ.bib before this lecture was written:
+%   BrayKreps1987, Bray1982, BraySavin1984, Frydman1982,
+%   Lucas_Prescott_1971, Cagan, Sargent1979, MarcetSargent1989 (JPE version)
+%
+% New entries added to quant-econ.bib for this lecture:
+%   MarcetSargent1989jet  (JET paper -- the main paper of the lecture)
+%   Ljung1977
+%   Evans1985
+%   FourgeaudGourieroux1986
+
+@article{MarcetSargent1989jet,
+  author    = {Marcet, Albert and Sargent, Thomas J.},
+  title     = {Convergence of Least Squares Learning Mechanisms in
+               Self-Referential Linear Stochastic Models},
+  journal   = {Journal of Economic Theory},
+  year      = {1989},
+  volume    = {48},
+  number    = {2},
+  pages     = {337--368},
+  publisher = {Elsevier},
+  doi       = {10.1016/0022-0531(89)90032-X}
+}
+
+@article{Ljung1977,
+  author  = {Ljung, Lennart},
+  title   = {Analysis of Recursive Stochastic Algorithms},
+  journal = {IEEE Transactions on Automatic Control},
+  year    = {1977},
+  volume  = {22},
+  number  = {4},
+  pages   = {551--575},
+  doi     = {10.1109/TAC.1977.1101561}
+}
+
+@article{Evans1985,
+  author  = {Evans, George W.},
+  title   = {Expectational Stability and the Multiple Equilibria Problem
+             in Linear Rational Expectations Models},
+  journal = {Quarterly Journal of Economics},
+  year    = {1985},
+  volume  = {100},
+  number  = {4},
+  pages   = {1217--1233},
+  doi     = {10.2307/1885377}
+}
+
+@article{FourgeaudGourieroux1986,
+  author  = {Fourgeaud, Claude and Gourieroux, Christian and Pradel, Jacqueline},
+  title   = {Learning Procedures and Convergence to Rationality},
+  journal = {Econometrica},
+  year    = {1986},
+  volume  = {54},
+  number  = {4},
+  pages   = {845--868},
+  doi     = {10.2307/1912836}
+}
diff --git a/lectures/ls_learning.md b/lectures/ls_learning.md
new file mode 100644
index 000000000..90b23cbc5
--- /dev/null
+++ b/lectures/ls_learning.md
@@ -0,0 +1,1305 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.17.1
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+(ls_learning)=
+```{raw} jupyter
+<div id="qe-notebook-header" align="right" style="text-align:right;">
+        <a href="https://quantecon.org/" title="quantecon.org">
+                <img style="width:250px;display:inline;" width="250px" src="https://assets.quantecon.org/img/qe-menubar-logo.svg" alt="QuantEcon">
+        </a>
+</div>
+```
+
+# Least Squares Learning in Self-Referential Models
+
+```{contents} Contents
+:depth: 2
+```
+
+## Overview
+
+This lecture is a companion to {doc}`rational_learning_re`, which presents the
+Bray–Kreps perspective on rational learning. 
+
+The present lecture examines the
+closely related but distinct question of whether **least squares** learning
+converges to a rational expectations equilibrium in self-referential models.
+
+
+This lecture presents the framework of {cite}`MarcetSargent1989jet` for studying
+**least squares learning** in a class of **self-referential** linear stochastic models.
+
+A self-referential model is one in which the **actual** law of motion for the
+economy depends on the **perceived** law of motion held by the agents within
+it. 
+
+In a rational expectations equilibrium (REE) the two coincide: the
+perceived and actual laws of motion are the same.
+
+But if agents start away
+from equilibrium and update their beliefs by running least squares regressions,
+will they converge to the REE?
+
+{cite}`MarcetSargent1989jet` answer this question by exploiting a powerful
+technique from systems-control engineering: the **differential equation
+approach** of {cite}`Ljung1977`.
+
+The key insight is that the stochastic
+difference equation describing how beliefs evolve can be approximated, in the
+limit, by a deterministic **ordinary differential equation** (ODE).
+
+Almost-sure
+convergence of least squares to the REE is then governed by the **local stability**
+of the REE as a rest point of that ODE (given a boundedness condition discussed below).
+
+The framework unifies and extends earlier work by {cite}`Bray1982` and
+{cite}`BraySavin1984` and connects naturally to the distinction between learning
+*within* a rational expectations equilibrium (Bayesian updating inside a
+correctly specified model) and learning *about* one (adapting an OLS estimator
+whose data-generating process shifts with beliefs) discussed in
+{cite}`BrayKreps1987`.
+
+
+
+Let's begin with the imports we'll use throughout.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.integrate import solve_ivp
+from matplotlib.gridspec import GridSpec
+
+np.random.seed(42)
+```
+
+We also define two helper functions used throughout the lecture: one to
+simulate recursive least squares in a scalar self-referential model, and one
+to solve the associated ODE.
+
+```{code-cell} ipython3
+def simulate_rls_scalar(T_map, sigma_u, beta0, T_periods=500, N_paths=100,
+                        a_seq=None, seed=0):
+    """
+    Simulate recursive least squares in a scalar self-referential model.
+
+    The perceived law of motion is:  z1_t = beta_t * z2_{t-1} + u_t
+    The actual law of motion is:     z1_t = T(beta_t) * z2_{t-1} + u_t   (i.e. V = 1 here)
+
+    For the scalar examples here z2_t = 1 (constant), so agents learn about
+    the mean of a process that depends on their own expectation.
+
+    Parameters
+    ----------
+    T_map    : callable, the mapping T: beta -> T(beta)
+    sigma_u  : float, std of innovations
+    beta0    : float, initial belief
+    T_periods: int, simulation length
+    N_paths  : int, number of Monte Carlo paths
+    a_seq    : None or array of length T_periods; weights alpha_t in the gain alpha_t/(t+1) (None -> all ones)
+    seed     : int, random seed
+
+    Returns
+    -------
+    beta_paths : ndarray, shape (N_paths, T_periods); entry [i, t] is the belief after the period-t update
+    """
+    rng = np.random.default_rng(seed)
+    if a_seq is None:
+        a_seq = np.ones(T_periods)          # alpha_t = 1 for all t: standard OLS
+
+    beta_paths = np.empty((N_paths, T_periods))
+
+    for i in range(N_paths):
+        beta = beta0
+        R = 1.0          # scalar moment estimate
+        prec = 1.0 / R   # NOTE(review): never read below; the updates work with R directly
+
+        for t in range(T_periods):
+            alpha_t = a_seq[t]
+            # z2 = 1 (constant regressor), so z2*z2' = 1
+            z2 = 1.0
+            u_t = rng.normal(0, sigma_u)
+
+            # Actual z1 is generated from the belief held before this period's update
+            z1 = T_map(beta) * z2 + u_t
+
+            # RLS update: R is updated first, then beta is updated using the new R
+            R = R + (alpha_t / (t + 1)) * (z2**2 - R / alpha_t)
+            R = max(R, 1e-8)  # floor keeps the division by R below well-defined
+            beta = beta + (alpha_t / (t + 1)) / R * z2 * (z1 - beta * z2)
+
+            beta_paths[i, t] = beta
+
+    return beta_paths
+
+
+def solve_ode(f_ode, beta0, t_span=(0, 80), n_points=1000):
+    """Solve scalar ODE d(beta)/dt = f_ode(beta) from beta0; return (times, beta values) on n_points evenly spaced times over t_span."""
+    sol = solve_ivp(lambda t, y: [f_ode(y[0])], t_span, [beta0],
+                    t_eval=np.linspace(*t_span, n_points), method='RK45',
+                    max_step=0.1)
+    return sol.t, sol.y[0]
+```
+
+## The Self-Referential Structure
+
+### Perceived and Actual Laws of Motion
+
+At each date $t$, agents hold a **perceived law of motion** summarised by a
+parameter matrix $\beta_t$.
+
+They believe that the variable $z_{1t}$ they care
+about evolves according to
+
+$$
+z_{1t} = \beta_t z_{2,t-1} + \eta_t ,
+$$ (eq:perceived_lom)
+
+where $z_{2t}$ is a vector of variables agents use to forecast $z_{1,t+1}$, and
+$\eta_t$ is orthogonal to all past $z_2$'s.
+
+Because agents optimise (or behave) on the basis of this belief, their actions
+feed back into the economy.
+
+The **actual** law of motion for the full state
+vector $z_t = (z_{1t}, z_{1t}^c)'$ is
+
+$$
+z_t = \begin{bmatrix} 0 & T(\beta_t) \\ A(\beta_t) & \end{bmatrix}
+      \begin{bmatrix} z_{2,t-1}^c \\ z_{2,t-1} \end{bmatrix}
+    + \begin{bmatrix} V(\beta_t) \\ B(\beta_t) \end{bmatrix} u_t ,
+$$ (eq:actual_lom)
+
+where $u_t$ is i.i.d. white noise with covariance $\Sigma$.
+
+The mapping $T$ is the key object: it maps the **perceived** coefficient $\beta$
+to the coefficient that **actually** governs $z_{1t}$ in equilibrium.
+
+A
+**rational expectations equilibrium** is a fixed point $\beta_f = T(\beta_f)$.
+
+### The Learning Scheme
+
+Agents update $\beta_t$ each period using **recursive least squares** (RLS).
+Define $R_t$ as a running estimate of the second-moment matrix $Ez_{2t}z_{2t}'$.
+
+
+Updating equations are
+
+$$
+\beta_t' = \beta_{t-1}' + \frac{\alpha_t}{t} R_{t-1}^{-1}
+           \Bigl\{ z_{2,t-2} z_{2,t-2}' \bigl[ T(\beta_{t-1})' - \beta_{t-1}' \bigr]
+         + z_{2,t-2} u_{t-1}' V(\beta_{t-1})' \Bigr\} ,
+$$ (eq:rls_beta)
+
+$$
+R_t = R_{t-1} + \frac{\alpha_t}{t} \bigl[ z_{2,t-1} z_{2,t-1}' - R_{t-1}/\alpha_t \bigr] ,
+$$ (eq:rls_R)
+
+where $\{\alpha_t\}$ is a positive, non-decreasing sequence with $\alpha_t \to 1$
+as $t \to \infty$.  When $\alpha_t = 1$ for all $t$, equations
+{eq}`eq:rls_beta`–{eq}`eq:rls_R` reduce to **ordinary least squares** updated
+recursively.
+
+```{note}
+As {cite}`BraySavin1984` and {cite}`BrayKreps1987` emphasise, the RLS algorithm
+cannot be derived from Bayes' rule applied to a correctly specified model, because
+during the learning transition the data-generating process is non-stationary —
+beliefs shift the equilibrium, which shifts the data.  The algorithm is
+"irrational" in the  sense that it acts as if the environment were stationary,
+when it is not.
+```
+
+## The Governing ODE
+
+### Ljung's Differential-Equation Approach
+
+{cite}`MarcetSargent1989jet` apply Ljung's theorem ({cite}`Ljung1977`) to
+characterise the almost-sure limiting behaviour of the stochastic system
+{eq}`eq:rls_beta`–{eq}`eq:rls_R`.
+
+The central result is that the **only possible limit points** of $\beta_t$ are
+fixed points of the ODE
+
+$$
+\frac{d\beta}{dt} = T(\beta) - \beta .
+$$ (eq:small_ode)
+
+This is the **small ODE** (equation (6) in {cite}`MarcetSargent1989jet`).
+
+Its
+fixed points are exactly the rational expectations equilibria.
+
+The full ODE system associated with the joint process $(\beta_t, R_t)$ is
+
+$$
+\frac{d}{dt}\begin{bmatrix} \beta' \\ R \end{bmatrix}
+= \begin{bmatrix} R^{-1} M_{z_2}(\beta)\,[T(\beta) - \beta]' \\ M_{z_2}(\beta) - R \end{bmatrix} ,
+$$ (eq:full_ode)
+
+where $M_{z_2}(\beta) = Ez_{2t}z_{2t}'$ evaluated at the stationary distribution
+induced by $\beta$.
+
+The fixed point of {eq}`eq:full_ode` is $(\beta_f, R_f)$
+where $R_f = M_{z_2}(\beta_f)$.
+
+### Stability Governs Convergence
+
+Let $\mathcal{M}$ be the Jacobian matrix of $T(\beta) - \beta$ evaluated at the
+REE $\beta_f$:
+
+$$
+\mathcal{M} = \frac{d\,\text{col}(T(\beta) - \beta)}{d\,\text{col}(\beta)'}\Bigg|_{\beta=\beta_f} .
+$$ (eq:jacobian)
+
+**Proposition 3** of {cite}`MarcetSargent1989jet` establishes that the Jacobian of
+the full system {eq}`eq:full_ode` at $(\beta_f, R_f)$ has $n_2^2$ repeated
+eigenvalues equal to $-1$ (from the $R$ equation), plus the eigenvalues of
+$\mathcal{M}$ (from the $\beta$ equation).
+
+Consequently:
+
+* If all eigenvalues of $\mathcal{M}$ have **strictly negative real parts**, both
+  {eq}`eq:small_ode` and {eq}`eq:full_ode` are locally stable.  Under suitable
+  boundedness conditions, Proposition 1 guarantees $\beta_t \to \beta_f$ **almost
+  surely**.
+
+* If any eigenvalue of $\mathcal{M}$ has **positive real part**, then
+  $P(\beta_t \to \beta_f) = 0$ — convergence is **impossible**.
+
+The stability condition $\text{Re}(\lambda_i(\mathcal{M})) < 0$ for all $i$ is
+what the E-stability literature (see {cite}`Evans1985`) calls **E-stability**: the
+REE is a stable rest point of the "expectational dynamics" $\dot\beta = T(\beta) - \beta$.
+
+### The Projection Facility
+
+E-stability is necessary but not quite sufficient for almost-sure convergence.
+
+Ljung's theorem requires the sample path $(\beta_t, R_t)$ to remain in a
+**bounded region** with probability one (assumptions A.6–A.7 of
+{cite}`MarcetSargent1989jet`).
+
+This boundedness is the job of the **projection
+facility**.
+
+#### What the Projection Facility Does
+
+The full learning algorithm augments the plain RLS update with two nested sets
+$D_1 \supset D_2$ in $(\beta, R)$-space.
+
+After each unconstrained RLS step
+produces a candidate $(\tilde\beta_t, \tilde R_t)$, the projection facility
+enforces:
+
+$$
+(\beta_t, R_t) = \begin{cases}
+  (\tilde\beta_t,\, \tilde R_t) & \text{if } (\tilde\beta_t, \tilde R_t) \in D_1 , \\
+  \text{some point in } D_2     & \text{otherwise.}
+\end{cases}
+$$ (eq:projection)
+
+The set $D_1$ is chosen so that the model remains well-defined (e.g., $R_t$
+stays positive definite; $\beta_t$ stays in a region where $T(\beta)$ is
+well-defined and the state process is covariance-stationary).
+
+The set $D_2
+\subset D_1$ is a slightly smaller "safe" region to which the algorithm is
+retracted whenever it threatens to leave $D_1$.
+
+The facility can be thought of as forcing agents to **discard observations that
+are inconsistent with their priors** — a form of bounded rationality that is
+necessary for the mathematical argument but innocuous in practice.
+
+#### Why It Is Needed
+
+Without the projection facility, the stochastic path $(\beta_t, R_t)$ might
+temporarily wander to regions where the system {eq}`eq:actual_lom` is
+non-stationary (e.g., an explosive VAR).
+
+Ljung's convergence theorem requires
+the algorithm to revisit a compact set infinitely often; the projection facility
+guarantees this by construction.
+
+Formally, {cite}`MarcetSargent1989jet` require that the ODE trajectories
+originating in $D_1$ point **inward** at the boundary $\partial D_1$ — that is,
+the vector field $T(\beta) - \beta$ must point back into $D_1$ everywhere on its
+boundary.
+
+When this holds (Assumption A.7.2), the projection is **invoked only
+finitely many times** with probability one, and after the last invocation the
+algorithm runs as plain RLS.
+
+Corollary 1 of {cite}`MarcetSargent1989jet`
+formalises this: either $\beta_t \to \beta_f$ a.s., or $\beta_t$ clusters on the
+boundary $\partial D_1 \setminus D_2$ — but the latter event has probability zero
+when the ODE trajectories point inward.
+
+#### The Exogenous-Regressor Case (Corollary 2)
+
+When the regressors $z_{2t}$ are **exogenous** — so that $M_{z_2}(\beta) \equiv M$
+does not depend on $\beta$ — a particularly clean sufficient condition for
+convergence is available (Corollary 2 of {cite}`MarcetSargent1989jet`):
+
+$$
+\text{all eigenvalues of } H(\beta) \equiv \frac{d\,\text{col}[T(\beta) - \beta]}{d\,\text{col}(\beta)'}
+\text{ have real parts} < 0 \quad \forall\, |\beta - \beta_f| \leq K .
+$$ (eq:corollary2_cond)
+
+Under this condition one can take $D_1$ to be a ball of radius $K$ around
+$\beta_f$, and the boundary condition is automatically satisfied.
+
+For all four
+scalar examples in this lecture, $H(\beta) = \mathcal{M}$ is constant, so
+Corollary 2 reduces simply to E-stability.
+
+```{note}
+In the scalar examples studied here (Bray, Bray–Savin, present-value model), the
+state $z_{2t} = 1$ is a constant regressor, so $M_{z_2} = 1$ is trivially
+exogenous.  For the investment model with endogenous regressors, verifying the
+boundary condition on $D_1$ is much harder and may require numerical solution of
+the ODE on a grid of boundary points.
+```
+
+#### Simulating the Projection Facility
+
+The following code demonstrates the projection facility at work.
+
+We use Bray's
+model with $b = 0.6$ and deliberately start $\beta_0$ far from $\beta_f$,
+imposing a projection set $D_1 = \{|\beta| < K\}$ with $K = 5$.
+
+We track how
+often the facility is invoked and show that after a finite number of
+interventions, the path converges normally.
+
+```{code-cell} ipython3
+def simulate_rls_with_projection(T_map, sigma_u, beta0, K_proj,
+                                 T_periods=500, N_paths=50, seed=0):
+    """
+    Simulate RLS (constant regressor z2 = 1, unit gain weights) with a scalar projection facility.
+    T_map: callable beta -> T(beta); sigma_u: innovation std; beta0: initial belief; K_proj: bound defining D1.
+    The facility keeps updated beliefs in D1 = [-K_proj, K_proj].  Whenever the unconstrained
+    update would push beta outside this interval, beta is retracted to 0
+    (an arbitrary point in D2 = {|beta| <= K_proj/2}).  beta0 itself is never projected.
+
+    Returns
+    -------
+    beta_paths      : (N_paths, T_periods) array of belief paths
+    n_projections   : (N_paths,) array counting projection invocations per path
+    first_proj_free : (N_paths,) array, index of last projection + 1 (0 if the path was never projected)
+    """
+    rng = np.random.default_rng(seed)
+    beta_paths    = np.empty((N_paths, T_periods))
+    n_projections = np.zeros(N_paths, dtype=int)
+    last_proj     = np.full(N_paths, -1, dtype=int)  # -1 marks "no projection so far"
+
+    for i in range(N_paths):
+        beta = beta0
+        R    = 1.0
+
+        for t in range(T_periods):
+            u_t = rng.normal(0, sigma_u)
+            z1  = T_map(beta) + u_t          # z2 = 1 (constant regressor)
+
+            # Unconstrained RLS update (beta uses the just-updated R_new)
+            R_new    = R    + (1.0 / (t + 1)) * (1.0 - R)
+            beta_new = beta + (1.0 / (t + 1)) / R_new * (z1 - beta)
+
+            # Projection facility: if the candidate leaves D1, retract it to 0 (a point in D2)
+            if abs(beta_new) > K_proj:
+                beta_new = 0.0           # 0 lies inside D2 = {|beta| <= K_proj/2}
+                n_projections[i] += 1
+                last_proj[i] = t
+
+            beta = beta_new
+            R    = max(R_new, 1e-8)  # floor applied after beta_new was computed from R_new
+            beta_paths[i, t] = beta
+
+    # First period after which no further projections occur
+    first_proj_free = last_proj + 1   # -1 + 1 = 0 if never projected
+
+    return beta_paths, n_projections, first_proj_free
+
+
+# Run the simulation
+a_bray_pf, b_bray_pf, sigma_pf = 1.0, 0.6, 1.5
+T_bray_pf  = lambda beta: a_bray_pf + b_bray_pf * beta
+beta_f_pf  = a_bray_pf / (1 - b_bray_pf)
+beta0_far  = 8.0    # well outside D1 = {|beta| < 5}
+K_pf       = 5.0
+T_pf_sim   = 600
+N_pf_sim   = 80
+
+paths_pf, n_proj, first_free = simulate_rls_with_projection(
+    T_bray_pf, sigma_pf, beta0_far, K_pf,
+    T_periods=T_pf_sim, N_paths=N_pf_sim)
+
+# Also run without projection for comparison
+paths_no_pf = simulate_rls_scalar(
+    T_bray_pf, sigma_pf, beta0_far,
+    T_periods=T_pf_sim, N_paths=N_pf_sim, seed=0)
+
+fig = plt.figure(figsize=(15, 10))
+gs  = GridSpec(2, 2, figure=fig)
+
+# Top left: paths with projection
+ax1 = fig.add_subplot(gs[0, 0])
+for i in range(min(30, N_pf_sim)):
+    ax1.plot(paths_pf[i], color='steelblue', alpha=0.25, lw=0.8)
+ax1.plot(np.mean(paths_pf, axis=0), color='navy', lw=2, label='average')
+ax1.axhline(beta_f_pf, color='red', ls='--', lw=1.5,
+            label=f'$\\beta_f={beta_f_pf:.1f}$')
+ax1.axhline( K_pf, color='gray', ls=':', lw=1.2, label=f'$D_1$ boundary ($K={K_pf}$)')
+ax1.axhline(-K_pf, color='gray', ls=':', lw=1.2)
+ax1.set_title('With Projection Facility ($\\beta_0=8$, $K=5$)')
+ax1.set_xlabel('$t$'); ax1.set_ylabel('$\\beta_t$'); ax1.legend(fontsize=8)
+
+# Top right: paths without projection
+ax2 = fig.add_subplot(gs[0, 1])
+for i in range(min(30, N_pf_sim)):
+    ax2.plot(paths_no_pf[i], color='darkorange', alpha=0.25, lw=0.8)
+ax2.plot(np.mean(paths_no_pf, axis=0), color='saddlebrown', lw=2, label='average')
+ax2.axhline(beta_f_pf, color='red', ls='--', lw=1.5,
+            label=f'$\\beta_f={beta_f_pf:.1f}$')
+ax2.set_title('Without Projection Facility ($\\beta_0=8$)')
+ax2.set_xlabel('$t$'); ax2.set_ylabel('$\\beta_t$'); ax2.legend(fontsize=8)
+
+# Bottom left: histogram of projection counts
+ax3 = fig.add_subplot(gs[1, 0])
+ax3.hist(n_proj, bins=range(0, int(n_proj.max()) + 2),
+         color='steelblue', edgecolor='white', alpha=0.8)
+ax3.set_xlabel('Number of projections invoked')
+ax3.set_ylabel('Number of paths')
+ax3.set_title('Distribution of Projection Invocations\n'
+              '(finite a.s. — Corollary 1)')
+
+# Bottom right: period of last projection
+ax4 = fig.add_subplot(gs[1, 1])
+ax4.hist(first_free[n_proj > 0], bins=20,
+         color='darkorange', edgecolor='white', alpha=0.8)
+ax4.set_xlabel('Last period with a projection')
+ax4.set_ylabel('Number of paths')
+ax4.set_title('After the Last Projection, RLS Runs Freely\n'
+              '(projection invoked only finitely many times)')
+
+plt.tight_layout()
+plt.show()
+
+print(f"Paths with at least one projection: {(n_proj > 0).sum()} / {N_pf_sim}")
+print(f"Mean number of projections per path: {n_proj.mean():.2f}")
+print(f"Max number of projections:           {n_proj.max()}")
+print(f"Mean last-projection period:         {first_free[n_proj>0].mean():.1f}")
+```
+
+The simulation illustrates the key theoretical point from Corollary 1: the
+projection is invoked only a **finite number of times** on almost every sample
+path.  After the last invocation the algorithm runs as unconstrained RLS and
+converges to $\beta_f$ at the usual rate.  The projection does not bias the
+asymptotic estimate — it merely provides the boundedness guarantee that Ljung's
+theorem requires.
+
+## Four Illustrative Examples
+
+We now work through four examples from Section 4 of {cite}`MarcetSargent1989jet`,
+computing the ODE, finding the REE, checking E-stability, and simulating the RLS
+learning path.
+
+### Example 1: Bray's Cobweb Model
+
+{cite}`Bray1982` studied a simple cobweb economy in which the equilibrium price
+satisfies
+
+$$
+p_t = a + b \beta_t + \tilde{u}_t ,
+$$ (eq:bray_price)
+
+where $\beta_t$ is agents' OLS estimate of the price (their point forecast of
+$p_t$), and $\tilde{u}_t$ is i.i.d. noise with mean zero and variance
+$\sigma_u^2$.
+
+The mapping $T$ is simply $T(\beta) = a + b\beta$.  The REE is
+
+$$
+\beta_f = \frac{a}{1 - b} , \quad |b| < 1 .
+$$ (eq:bray_ree)
+
+The small ODE is
+
+$$
+\dot\beta = T(\beta) - \beta = a + b\beta - \beta = a - (1-b)\beta ,
+$$ (eq:bray_ode)
+
+which has the unique fixed point $\beta_f = a/(1-b)$.
+
+Its Jacobian is
+$\mathcal{M} = b - 1$, which is negative whenever $b < 1$, so the REE is E-stable and RLS
+converges almost surely.  When $b > 1$, $\mathcal{M} > 0$ and convergence fails.
+
+### Example 2: Bray–Savin Supply-Shifter Model
+
+{cite}`BraySavin1984` studied a model where
+
+$$
+p_t = x_t'(m + a\beta_{t-1}) + \tilde{u}_t , \quad p_t^e = x_t'\beta_{t-1} ,
+$$ (eq:bs_price)
+
+with $x_t$ an exogenous supply-shifter, $a$ a scalar feedback parameter, and
+agents running an OLS regression of $p$ on $x$.
+
+The mapping is $T(\beta) = m + a\beta$ (scalar case), giving
+
+$$
+\dot\beta = (a-1)\beta + m , \quad \beta_f = \frac{m}{1-a} ,
+$$ (eq:bs_ode)
+
+with Jacobian $\mathcal{M} = a - 1 < 0$ iff $a < 1$.
+
+### Example 3: Hyperinflation / Asset Prices (Fourgeaud–Gourieroux–Pradel)
+
+Consider the present-value asset pricing model
+
+$$
+y_t = \lambda E_t y_{t+1} + x_t , \quad x_t = \rho x_{t-1} + \varepsilon_t ,
+$$ (eq:pv_model)
+
+where $|\lambda| < 1$, $|\rho| < 1$, and agents perceive $y_t = \beta_t x_{t-1}+ v_t$.
+ 
+The mapping is $T(\beta) = (\lambda\beta + 1)\rho$ and the REE is
+
+$$
+\beta_f = \frac{\rho}{1 - \lambda\rho} .
+$$ (eq:pv_ree)
+
+The small ODE is
+
+$$
+\dot\beta = (\lambda\rho - 1)\beta + \rho ,
+$$ (eq:pv_ode)
+
+with Jacobian $\mathcal{M} = \lambda\rho - 1 < 0$ for $|\lambda\rho| < 1$, so
+convergence is guaranteed.
+
+### Example 4: Investment under Uncertainty (Self-Referential with Endogenous Regressors)
+
+In Sargent's version of the Lucas–Prescott investment model, agents learn about the
+aggregate capital stock $K_t$ by regressing on $(K_{t-1}, w_{t-1})$ where $w_t$
+is an exogenous cost shock.
+
+The perceived law of motion is
+
+$$
+K_t = \beta_1 K_{t-1} + \beta_2 w_{t-1} + \eta_t ,
+$$
+
+while the actual law (from firms' optimal investment decisions and market clearing) is
+
+$$
+K_t = T_1(\beta) K_{t-1} + T_2(\beta) w_{t-1} + V(\beta) u_t ,
+$$ (eq:inv_actual)
+
+where the nonlinear mappings $T_1, T_2$ come from solving the firms' linear
+quadratic control problems.
+
+The small ODE decomposes as:
+
+$$
+\dot\beta_1 = T_1(\beta_1) - \beta_1 , \quad
+\dot\beta_2 = T_2(\beta_1, \beta_2) - \beta_2 ,
+$$ (eq:inv_ode)
+
+and E-stability can be verified analytically for $|\beta_1| < b^{-1/2}$ (where
+$b$ is the discount factor).
+
+## Simulating the Learning Dynamics
+
+We now simulate the three scalar examples numerically, plotting both the ODE solution
+(continuous-time approximation) and the sample paths of $\beta_t$ under RLS.
+
+### Bray's Model
+
+```{code-cell} ipython3
+# ------------------------------------------------------------------
+# Bray's cobweb model: T(beta) = a + b*beta,  REE = a/(1-b)
+# ------------------------------------------------------------------
+a_bray, b_bray, sigma_bray = 1.0, 0.6, 1.0
+T_bray = lambda beta: a_bray + b_bray * beta
+beta_f_bray = a_bray / (1 - b_bray)
+
+beta0_bray = 0.0   # start well below the REE
+T_sim = 400
+N_sim = 80
+
+beta_paths_bray = simulate_rls_scalar(T_bray, sigma_bray, beta0_bray,
+                                      T_periods=T_sim, N_paths=N_sim)
+
+# ODE solution for two starting values
+ode_bray = lambda beta: a_bray + b_bray * beta - beta
+t_ode, sol_low  = solve_ode(ode_bray, 0.0)
+_,     sol_high = solve_ode(ode_bray, 4.5)
+
+fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+
+ax = axes[0]
+for i in range(min(30, N_sim)):
+    ax.plot(beta_paths_bray[i], color='steelblue', alpha=0.25, lw=0.8)
+ax.plot(np.mean(beta_paths_bray, axis=0), color='navy', lw=2,
+        label='cross-path average')
+ax.axhline(beta_f_bray, color='red', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_bray:.2f}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta_t$')
+ax.set_title("Bray's Model: RLS Paths ($b=0.6$)")
+ax.legend()
+
+ax = axes[1]
+ax.plot(t_ode, sol_low,  color='steelblue', lw=2, label='ODE from $\\beta_0=0$')
+ax.plot(t_ode, sol_high, color='darkorange', lw=2, label='ODE from $\\beta_0=4.5$')
+ax.axhline(beta_f_bray, color='red', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_bray:.2f}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta(t)$')
+ax.set_title("Bray's Model: ODE Trajectories")
+ax.legend()
+
+plt.tight_layout()
+plt.show()
+print(f"REE: beta_f = a/(1-b) = {beta_f_bray:.4f}")
+print(f"Jacobian M = b - 1 = {b_bray - 1:.4f}  (< 0: E-stable)")
+```
+
+### Bray–Savin Model
+
+```{code-cell} ipython3
+# ------------------------------------------------------------------
+# Bray–Savin: T(beta) = m + a*beta,  REE = m/(1-a)
+# ------------------------------------------------------------------
+m_bs, a_bs, sigma_bs = 0.5, 0.7, 1.0
+T_bs = lambda beta: m_bs + a_bs * beta
+beta_f_bs = m_bs / (1 - a_bs)
+
+beta_paths_bs = simulate_rls_scalar(T_bs, sigma_bs, 0.0,
+                                    T_periods=T_sim, N_paths=N_sim)
+
+ode_bs = lambda beta: T_bs(beta) - beta
+t_ode_bs, sol_bs_low  = solve_ode(ode_bs, 0.0)
+_,         sol_bs_high = solve_ode(ode_bs, 4.0)
+
+fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+
+ax = axes[0]
+for i in range(min(30, N_sim)):
+    ax.plot(beta_paths_bs[i], color='darkorange', alpha=0.25, lw=0.8)
+ax.plot(np.mean(beta_paths_bs, axis=0), color='saddlebrown', lw=2,
+        label='cross-path average')
+ax.axhline(beta_f_bs, color='red', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_bs:.2f}$')
+ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+ax.set_title('Bray–Savin Model: RLS Paths ($a=0.7$)')
+ax.legend()
+
+ax = axes[1]
+ax.plot(t_ode_bs, sol_bs_low,  color='darkorange', lw=2, label='ODE from $\\beta_0=0$')
+ax.plot(t_ode_bs, sol_bs_high, color='steelblue',  lw=2, label='ODE from $\\beta_0=4$')
+ax.axhline(beta_f_bs, color='red', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_bs:.2f}$')
+ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta(t)$')
+ax.set_title('Bray–Savin Model: ODE Trajectories')
+ax.legend()
+
+plt.tight_layout()
+plt.show()
+print(f"REE: beta_f = m/(1-a) = {beta_f_bs:.4f}")
+print(f"Jacobian M = a - 1 = {a_bs - 1:.4f}  (< 0: E-stable)")
+```
+
+### Present-Value / Hyperinflation Model
+
+```{code-cell} ipython3
+# ------------------------------------------------------------------
+# Present-value model: T(beta) = (lambda*beta + 1)*rho
+# REE = rho / (1 - lambda*rho)
+# ------------------------------------------------------------------
+lam, rho_pv, sigma_pv = 0.8, 0.9, 1.0
+T_pv = lambda beta: (lam * beta + 1) * rho_pv
+beta_f_pv = rho_pv / (1 - lam * rho_pv)
+
+beta_paths_pv = simulate_rls_scalar(T_pv, sigma_pv, 0.0,
+                                    T_periods=T_sim, N_paths=N_sim)
+
+ode_pv = lambda beta: T_pv(beta) - beta
+t_ode_pv, sol_pv_low  = solve_ode(ode_pv, 0.0, t_span=(0, 50))
+_,         sol_pv_high = solve_ode(ode_pv, 10.0, t_span=(0, 50))
+
+fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+
+ax = axes[0]
+for i in range(min(30, N_sim)):
+    ax.plot(beta_paths_pv[i], color='seagreen', alpha=0.25, lw=0.8)
+ax.plot(np.mean(beta_paths_pv, axis=0), color='darkgreen', lw=2,
+        label='cross-path average')
+ax.axhline(beta_f_pv, color='red', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_pv:.2f}$')
+ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+ax.set_title('Present-Value Model: RLS Paths')
+ax.legend()
+
+ax = axes[1]
+ax.plot(t_ode_pv, sol_pv_low,  color='seagreen',  lw=2, label='ODE from $\\beta_0=0$')
+ax.plot(t_ode_pv, sol_pv_high, color='steelblue', lw=2, label='ODE from $\\beta_0=10$')
+ax.axhline(beta_f_pv, color='red', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_pv:.2f}$')
+ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta(t)$')
+ax.set_title('Present-Value Model: ODE Trajectories')
+ax.legend()
+
+plt.tight_layout()
+plt.show()
+print(f"REE: beta_f = rho/(1 - lambda*rho) = {beta_f_pv:.4f}")
+print(f"Jacobian M = lambda*rho - 1 = {lam*rho_pv - 1:.4f}  (< 0: E-stable)")
+```
+
+### Instability When E-Stability Fails
+
+To see what happens when E-stability is violated, we repeat Bray's model with $b > 1$.
+
+```{code-cell} ipython3
+# ------------------------------------------------------------------
+# Unstable case: Bray's model with b > 1
+# ------------------------------------------------------------------
+b_unstable = 1.4
+T_unstable = lambda beta: a_bray + b_unstable * beta
+beta_f_unstable = a_bray / (1 - b_unstable)   # negative
+
+beta_paths_unstable = simulate_rls_scalar(
+    T_unstable, sigma_bray, beta0=0.0,
+    T_periods=200, N_paths=50)
+
+ode_unstable = lambda beta: T_unstable(beta) - beta
+
+# Phase diagram: plot drift for beta in [-5, 5]
+beta_grid = np.linspace(-5, 5, 300)
+drift = np.array([ode_unstable(b) for b in beta_grid])
+
+fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+
+ax = axes[0]
+for i in range(min(30, 50)):
+    ax.plot(beta_paths_unstable[i], color='crimson', alpha=0.3, lw=0.8)
+ax.axhline(beta_f_unstable, color='black', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_unstable:.2f}$ (unstable)')
+ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+ax.set_title('Bray Model with $b=1.4$: RLS Diverges')
+ax.legend()
+
+ax = axes[1]
+ax.plot(beta_grid, drift, color='crimson', lw=2)
+ax.axhline(0, color='black', lw=0.8)
+ax.axvline(beta_f_unstable, color='black', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_unstable:.2f}$')
+ax.fill_between(beta_grid, drift, 0,
+                where=(drift > 0), color='crimson', alpha=0.15)
+ax.fill_between(beta_grid, drift, 0,
+                where=(drift < 0), color='steelblue', alpha=0.15)
+ax.set_xlabel('$\\beta$'); ax.set_ylabel('$T(\\beta) - \\beta$')
+ax.set_title('Phase Diagram: Drift Points Away from REE')
+ax.legend()
+
+plt.tight_layout()
+plt.show()
+print(f"Jacobian M = b - 1 = {b_unstable - 1:.2f}  (> 0: NOT E-stable)")
+```
+
+## Phase Diagrams and E-Stability
+
+The E-stability condition has a clean geometric interpretation.  At the REE
+$\beta_f$, the small ODE {eq}`eq:small_ode` must have trajectories **pointing
+inward**.
+
+This requires the slope $dT/d\beta - 1$ to be **negative** at $\beta_f$.
+
+The figure below plots the phase diagrams for all three scalar examples side by
+side.
+
+```{code-cell} ipython3
+beta_vec = np.linspace(-1.0, 5.5, 400)
+
+models = [
+    ("Bray ($b=0.6$)",       lambda b: a_bray + 0.6*b - b,   a_bray/(1-0.6),   'steelblue'),
+    ("Bray–Savin ($a=0.7$)", lambda b: m_bs + 0.7*b - b,     m_bs/(1-0.7),     'darkorange'),
+    ("Present-value",        lambda b: T_pv(b) - b,           beta_f_pv,        'seagreen'),
+]
+
+fig, axes = plt.subplots(1, 3, figsize=(15, 5))
+
+for ax, (name, ode_fn, bf, color) in zip(axes, models):
+    drift = np.array([ode_fn(b) for b in beta_vec])
+    ax.plot(beta_vec, drift, color=color, lw=2)
+    ax.axhline(0, color='black', lw=0.8)
+    ax.axvline(bf, color='red', ls='--', lw=1.5, label=f'$\\beta_f={bf:.2f}$')
+    ax.fill_between(beta_vec, drift, 0, where=(drift > 0),
+                    color=color, alpha=0.12)
+    ax.fill_between(beta_vec, drift, 0, where=(drift < 0),
+                    color=color, alpha=0.12)
+    # Draw arrows showing direction of drift
+    for bv in np.linspace(beta_vec[20], beta_vec[-20], 7):
+        d = ode_fn(bv)
+        ax.annotate('', xy=(bv + 0.3*np.sign(d), 0),
+                    xytext=(bv, 0),
+                    arrowprops=dict(arrowstyle='->', color=color, lw=1.5))
+    ax.set_xlabel('$\\beta$')
+    ax.set_ylabel('$T(\\beta) - \\beta$')
+    ax.set_title(name)
+    ax.legend(fontsize=9)
+
+plt.suptitle('Phase Diagrams of the Small ODE $\\dot{\\beta} = T(\\beta) - \\beta$',
+             y=1.01, fontsize=13)
+plt.tight_layout()
+plt.show()
+```
+
+## Two-Dimensional Example: The Investment Model
+
+The investment-under-uncertainty example is two-dimensional and highlights how
+E-stability of the composite map $T(\beta) = (T_1(\beta_1), T_2(\beta_1, \beta_2))$
+works when the ODE is recursive.
+
+```{code-cell} ipython3
+def T_invest(beta, b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, rho_w=0.5):
+    """
+    Mapping T for the investment model (scalar version of equations 11 in
+    Marcet–Sargent 1989).
+
+    beta = [beta1, beta2]
+    T1(beta1) = (1 - beta1*b) / (1 - beta1*b + d^{-1} f^2 A1 N)
+    T2(beta1, beta2) = -N/(d*(1-rho_w*b)) * (1 - beta1*b + f^2 A1 beta2 b*rho_w)
+                       / (1 - beta1*b + d^{-1} f^2 A1 N) * rho_w
+    """
+    b1, b2 = beta
+    denom1 = 1 - b1*b + (1/d)*f**2*A1*N
+    T1 = (1 - b1*b) / denom1
+    numer2 = (1 - b1*b + f**2*A1*b2*b*rho_w)
+    T2 = (-N / (d*(1 - rho_w*b))) * (numer2 / denom1) * rho_w
+    return np.array([T1, T2])
+
+
+def ode_invest(t, beta, **kwargs):
+    Tb = T_invest(beta, **kwargs)
+    return Tb - beta
+
+
+# REE: solve T(beta) = beta numerically
+from scipy.optimize import fsolve
+
+params = dict(b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, rho_w=0.5)
+beta_f_inv = fsolve(lambda b: T_invest(b, **params) - b, [0.5, 0.1])
+print(f"REE: beta_f = {beta_f_inv}")
+
+# Check E-stability via Jacobian
+from numpy import linalg as la
+
+eps = 1e-6
+J = np.zeros((2, 2))
+for j in range(2):
+    e = np.zeros(2); e[j] = eps
+    J[:, j] = (T_invest(beta_f_inv + e, **params) -
+               T_invest(beta_f_inv - e, **params)) / (2*eps)
+M = J - np.eye(2)
+eigs = la.eigvals(M)
+print(f"Jacobian M eigenvalues: {eigs}")
+print(f"E-stable: {np.all(eigs.real < 0)}")
+
+# Solve ODE from several initial conditions
+fig, ax = plt.subplots(figsize=(8, 6))
+
+# Plot the vector field
+b1_grid = np.linspace(-0.1, 1.2, 20)
+b2_grid = np.linspace(-0.8, 0.5, 20)
+B1, B2 = np.meshgrid(b1_grid, b2_grid)
+U = np.zeros_like(B1); V_field = np.zeros_like(B2)
+for i in range(B1.shape[0]):
+    for j in range(B1.shape[1]):
+        beta_ij = np.array([B1[i,j], B2[i,j]])
+        drift = T_invest(beta_ij, **params) - beta_ij
+        U[i,j] = drift[0]; V_field[i,j] = drift[1]
+
+speed = np.sqrt(U**2 + V_field**2)
+speed[speed == 0] = 1e-8
+ax.streamplot(b1_grid, b2_grid, U, V_field, color=speed,
+              cmap='Blues', density=1.3, linewidth=1)
+
+# Plot ODE trajectories from several starts (solve_ivp has no `kwargs` option,
+# so the model parameters are bound to ode_invest through a closure)
+starts = [(0.1, 0.0), (0.9, 0.4), (1.1, -0.6), (0.3, -0.7)]
+colors_traj = ['red', 'darkorange', 'green', 'purple']
+for (b10, b20), col in zip(starts, colors_traj):
+    sol = solve_ivp(lambda t, y: ode_invest(t, y, **params), [0, 30],
+                    [b10, b20], t_eval=np.linspace(0, 30, 300), method='RK45')
+    ax.plot(sol.y[0], sol.y[1], color=col, lw=2)
+    ax.plot(b10, b20, 'o', color=col, ms=7)
+
+ax.plot(*beta_f_inv, 'k*', ms=14, label=f'REE $\\beta_f$')
+ax.set_xlabel('$\\beta_1$', fontsize=12)
+ax.set_ylabel('$\\beta_2$', fontsize=12)
+ax.set_title('Investment Model: Phase Portrait of $\\dot{\\beta} = T(\\beta) - \\beta$')
+ax.legend()
+plt.tight_layout()
+plt.show()
+```
+
+## Necessary Condition: Only REE Can Be Limit Points
+
+Proposition 2(i) of {cite}`MarcetSargent1989jet` shows that **non-REE limit points
+have probability zero**: for any $\hat\beta \neq \beta_f$ in the interior of the
+domain,
+
+$$
+P(\beta_t \to \hat\beta) = 0 .
+$$
+
+This is a converse: RLS either converges to the REE or it diverges.
+
+It
+cannot converge to a non-equilibrium fixed point.
+
+The following simulation makes this vivid by starting agents from an initial
+belief that is not a fixed point of $T$: $\beta_0 = 3$, for which $T(\beta_0) = 2.8 \neq \beta_0$.
+
+```{code-cell} ipython3
+# Illustration: starting near a non-fixed-point of T still sends beta to beta_f
+# (Bray model, stable case b=0.6)
+beta_false_rest = 3.0   # T(3.0) = 1 + 0.6*3 = 2.8 ≠ 3
+paths_from_false = simulate_rls_scalar(
+    T_bray, sigma_bray, beta0=beta_false_rest,
+    T_periods=300, N_paths=60, seed=7)
+
+fig, ax = plt.subplots(figsize=(10, 5))
+for i in range(60):
+    ax.plot(paths_from_false[i], color='steelblue', alpha=0.2, lw=0.8)
+ax.plot(np.mean(paths_from_false, axis=0), color='navy', lw=2,
+        label='cross-path average')
+ax.axhline(beta_f_bray, color='red', ls='--', lw=1.5,
+           label=f'REE $\\beta_f = {beta_f_bray:.2f}$')
+ax.axhline(beta_false_rest, color='gray', ls=':', lw=1.5,
+           label=f'False start $\\beta_0 = {beta_false_rest}$')
+ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+ax.set_title('RLS from a Non-Equilibrium Start Always Converges to the REE\n'
+             '(Proposition 2(i): only the REE is a possible limit point)')
+ax.legend()
+plt.tight_layout()
+plt.show()
+```
+
+## Connection to Rational Learning 
+
+The {cite}`MarcetSargent1989jet` framework belongs to the programme of learning
+*about* a rational expectations equilibrium, as distinct from learning *within*
+one — a distinction emphasised by {cite}`BrayKreps1987`.
+
+**Learning *within* an REE** (the subject of the companion lecture
+{doc}`rational_learning_re`) refers to Bayesian inference inside a correctly
+specified model.
+
+In that setting the data-generating process is stationary from
+the agent's perspective, and Bayes' rule is fully rationalized.
+
+**Learning *about* an REE** — the present lecture's topic — involves an agent who
+does not know the equilibrium price function.
+
+Because the agent's beliefs shift
+the equilibrium price, the data the agent uses to update beliefs are themselves
+generated by a non-stationary process.
+
+As {cite}`MarcetSargent1989jet` note (p.
+338, footnote 2):
+
+> *"The models do not incorporate fully optimal behavior or rational expectations,
+> because agents operate under the continually falsified assumption that the law of
+> motion is time invariant and known for sure."*
+
+This "continually falsified" assumption is precisely the sense in which the RLS
+algorithm cannot be derived from Bayesian rationality applied to a correctly
+specified model.
+
+It is nonetheless a compelling learning rule because it is
+consistent, computationally tractable, and — when E-stability holds — converges to
+the REE despite the misspecification.
+
+The E-stability condition thus plays the same role in this literature that the
+prior-support condition plays in the Bayesian learning literature: it tells us
+when the learning algorithm can find its way to the equilibrium.
+
+
+## Summary
+
+This lecture has presented the {cite}`MarcetSargent1989jet` framework for analysing
+least squares learning in self-referential linear stochastic models.
+
+Key takeaways:
+
+1. **Self-referential structure**: the actual law of motion depends on the
+   perceived law of motion through the mapping $T$.  A rational expectations
+   equilibrium is a fixed point $\beta_f = T(\beta_f)$.
+
+2. **Recursive least squares**: agents update their beliefs by running RLS,
+   which is adaptive but not fully Bayesian — it rests on the "continually
+   falsified" assumption that the environment is stationary.
+
+3. **The governing ODE**: the almost-sure limiting behaviour of $\beta_t$ is
+   described by the small ODE $\dot\beta = T(\beta) - \beta$.  Only fixed
+   points of this ODE (REE) are possible limit points of RLS.
+
+4. **E-stability**: the REE is the almost-sure limit of RLS if and only if
+   it is a **locally stable** fixed point of the small ODE — that is, if all
+   eigenvalues of the Jacobian $\mathcal{M} = dT/d\beta - I$ at $\beta_f$ have
+   strictly negative real parts.
+
+5. **Instability**: if any eigenvalue of $\mathcal{M}$ has positive real part,
+   $P(\beta_t \to \beta_f) = 0$ — convergence to that REE is impossible.
+
+6. **Connection to the rational learning literature**: the RLS algorithm
+   studies learning *about* a rational expectations equilibrium; it is
+   complementary to the Bayesian learning *within* an REE studied by
+   {cite}`BrayKreps1987`.
+
+## Exercises
+
+```{exercise}
+:label: ls_ex1
+
+**E-Stability and the Slope of T**
+
+Consider the scalar model with $T(\beta) = a + b\beta$.
+
+(a) Derive a formula for the unique REE $\beta_f$ in terms of $a$ and $b$.
+
+(b) Show that the small ODE $\dot\beta = T(\beta) - \beta$ is globally stable if
+and only if $b < 1$.
+
+(c) Simulate $N = 200$ paths of length $T = 500$ for $a = 1$ and each of
+$b \in \{0.3, 0.7, 0.9, 0.99\}$ (all less than 1).  Plot the cross-path
+average of $\beta_t$ for each $b$ value on the same figure.  Comment on how the
+rate of convergence changes as $b \to 1$.
+```
+
+```{solution-start} ls_ex1
+:class: dropdown
+```
+
+**(a)** The REE satisfies $\beta_f = T(\beta_f) = a + b\beta_f$, so
+
+$$
+\beta_f (1 - b) = a \implies \beta_f = \frac{a}{1-b} .
+$$
+
+**(b)** The small ODE is $\dot\beta = a + b\beta - \beta = a - (1-b)\beta$.
+This is linear with slope $-(1-b)$, so the unique fixed point $\beta_f = a/(1-b)$
+is globally stable iff $1-b > 0$, i.e., $b < 1$.
+
+**(c)**
+
+```{code-cell} ipython3
+a_ex, T_ex, N_ex = 1.0, 500, 200
+b_values = [0.3, 0.7, 0.9, 0.99]
+colors_ex = ['steelblue', 'darkorange', 'seagreen', 'purple']
+
+fig, ax = plt.subplots(figsize=(11, 5))
+for b_val, col in zip(b_values, colors_ex):
+    T_fn = lambda beta, bv=b_val: a_ex + bv * beta
+    paths = simulate_rls_scalar(T_fn, sigma_u=1.0, beta0=0.0,
+                                T_periods=T_ex, N_paths=N_ex, seed=0)
+    bf = a_ex / (1 - b_val)
+    ax.plot(np.mean(paths, axis=0), color=col, lw=2,
+            label=f'$b={b_val}$, $\\beta_f={bf:.2f}$')
+
+ax.set_xlabel('$t$')
+ax.set_ylabel('$E[\\beta_t]$')
+ax.set_title('Convergence Rate Slows as $b \\to 1$')
+ax.legend()
+plt.tight_layout()
+plt.show()
+
+print("As b → 1, the Jacobian M = b - 1 → 0, so the ODE becomes slow to")
+print("return to the fixed point.  Convergence still occurs but takes longer.")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: ls_ex2
+
+**Necessary Condition: Non-REE Limit Points**
+
+Proposition 2(i) of {cite}`MarcetSargent1989jet` states that $P(\beta_t \to \hat\beta) = 0$
+for any $\hat\beta \neq \beta_f$ in the interior.
+
+(a) Using the Bray model with $a=1$, $b=0.6$, simulate 100 paths of length
+$T = 600$ starting from $\beta_0 = 6$ (far from $\beta_f = 2.5$).  Show that
+paths still converge to $\beta_f$.
+
+(b) Now consider the **unstable** case $b = 1.5$.  Simulate 50 paths of length
+$T = 200$ starting from $\beta_0 = 0.1$ (close to the REE $\beta_f = -2$).
+Describe what happens.
+
+(c) For the unstable case, plot the phase diagram and explain geometrically why
+the paths diverge.
+```
+
+```{solution-start} ls_ex2
+:class: dropdown
+```
+
+**(a) and (b)**
+
+```{code-cell} ipython3
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+# (a) far start, stable case
+T_st = lambda beta: 1.0 + 0.6*beta
+paths_far = simulate_rls_scalar(T_st, 1.0, beta0=6.0,
+                                T_periods=600, N_paths=100, seed=1)
+ax = axes[0]
+for i in range(40):
+    ax.plot(paths_far[i], color='steelblue', alpha=0.2, lw=0.8)
+ax.plot(np.mean(paths_far, axis=0), color='navy', lw=2, label='average')
+ax.axhline(2.5, color='red', ls='--', lw=1.5, label='$\\beta_f = 2.5$')
+ax.set_title('Stable ($b=0.6$): far start still converges')
+ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$'); ax.legend()
+
+# (b) unstable case, start near REE
+T_un = lambda beta: 1.0 + 1.5*beta
+beta_f_un = 1.0 / (1 - 1.5)   # = -2
+paths_un = simulate_rls_scalar(T_un, 1.0, beta0=0.1,
+                               T_periods=200, N_paths=50, seed=2)
+ax = axes[1]
+for i in range(50):
+    ax.plot(paths_un[i], color='crimson', alpha=0.3, lw=0.8)
+ax.axhline(beta_f_un, color='black', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_un}$ (unstable)')
+ax.set_title('Unstable ($b=1.5$): diverges even near REE')
+ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$'); ax.legend()
+
+plt.tight_layout()
+plt.show()
+```
+
+**(c)** Phase diagram of the unstable case:
+
+```{code-cell} ipython3
+beta_g = np.linspace(-8, 6, 400)
+drift_un = np.array([1.0 + 1.5*b - b for b in beta_g])
+
+fig, ax = plt.subplots(figsize=(8, 4))
+ax.plot(beta_g, drift_un, color='crimson', lw=2)
+ax.axhline(0, color='black', lw=0.8)
+ax.axvline(beta_f_un, color='black', ls='--', lw=1.5,
+           label=f'$\\beta_f = {beta_f_un}$')
+ax.fill_between(beta_g, drift_un, 0, where=(drift_un > 0),
+                color='crimson', alpha=0.15)
+ax.fill_between(beta_g, drift_un, 0, where=(drift_un < 0),
+                color='steelblue', alpha=0.15)
+ax.set_xlabel('$\\beta$'); ax.set_ylabel('$T(\\beta) - \\beta$')
+ax.set_title('Phase Diagram: Unstable REE ($b=1.5$)\n'
+             'Drift points away from $\\beta_f$ everywhere')
+ax.legend()
+plt.tight_layout()
+plt.show()
+
+print("Geometrically: the slope dT/d(beta) - 1 = b - 1 = 0.5 > 0 at the REE,")
+print("so the ODE pushes beta AWAY from beta_f in both directions.")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: ls_ex3
+
+**The Present-Value Model: Effect of $\lambda$ on E-Stability**
+
+In the present-value model {eq}`eq:pv_model`, $T(\beta) = (\lambda\beta + 1)\rho$
+and the Jacobian is $\mathcal{M} = \lambda\rho - 1$.
+
+(a) For $\rho = 0.9$ and each of $\lambda \in \{0.5, 0.8, 0.95, 1.0\}$:
+    - Compute $\beta_f$ and $\mathcal{M}$
+    - Determine whether the REE is E-stable
+
+(b) For the E-stable cases, simulate 100 paths of length $T=400$ and
+plot the cross-path average against the ODE solution.
+
+(c) At $\lambda = 1$, $\mathcal{M} = \rho - 1 < 0$ (still E-stable when
+$|\rho| < 1$).  Simulate paths for this case and compare the convergence
+speed with the $\lambda = 0.5$ case.  Provide an intuitive explanation.
+```
+
+```{solution-start} ls_ex3
+:class: dropdown
+```
+
+**(a)**
+
+```{code-cell} ipython3
+rho_ex = 0.9
+lambdas = [0.5, 0.8, 0.95, 1.0]
+
+print(f"{'lambda':>8}  {'beta_f':>10}  {'M = lam*rho-1':>15}  {'E-stable':>10}")
+print("-" * 50)
+for lv in lambdas:
+    bf = rho_ex / (1 - lv * rho_ex) if abs(lv * rho_ex) < 1 else float('inf')
+    M_jac = lv * rho_ex - 1
+    estab = "YES" if M_jac < 0 else "NO"
+    print(f"{lv:>8.2f}  {bf:>10.4f}  {M_jac:>15.4f}  {estab:>10}")
+```
+
+**(b) and (c)**
+
+```{code-cell} ipython3
+fig, axes = plt.subplots(2, 2, figsize=(14, 10))
+colors_lam = ['steelblue', 'darkorange', 'seagreen', 'purple']
+
+for ax, lv, col in zip(axes.flat, lambdas, colors_lam):
+    T_fn = lambda beta, l=lv: (l * beta + 1) * rho_ex
+    ode_fn = lambda beta, l=lv: T_fn(beta, l) - beta
+    bf = rho_ex / (1 - lv * rho_ex) if abs(lv * rho_ex) < 1 else None
+
+    paths_lam = simulate_rls_scalar(T_fn, 1.0, beta0=0.0,
+                                    T_periods=400, N_paths=100, seed=3)
+    for i in range(20):
+        ax.plot(paths_lam[i], color=col, alpha=0.2, lw=0.8)
+    ax.plot(np.mean(paths_lam, axis=0), color=col, lw=2, label='RLS average')
+
+    if bf is not None:
+        # ODE solution
+        t_o, sol_o = solve_ode(ode_fn, 0.0, t_span=(0, 400), n_points=400)
+        ax.plot(t_o, sol_o, color='black', ls='--', lw=1.5, label='ODE')
+        ax.axhline(bf, color='red', ls=':', lw=1.2,
+                   label=f'$\\beta_f={bf:.2f}$')
+
+    M_jac = lv * rho_ex - 1
+    ax.set_title(f'$\\lambda={lv}$,  $\\mathcal{{M}}={M_jac:.3f}$')
+    ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+    ax.legend(fontsize=8)
+
+plt.suptitle('Present-Value Model: Convergence for Different $\\lambda$ Values',
+             y=1.02, fontsize=13)
+plt.tight_layout()
+plt.show()
+
+print("\n(c) When lambda=1, M = rho-1 ≈ -0.1 (small in absolute value).")
+print("    This means the ODE is very 'flat' near beta_f: the restoring force")
+print("    is weak and convergence is slow.  When lambda=0.5, M = -0.55,")
+print("    giving a stronger restoring force and faster convergence.")
+```
+
+```{solution-end}
+```
+
diff --git a/lectures/rational_learning_re.bib b/lectures/rational_learning_re.bib
index 585c27c1b..0eff25810 100644
--- a/lectures/rational_learning_re.bib
+++ b/lectures/rational_learning_re.bib
@@ -139,6 +139,18 @@ @article{Townsend1983
   doi     = {10.1086/261170}
 }
 
+@article{MarcetSargent1989,
+  author    = {Marcet, Albert and Sargent, Thomas J.},
+  title     = {Convergence of Least Squares Learning Mechanisms in Self-Referential Linear Stochastic Models},
+  journal   = {Journal of Economic Theory},
+  year      = {1989},
+  volume    = {48},
+  number    = {2},
+  pages     = {337--368},
+  publisher = {Elsevier},
+  doi       = {10.1016/0022-0531(89)90032-X}
+}
+
 @article{ArrowGreen1973,
   author  = {Arrow, Kenneth J. and Green, Jerry R.},
   title   = {Notes on Expectations Equilibria in Bayesian Settings},
diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index 8f3cba541..8ee4a4041 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -33,9 +33,10 @@ This lecture explores a classic question in economic theory: can agents **learn*
 {cite}`BrayKreps1987` examine this question in a rigorously specified model.
 
 In a rational expectations equilibrium, agents use market prices to make inferences about other agents' private information.
+
 Each agent knows the **statistical relationship** between prices and the underlying payoff-relevant variables — and that relationship is **correct** given the equilibrium.
 
-But this raises a deep question: where does that knowledge come from?
+But this raises a question: where does that knowledge come from?
 
 The **rational learning** approach studied by Bray and Kreps asks whether agents who start with uncertainty about the equilibrium price function can, over time, learn it from observations of past prices.
 
@@ -47,9 +48,9 @@ The key findings are:
 
 This lecture presents the Bray–Kreps framework, works through their benchmark example in detail, and provides Python code to simulate Bayesian learning dynamics.
 
-```{note}
-This lecture draws on {cite}`BrayKreps1987`, Chapter 19 in *Advances in Economic Theory* (1987), which synthesizes earlier work by {cite}`Bray1982`, {cite}`BraySavin1984`, and the rational expectations literature of {cite}`Radner1979`, {cite}`grossman1976`, and {cite}`Jordan1982`.
-```
+
+This lecture describes {cite}`BrayKreps1987`, Chapter 19 in *Arrow and the Ascent of Modern Economic Theory* (1987), which synthesizes earlier work by {cite}`Bray1982`, {cite}`BraySavin1984`, and the rational expectations literature of {cite}`Radner1979`, {cite}`grossman1976`, and {cite}`Jordan1982`.
+
 
 Let's start with the necessary imports.
 
@@ -158,6 +159,7 @@ So $U$'s task is to learn the single parameter $b$ from observations of prices a
 ### Observing the Signal
 
 At date $t$, agent $U$ observes $p_t$.
+
 The signal $U$ extracts is the return implied by the price:
 
 $$
@@ -183,6 +185,7 @@ b \mid \text{data} \sim \mathcal{N}(\mu_t, v_t)
 $$
 
 The posterior is updated using Bayes' rule.
+
 Since $p_t = b \cdot r_t$ (with $a = 0$), each pair $(r_s, p_s)$ provides the observation $p_s = b \cdot r_s$, i.e., a noisy linear measurement of $b$.
 
 For a Gaussian prior and Gaussian likelihood, the posterior updates as:
@@ -195,10 +198,11 @@ $$
 \mu_t = v_t \left( v_0^{-1} \mu_0 + \frac{1}{\sigma^2} \sum_{s=1}^{t} r_s p_s \right)
 $$ (eq:posterior_mean)
 
-```{note}
+
 Equations {eq}`eq:posterior_precision` and {eq}`eq:posterior_mean` follow from the standard Gaussian linear regression posterior.
+
 Each observation $(r_s, p_s)$ with $p_s = b r_s + 0$ is treated as a noisy signal of $b$ with signal-to-noise ratio $r_s^2 / \sigma^2$.
-```
+
 
 ### The Key Convergence Result
 
@@ -348,6 +352,7 @@ plt.show()
 ```
 
 The left panel shows that regardless of the (misspecified) prior mean, agent $U$'s posterior mean converges to the true equilibrium value $b^* = 2$.
+
 The right panel confirms that the posterior variance vanishes at rate $1/t$, consistent with the formula in {eq}`eq:posterior_precision`.
 
 ## The Demand and Equilibrium
@@ -477,6 +482,7 @@ The more subtle failure mode — identified by Bray and Kreps — arises when ag
 
 In the fully general setting, the price at date $t$ depends on $U$'s current beliefs $\mu_t$.
 But $\mu_t$ is updated based on past prices.
+
 This creates a **self-referential** system: beliefs drive prices, and prices update beliefs.
 
 {cite}`BrayKreps1987` show (their Proposition 2 and Section 5) that this feedback can lead to **non-stationary** dynamics and that convergence to the rational expectations equilibrium requires additional conditions — essentially that the economy "settles down" to a stationary relationship before agents learn the parameters of that relationship.
@@ -590,6 +596,7 @@ The main result (Proposition 3) states that even in large general-equilibrium ec
 The formal statement requires some notation.
 
 Let $\theta$ be the vector of unknown parameters of the economy (e.g., preferences, endowments), and let $\phi$ be the state space.
+
 Denote by $F_t(\cdot; \theta)$ the agents' conditional distribution function for $\theta$ at date $t$.
 
 **Theorem (Convergence to REE):**
@@ -611,12 +618,15 @@ While the positive convergence results are elegant, {cite}`BrayKreps1987` are ca
 When the economy admits multiple rational expectations equilibria, agents learning within one equilibrium may receive price signals that are informative about the *current* equilibrium but not necessarily about which equilibrium will prevail in the long run.
 
 A concrete example: suppose there are two spot market equilibria for some payoff-relevant variable $\theta$: one equilibrium at $\theta_1$ and another at $\theta_2$.
+
 The informed agents choose randomly among these each period (since they are indifferent).
+
 The uninformed agent's posterior mean can never converge to a single value — it will bounce between neighborhoods of $\theta_1$ and $\theta_2$.
 
 ### Obstacle 2: Non-Stationarity of Beliefs
 
 Even if the economy has a unique REE, if agents' beliefs are updating over time, the **realized** price process is non-stationary.
+
 In that case, past data provides **biased** information about the future.
 
 This is a **philosophical problem** with the idea of learning in equilibrium: one cannot use data generated by a learning process (in which prices depend on beliefs that are changing) to learn the *stationary* equilibrium relationship.
@@ -627,6 +637,130 @@ If $U$'s prior assigns zero probability to $b^*$ — that is, if $U$'s model is
 
 {cite}`BrayKreps1987` note (p. 622) that this is a subtle but important caveat: convergence is guaranteed only when the "true $\theta$ may lie outside the set of states $\Omega$" to which the agent's prior assigns positive probability is not the case.
 
+## Learning *Within* versus Learning *About* a Rational Expectations Equilibrium
+
+One of the deepest conceptual contributions of {cite}`BrayKreps1987` is a distinction they draw in their concluding section between two fundamentally different notions of learning in a rational expectations context.
+
+### The Distinction
+
+**Learning *within* a rational expectations equilibrium** is the subject of this lecture.
+The phrase refers to Bayesian inference that takes place *inside* a correctly specified model of the economy.
+
+The uninformed agent knows the true structural form of the price function (that it is linear, that $a = 0$), knows the true distribution of fundamentals, and entertains uncertainty only about the single unknown parameter $b^*$.
+
+Because the true $b^*$ lies in the support of agent $U$'s prior, the agent's model is **correctly specified**.
+
+The Bayesian updating rule — standard Gaussian conjugate updating — is therefore fully rationalized: it is exactly what a rational agent with a correct model would do.
+
+Convergence of beliefs to $b^*$ then follows from the standard Bayesian consistency theorem (Proposition 2 of {cite}`BrayKreps1987`).
+
+**Learning *about* a rational expectations equilibrium** is a quite different enterprise.
+Here the agent does not know the statistical relationship between prices and fundamentals, and that relationship is itself an *endogenous* object — it is determined in equilibrium by the very beliefs the agent is trying to learn.
+
+As Bray and Kreps put it (p. 601):
+
+> *"The question is whether this sequence of stationary relationships, and the equilibrium it engenders, will converge to some stationary relationship, and then agents can learn that stationary relationship long enough to hold on to their initial beliefs."*
+
+The difficulty is that during the learning phase, agents' beliefs are changing, which changes the equilibrium price function, which changes the data used to update beliefs.
+
+The learning process and the equilibrium are **simultaneously evolving**, so the data are generated by a **non-stationary** process that is itself a function of beliefs.
+
+### Why Learning *About* an REE Requires Non-Bayesian Updating
+
+This simultaneity creates a fundamental obstacle to fully rational Bayesian learning.
+
+To see why, suppose agent $U$ attempts to learn $b^*$ by treating the problem as Bayesian inference in a fixed, correctly specified model.
+
+For that to be valid, the agent would need to know:
+
+1. The true structural form of the price function (which depends on the equilibrium).
+2. The distribution of prices conditional on the unknown parameter (which also depends on the equilibrium).
+
+But both of these are themselves functions of the equilibrium that agent $U$ is trying to learn.
+
+If $U$'s beliefs at date $t$ are $\mu_t \neq b^*$, then $U$'s model of the price process is **misspecified** — the prices generated in the economy reflect other agents' optimization given the *actual* (possibly non-stationary) beliefs of $U$, not the stationary REE price function that $U$ is treating as fixed.
+
+Thus, the agent's model can be correctly specified *only if* the economy is already at the rational expectations equilibrium.
+
+But if the economy were already there, there would be nothing to learn.
+
+Bray and Kreps make this point sharply at the end of Section 5 (p. 620):
+
+> *"Note that it is unnecessary to tell U about the allocation contained in previous and current equilibria information, all the information that U could exceed the amount of information in equilibrium prices* [because] *information contained in those equilibrium prices could reflect more information than all agents put together possess.*"
+
+And in their concluding section they observe that the rational-learning model is:
+
+> *"...concerned with learning* within *and learning* about *an equilibrium, and then the sense of* rational learning within *... is equivalent to* rational learning about *in some sense other than as formally equivalent to* rational expectations equilibrium."
+
+The distinction is that learning *within* an REE — our Bayesian model above — is consistent with full rationality because the agent's model is correct.
+
+Learning *about* an REE, by contrast, requires the agent to use data generated by a **non-stationary** process as if it were generated by a stationary REE, which is a form of model misspecification that cannot be rationalized as Bayesian updating with a correct prior.
+
+### The Role of "Irrational" Learning Algorithms
+
+This explains why the literature on learning *about* rational expectations equilibria — going back to {cite}`Bray1982` and {cite}`BraySavin1984`, and extended in the influential work of {cite}`MarcetSargent1989` — tends to rely on **ordinary least squares (OLS)** or other adaptive algorithms rather than Bayes' rule.
+
+```{note}
+{cite}`MarcetSargent1989` use some theorems about stochastic approximation to extend some of Bray and 
+Savin's results to other settings.
+```
+
+In those models, agent $U$ runs a regression of observed prices on observed fundamentals, updating the estimated coefficient as new data arrive.
+
+OLS is consistent and computationally tractable, but it is *not* the optimal rule for an agent who knows the true data-generating process.
+
+It is, as Bray and Kreps call it, a form of **"irrational" learning** — rational in the limited sense of using past data intelligently, but not derivable from Bayes' theorem applied to a correctly specified model.
+
+An OLS learner implicitly assumes the data-generating process is stationary — that the relationship between prices and fundamentals is the same in every period.
+
+But during the learning transition, it is not: the price function shifts as beliefs shift.
+
+OLS ignores this, treating past and present observations as exchangeable draws from a fixed distribution.
+
+This is a misspecification, and the resulting estimates are biased in finite samples, even if they converge in the long run.
+
+Bray and Kreps note (pp. 598–599) that in the models studied by {cite}`Bray1982` and {cite}`BraySavin1984`:
+
+> *"Agents are doing Bayesian updating, but their model is, almost by construction, wrong — they are learning as if the environment were stationary when it is not."*
+
+There is a fundamental **epistemic tension** at the heart of learning about rational expectations equilibria:
+
+* A fully rational (Bayesian, correctly specified) learner can only apply Bayes' rule to a model whose structure is *already known* — but the structure of the REE is exactly what the agent is trying to learn.
+* A learner who uses an adaptive algorithm (OLS, least-mean-squares, etc.) can potentially converge to the REE, but only by using a rule that cannot be derived from Bayesian rationality applied to a correctly specified model.
+
+The benchmark model in this lecture avoids this tension by assumption: agent $U$ knows the structural form of the price function and needs only to learn one parameter.
+
+That is learning *within* an REE — a clean, tractable, and fully rational exercise — but it is also a special case that sidesteps the deeper difficulty of learning *about* an REE from scratch.
+
+
+## Summary
+
+This lecture has discussed ideas from {cite}`BrayKreps1987`:
+
+1. **Rational expectations equilibria** require agents to know the statistical relationship between prices and fundamentals — but this knowledge is typically assumed, not derived.
+
+2. **Rational learning** asks whether Bayesian agents can *learn* the REE from data.
+   In a benchmark linear model, the answer is yes: the uninformed agent's posterior on the slope parameter $b^*$ converges almost surely to the truth.
+
+3. The convergence relies on **Bayesian consistency** — the uninformed agent accumulates sufficient information to identify $b^*$ from observed prices and returns.
+
+4. Convergence can **fail** when:
+   - There are **multiple equilibria** and agents' learning rules interact with equilibrium selection.
+   - The agent's **model is misspecified** (prior assigns zero weight to the truth).
+   - The learning process generates **non-stationary** prices that contaminate inference.
+
+5. A **general convergence theorem** guarantees that under correct specification and unique equilibria, Bayesian posteriors converge weakly to a point mass at the truth.
+
+6. **Learning *within* versus *about* an REE** is a crucial distinction.
+   The benchmark model in this lecture exemplifies learning *within* an REE: agent $U$ knows the structural form of the price function and uses a correctly specified Bayesian model.
+
+   Learning *about* an REE — where the equilibrium price function is itself the unknown object — is fundamentally harder, because the data-generating process shifts as beliefs shift.
+
+   This non-stationarity means that learning *about* an REE cannot in general be rationalized as Bayes' rule applied to a correctly specified model, which is why the literature on this topic relies on adaptive algorithms such as OLS rather than fully Bayesian updating.
+
+The broader message of Bray and Kreps is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle — and the conditions under which learning succeeds are more restrictive than they might appear.
+
+
 ## Exercises
 
 ```{exercise}
@@ -893,23 +1027,3 @@ print("mixture, leading to noisier information accumulation.")
 
 ```{solution-end}
 ```
-
-## Summary
-
-This lecture has covered the following key ideas from {cite}`BrayKreps1987`:
-
-1. **Rational expectations equilibria** require agents to know the statistical relationship between prices and fundamentals — but this knowledge is typically assumed, not derived.
-
-2. **Rational learning** asks whether Bayesian agents can *learn* the REE from data.
-   In a benchmark linear model, the answer is yes: the uninformed agent's posterior on the slope parameter $b^*$ converges almost surely to the truth.
-
-3. The convergence relies on **Bayesian consistency** — the uninformed agent accumulates sufficient information to identify $b^*$ from observed prices and returns.
-
-4. Convergence can **fail** when:
-   - There are **multiple equilibria** and agents' learning rules interact with equilibrium selection.
-   - The agent's **model is misspecified** (prior assigns zero weight to the truth).
-   - The learning process generates **non-stationary** prices that contaminate inference.
-
-5. A **general convergence theorem** guarantees that under correct specification and unique equilibria, Bayesian posteriors converge weakly to a point mass at the truth.
-
-The broader message of Bray and Kreps is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle — and the conditions under which learning succeeds are more restrictive than they might appear.

From fb47870d67e6ccb6d5794d2187fa1cd1fb80b3fd Mon Sep 17 00:00:00 2001
From: thomassargent30 <ts43@nyu.edu>
Date: Sun, 10 May 2026 14:48:30 -0400
Subject: [PATCH 03/25] Tom's addition of a lecture

---
 lectures/_config.yml                |    1 +
 lectures/_static/quant-econ.bib     |  106 +++
 lectures/_toc.yml                   |    1 +
 lectures/long_run_risk_operator.bib |  131 +++
 lectures/long_run_risk_operator.md  | 1316 +++++++++++++++++++++++++++
 5 files changed, 1555 insertions(+)
 create mode 100644 lectures/long_run_risk_operator.bib
 create mode 100644 lectures/long_run_risk_operator.md

diff --git a/lectures/_config.yml b/lectures/_config.yml
index 92c4f0129..ef184ed1e 100644
--- a/lectures/_config.yml
+++ b/lectures/_config.yml
@@ -23,6 +23,7 @@ execute:
 
 bibtex_bibfiles:
    - _static/quant-econ.bib
+   - long_run_risk_operator.bib
 
 html:
   baseurl: https://python.quantecon.org/
diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 414cde9ae..24f480231 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -4037,3 +4037,109 @@ @article{Kobayashi1977
   number  = {1},
   pages   = {32--43}
 }
+
+@article{HansenHeatonLi2008,
+  author    = {Hansen, Lars Peter and Heaton, John C. and Li, Nan},
+  title     = {Consumption Strikes Back? Measuring Long-Run Risk},
+  journal   = {Journal of Political Economy},
+  year      = {2008},
+  volume    = {116},
+  number    = {2},
+  pages     = {260--302},
+  doi       = {10.1086/588200}
+}
+
+@article{BansalYaron2004,
+  author    = {Bansal, Ravi and Yaron, Amir},
+  title     = {Risks for the Long Run: A Potential Resolution of Asset
+               Pricing Puzzles},
+  journal   = {Journal of Finance},
+  year      = {2004},
+  volume    = {59},
+  number    = {4},
+  pages     = {1481--1509},
+  doi       = {10.1111/j.1540-6261.2004.00670.x}
+}
+
+@article{Breeden1979,
+  author    = {Breeden, Douglas T.},
+  title     = {An Intertemporal Asset Pricing Model with Stochastic
+               Consumption and Investment Opportunities},
+  journal   = {Journal of Financial Economics},
+  year      = {1979},
+  volume    = {7},
+  number    = {3},
+  pages     = {265--296},
+  doi       = {10.1016/0304-405X(79)90016-3}
+}
+
+@article{KrepsPorteus1978,
+  author    = {Kreps, David M. and Porteus, Evan L.},
+  title     = {Temporal Resolution of Uncertainty and Dynamic Choice Theory},
+  journal   = {Econometrica},
+  year      = {1978},
+  volume    = {46},
+  number    = {1},
+  pages     = {185--200},
+  doi       = {10.2307/1913656}
+}
+
+@article{EpsteinZin1989,
+  author    = {Epstein, Larry G. and Zin, Stanley E.},
+  title     = {Substitution, Risk Aversion, and the Temporal Behavior of
+               Consumption and Asset Returns: A Theoretical Framework},
+  journal   = {Econometrica},
+  year      = {1989},
+  volume    = {57},
+  number    = {4},
+  pages     = {937--969},
+  doi       = {10.2307/1913778}
+}
+
+@article{AndersonHansenSargent2003,
+  author    = {Anderson, Evan W. and Hansen, Lars Peter and Sargent, Thomas J.},
+  title     = {A Quartet of Semigroups for Model Specification, Robustness,
+               Prices of Risk, and Model Detection},
+  journal   = {Journal of the European Economic Association},
+  year      = {2003},
+  volume    = {1},
+  number    = {1},
+  pages     = {68--123},
+  doi       = {10.1162/154247603322256774}
+}
+
+@article{KontoyiannisM2003,
+  author    = {Kontoyiannis, Ioannis and Meyn, Sean P.},
+  title     = {Spectral Theory and Limit Theorems for Geometrically
+               Ergodic {Markov} Processes},
+  journal   = {Annals of Applied Probability},
+  year      = {2003},
+  volume    = {13},
+  number    = {1},
+  pages     = {304--362},
+  doi       = {10.1214/aoap/1042765670}
+}
+
+@article{HansenScheinkman1995,
+  author    = {Hansen, Lars Peter and Scheinkman, Jos{\'e} A.},
+  title     = {Back to the Future: Generating Moment Implications for
+               Continuous-Time {Markov} Processes},
+  journal   = {Econometrica},
+  year      = {1995},
+  volume    = {63},
+  number    = {4},
+  pages     = {767--804},
+  doi       = {10.2307/2171800}
+}
+
+@article{LettauWachter2007,
+  author    = {Lettau, Martin and Wachter, Jessica A.},
+  title     = {Why Is Long-Horizon Equity Less Risky? {A} Duration-Based
+               Explanation of the Value Premium},
+  journal   = {Journal of Finance},
+  year      = {2007},
+  volume    = {62},
+  number    = {1},
+  pages     = {55--92},
+  doi       = {10.1111/j.1540-6261.2007.01203.x}
+}
diff --git a/lectures/_toc.yml b/lectures/_toc.yml
index 74a694c97..7ba292779 100644
--- a/lectures/_toc.yml
+++ b/lectures/_toc.yml
@@ -145,6 +145,7 @@ parts:
   - file: ls_learning
   - file: affine_risk_prices
   - file: ross_recovery
+  - file: long_run_risk_operator
   - file: misspecified_recovery
 - caption: Data and Empirics
   numbered: true
diff --git a/lectures/long_run_risk_operator.bib b/lectures/long_run_risk_operator.bib
new file mode 100644
index 000000000..64bd0f101
--- /dev/null
+++ b/lectures/long_run_risk_operator.bib
@@ -0,0 +1,131 @@
+% BibTeX references for long_run_risk_operator.md
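+% Intended for references NOT already in quant-econ.bib. NOTE: every entry after AlvarezJermann2005 duplicates a key also added to _static/quant-econ.bib; keep only one copy to avoid duplicate-citation-key warnings.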
+
+@article{HansenScheinkman2009,
+  author    = {Hansen, Lars Peter and Scheinkman, Jos{\'e} A.},
+  title     = {Long-Term Risk: An Operator Approach},
+  journal   = {Econometrica},
+  year      = {2009},
+  volume    = {77},
+  number    = {1},
+  pages     = {177--234},
+  doi       = {10.3982/ECTA6761}
+}
+
+@article{AlvarezJermann2005,
+  author    = {Alvarez, Fernando and Jermann, Urban J.},
+  title     = {Using Asset Prices to Measure the Persistence in the
+               Marginal Utility of Wealth},
+  journal   = {Econometrica},
+  year      = {2005},
+  volume    = {73},
+  number    = {6},
+  pages     = {1977--2016},
+  doi       = {10.1111/j.1468-0262.2005.00643.x}
+}
+
+@article{HansenHeatonLi2008,
+  author    = {Hansen, Lars Peter and Heaton, John C. and Li, Nan},
+  title     = {Consumption Strikes Back? Measuring Long-Run Risk},
+  journal   = {Journal of Political Economy},
+  year      = {2008},
+  volume    = {116},
+  number    = {2},
+  pages     = {260--302},
+  doi       = {10.1086/588200}
+}
+
+@article{BansalYaron2004,
+  author    = {Bansal, Ravi and Yaron, Amir},
+  title     = {Risks for the Long Run: A Potential Resolution of Asset
+               Pricing Puzzles},
+  journal   = {Journal of Finance},
+  year      = {2004},
+  volume    = {59},
+  number    = {4},
+  pages     = {1481--1509},
+  doi       = {10.1111/j.1540-6261.2004.00670.x}
+}
+
+@article{Breeden1979,
+  author    = {Breeden, Douglas T.},
+  title     = {An Intertemporal Asset Pricing Model with Stochastic
+               Consumption and Investment Opportunities},
+  journal   = {Journal of Financial Economics},
+  year      = {1979},
+  volume    = {7},
+  number    = {3},
+  pages     = {265--296},
+  doi       = {10.1016/0304-405X(79)90016-3}
+}
+
+@article{KrepsPorteus1978,
+  author    = {Kreps, David M. and Porteus, Evan L.},
+  title     = {Temporal Resolution of Uncertainty and Dynamic Choice Theory},
+  journal   = {Econometrica},
+  year      = {1978},
+  volume    = {46},
+  number    = {1},
+  pages     = {185--200},
+  doi       = {10.2307/1913656}
+}
+
+@article{EpsteinZin1989,
+  author    = {Epstein, Larry G. and Zin, Stanley E.},
+  title     = {Substitution, Risk Aversion, and the Temporal Behavior of
+               Consumption and Asset Returns: A Theoretical Framework},
+  journal   = {Econometrica},
+  year      = {1989},
+  volume    = {57},
+  number    = {4},
+  pages     = {937--969},
+  doi       = {10.2307/1913778}
+}
+
+@article{AndersonHansenSargent2003,
+  author    = {Anderson, Evan W. and Hansen, Lars Peter and Sargent, Thomas J.},
+  title     = {A Quartet of Semigroups for Model Specification, Robustness,
+               Prices of Risk, and Model Detection},
+  journal   = {Journal of the European Economic Association},
+  year      = {2003},
+  volume    = {1},
+  number    = {1},
+  pages     = {68--123},
+  doi       = {10.1162/154247603322256774}
+}
+
+@article{KontoyiannisM2003,
+  author    = {Kontoyiannis, Ioannis and Meyn, Sean P.},
+  title     = {Spectral Theory and Limit Theorems for Geometrically
+               Ergodic {Markov} Processes},
+  journal   = {Annals of Applied Probability},
+  year      = {2003},
+  volume    = {13},
+  number    = {1},
+  pages     = {304--362},
+  doi       = {10.1214/aoap/1042765670}
+}
+
+@article{HansenScheinkman1995,
+  author    = {Hansen, Lars Peter and Scheinkman, Jos{\'e} A.},
+  title     = {Back to the Future: Generating Moment Implications for
+               Continuous-Time {Markov} Processes},
+  journal   = {Econometrica},
+  year      = {1995},
+  volume    = {63},
+  number    = {4},
+  pages     = {767--804},
+  doi       = {10.2307/2171800}
+}
+
+@article{LettauWachter2007,
+  author    = {Lettau, Martin and Wachter, Jessica A.},
+  title     = {Why Is Long-Horizon Equity Less Risky? {A} Duration-Based
+               Explanation of the Value Premium},
+  journal   = {Journal of Finance},
+  year      = {2007},
+  volume    = {62},
+  number    = {1},
+  pages     = {55--92},
+  doi       = {10.1111/j.1540-6261.2007.01203.x}
+}
diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
new file mode 100644
index 000000000..d098eeb7e
--- /dev/null
+++ b/lectures/long_run_risk_operator.md
@@ -0,0 +1,1316 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.17.1
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+(long_run_risk_operator)=
+```{raw} jupyter
+<div id="qe-notebook-header" align="right" style="text-align:right;">
+        <a href="https://quantecon.org/" title="quantecon.org">
+                <img style="width:250px;display:inline;" width="250px" src="https://assets.quantecon.org/img/qe-menubar-logo.svg" alt="QuantEcon">
+        </a>
+</div>
+```
+
+# Long-Run Risk: An Operator Approach
+
+```{contents} Contents
+:depth: 2
+```
+
+## Overview
+
+This lecture presents key ideas from {cite}`HansenScheinkman2009`, which develops
+an analytical structure that reveals the long-run risk-return relationship for
+nonlinear continuous-time Markov environments.
+
+The core insight is that to understand how risky assets are priced over *long*
+horizons — not just instantaneously — we need tools that reach beyond local
+stochastic calculus.
+
+The paper's main device is a **multiplicative
+decomposition** of a positive stochastic process $\{M_t\}$ into three
+components:
+
+$$
+M_t = e^{\rho t} \hat{M}_t \frac{\phi(X_0)}{\phi(X_t)}
+$$
+
+where
+
+- $e^{\rho t}$ is a deterministic exponential trend governed by an **eigenvalue** $\rho$,
+- $\hat{M}_t$ is a **martingale** that encodes a change of probability measure, and
+- $\phi(X_0)/\phi(X_t)$ is a **transient** (stationary) component built from
+  the **principal eigenfunction** $\phi$ of an operator associated with $M$.
+
+This factorization is the continuous-time, nonlinear Markov generalization of
+the Perron–Frobenius theorem for positive matrices, and it plays the same role
+that the dominant eigenvalue plays in linear systems: it governs long-run
+growth rates.
+
+**What you will learn:**
+
+- What a *multiplicative functional* is and why semigroups arise naturally in
+  asset pricing.
+- How to find the *principal eigenfunction* $\phi$ and eigenvalue $\rho$ for
+  a given semigroup.
+- How the eigenvalue $\rho$ encodes long-run risk-adjusted discount rates.
+- How to compute these objects numerically for a finite-state Markov chain and
+  for a continuous diffusion.
+- How the long-run risk-return trade-off differs from its familiar short-run
+  (local) counterpart.
+
+```{note}
+This lecture focuses on discrete-state and affine (Gaussian / square-root)
+continuous-state examples that admit closed-form or easily-computed
+eigenfunctions. 
+
+The general theory in {cite}`HansenScheinkman2009` handles
+far more general nonlinear Markov environments.
+```
+
+Let's start by importing the Python tools we will use.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.linalg import eig, expm
+from scipy.optimize import fsolve
+import warnings
+warnings.filterwarnings('ignore')
+
+plt.rcParams.update({'figure.figsize': (10, 6), 'font.size': 12})
+```
+
+## Multiplicative Functionals and Semigroups
+
+### The Asset-Pricing Setup
+
+Fix a continuous-time Markov process $\{X_t : t \ge 0\}$ on a state space
+$\mathcal{D}_0 \subset \mathbb{R}^n$.  A **stochastic discount factor (SDF)**
+process $\{S_t : t \ge 0\}$ prices all assets: the date-$0$ price of a
+payoff $\Pi_t$ at date $t$ is
+
+$$
+E[S_t \Pi_t \mid \mathcal{F}_0].
+$$
+
+The key structural property of $S$ is **temporal consistency**: if we may
+trade at an intermediate date $\tau \le t$, the date-$\tau$ price of the
+payoff $\Pi_t$ must equal
+
+$$
+E\!\left[\frac{S_t}{S_\tau} \Pi_t \;\Big|\; \mathcal{F}_\tau\right].
+$$
+
+When prices depend only on the current Markov state, this temporal
+consistency forces $S$ to satisfy a **multiplicative property**.
+
+### Multiplicative Functionals
+
+```{admonition} Definition (Multiplicative Functional)
+A functional $\{M_t : t \ge 0\}$ adapted to the filtration generated by $X$
+is **multiplicative** if $M_0 = 1$ and
+
+$$
+M_{t+u} = M_u(\theta_t) \cdot M_t \qquad \forall\, t, u \ge 0,
+$$
+
+where $\theta_t$ is the shift operator on the Markov process.
+```
+
+Equivalently, if $A_t = \log M_t$ then $A$ is **additive**:
+$A_0 = 0$ and $A_{t+u} = A_u(\theta_t) + A_t$.
+
+For a Markov process driven by a Brownian motion $B$ and by jumps with compensator $\eta$, the
+general additive functional takes the form
+
+$$
+A_t = \int_0^t \beta(X_u)\,du + \int_0^t \gamma(X_{u-})'\,dB_u
+      + \sum_{0 \le u \le t} \kappa(X_u, X_{u-}),
+$$
+
+so $M_t = e^{A_t}$ is parameterized by the triple $(\beta, \gamma, \kappa)$.
+
+### The Semigroup
+
+Given a multiplicative functional $M$, the family of operators
+
+$$
+\mathbb{M}_t \psi(x) = E[M_t \psi(X_t) \mid X_0 = x]
+$$
+
+forms a **semigroup**: $\mathbb{M}_0 = \mathbb{I}$ and
+$\mathbb{M}_{t+u} = \mathbb{M}_t \mathbb{M}_u$.
+
+The semigroup property manifests itself here as the **iterated-values property** in
+asset pricing — it holds because of frictionless trading at intermediate
+dates.
+
+Table I of {cite}`HansenScheinkman2009` lists four important semigroups:
+
+| Object | Multiplicative Functional | Semigroup |
+|---|---|---|
+| Stochastic discount factor | $S$ | $\{\mathbb{S}_t\}$ |
+| Cumulated return | $V$ | $\{\mathbb{V}_t\}$ |
+| Stochastic growth | $G$ | $\{\mathbb{G}_t\}$ |
+| Valuation with growth | $Q = GS$ | $\{\mathbb{Q}_t\}$ |
+
+## The Generator and Its Eigenvalue Problem
+
+### The Extended Generator
+
+The **extended generator** $\mathbb{A}$ of $M$ is defined by: a Borel
+function $\psi$ belongs to the domain of $\mathbb{A}$ if there exists a
+Borel function $\chi$ such that
+
+$$
+N_t = M_t \psi(X_t) - \psi(X_0) - \int_0^t M_s \chi(X_s)\,ds
+$$
+
+is a local martingale.  We then write $\chi = \mathbb{A}\psi$.
+
+For a process $X$ with jump measure $\eta$, drift $\xi$, and diffusion matrix $\Gamma$, and a multiplicative
+functional $M$ parameterized by $(\beta, \gamma, \kappa)$, the generator
+takes the form
+
+$$
+\mathbb{A}\phi(x)
+= \underbrace{\left[\xi(x) + \Gamma(x)\gamma(x)\right] \cdot
+              \frac{\partial \phi(x)}{\partial x}}_{\text{drift (twisted)}}
++ \underbrace{\frac{1}{2}
+  \operatorname{tr}\!\left[\Sigma(x)\frac{\partial^2\phi(x)}{\partial x \partial x'}\right]}_{\text{diffusion}}
++ \underbrace{\int [\phi(y) - \phi(x)]\,
+              e^{\kappa(y,x)}\eta(dy \mid x)}_{\text{jumps}}
++ \underbrace{\left[\beta(x) + \frac{|\gamma(x)|^2}{2}
+  + \int (e^{\kappa(y,x)} - 1)\,\eta(dy\mid x)\right]\phi(x)}_{\text{level}}
+$$
+
+where $\Sigma = \Gamma\Gamma'$.
+
+### The Principal Eigenvalue Problem
+
+```{admonition} Definition (Principal Eigenfunction)
+A strictly positive Borel function $\phi$ is a **principal eigenfunction**
+of $\mathbb{A}$ with **eigenvalue** $\rho$ if
+
+$$
+\mathbb{A}\phi = \rho\,\phi.
+$$
+```
+
+This is a key equation.
+
+Equivalently (and more computationally useful),
+$\phi$ solves the **principal eigenvalue problem** for the semigroup:
+
+$$
+\mathbb{M}_t \phi = e^{\rho t} \phi \qquad \forall\, t \ge 0.
+$$
+
+### The Multiplicative Decomposition
+
+Once we have a principal eigenfunction $\phi$ with eigenvalue $\rho$, we
+obtain the **multiplicative factorization** {cite}`HansenScheinkman2009`:
+
+$$
+M_t = e^{\rho t} \,\hat{M}_t\, \frac{\phi(X_0)}{\phi(X_t)},
+$$
+
+where the **martingale component** is
+
+$$
+\hat{M}_t = e^{-\rho t} M_t \frac{\phi(X_t)}{\phi(X_0)}.
+$$
+
+```{note}
+{cite}`AlvarezJermann2005` proposed a multiplicative decomposition of the SDF into a permanent martingale component and a transitory component. 
+{cite}`HansenScheinkman2009` established the connection to principal
+eigenfunctions and proved existence and uniqueness results.
+```
+
+### Long-Run Dominance
+
+Proposition 7.1 of {cite}`HansenScheinkman2009` establishes that, under
+appropriate stability conditions,
+
+$$
+\lim_{t\to\infty} e^{-\rho t} \mathbb{M}_t \psi
+= \phi \int \frac{\psi}{\phi}\,d\hat{\varsigma},
+$$
+
+where $\hat{\varsigma}$ is the stationary distribution of the **twisted**
+(i.e., $\hat{M}$-distorted) Markov process.
+
+This is the long-run counterpart of the Perron–Frobenius theorem: $\rho$
+governs the exponential growth (or decay) rate of the semigroup, and $\phi$
+determines the limiting state dependence.
+
+## Finite-State Markov Chain: The Matrix Case
+
+The continuous-time theory is cleanest when the state space is finite.
+This section works through the finite-state case in detail — it is exactly
+the Perron–Frobenius theorem for non-negative matrices.
+
+### Intensity Matrix and Multiplicative Functional
+
+Let $X$ be a continuous-time Markov chain with $N$ states
+$\{x_1, \ldots, x_N\}$ and **intensity matrix** $\mathbb{U}$
+(with $u_{ij} \ge 0$ for $i \ne j$ and $u_{ii} = -\sum_{j \ne i} u_{ij}$).
+
+A multiplicative functional is parameterized by
+
+- a decay rate $\beta_i \ge 0$ in state $x_i$, and
+- a jump scaling $e^{\kappa(x_j, x_i)}$ when jumping from $x_i$ to $x_j$.
+
+The **generator matrix** $\mathbb{A}$ for the multiplicative semigroup has
+entries
+
+$$
+a_{ij} = \begin{cases}
+u_{ii} - \beta_i & \text{if } i = j, \\
+u_{ij}\, e^{\kappa(x_j,\, x_i)} & \text{if } i \ne j.
+\end{cases}
+$$
+
+The semigroup is $\mathbb{M}_t = e^{t\mathbb{A}}$ (matrix exponential).
+
+### Finding the Principal Eigenvalue
+
+The principal eigenvalue $\rho$ is the **largest real eigenvalue** of
+$\mathbb{A}$, and the principal eigenfunction $\phi$ is the corresponding
+strictly positive (Perron) eigenvector.
+
+```{code-cell} ipython3
+def build_generator(U, beta, kappa_mat):
+    """
+    Assemble the generator matrix A of the multiplicative semigroup.
+
+    Off the diagonal, A[i, j] = U[i, j] * exp(kappa_mat[j, i]);
+    on the diagonal, A[i, i] = U[i, i] - beta[i].
+
+    Parameters
+    ----------
+    U : (N, N) array — intensity matrix of X
+    beta : (N,) array — rate subtracted from the diagonal in each state
+    kappa_mat : (N, N) array — kappa_mat[j, i] = kappa(x_j, x_i)
+
+    Returns
+    -------
+    A : (N, N) generator matrix
+    """
+    n_states = U.shape[0]
+    A = np.zeros((n_states, n_states))
+    for row, col in np.ndindex(n_states, n_states):
+        A[row, col] = (U[row, col] * np.exp(kappa_mat[col, row])
+                       if row != col else U[row, row] - beta[row])
+    return A
+
+
+def principal_eigen(A):
+    """
+    Extract the principal (Perron) eigenpair of a generator matrix.
+
+    Parameters
+    ----------
+    A : (N, N) array
+
+    Returns
+    -------
+    rho : float — largest eigenvalue among those that are numerically real
+    phi : (N,) array — matching eigenvector, negated if any entry is
+        negative and then divided by its largest entry
+    """
+    spectrum, vectors = eig(A)
+
+    # Retain only eigenpairs whose imaginary part is below the tolerance
+    is_real = np.abs(spectrum.imag) < 1e-10
+    candidates = spectrum[is_real].real
+
+    # Position of the dominant eigenvalue among the retained ones
+    top = np.argmax(candidates)
+    rho = candidates[top]
+    phi = vectors[:, is_real].real[:, top]
+
+    # Fix the sign, then rescale by the largest entry
+    phi = -phi if phi.min() < 0 else phi
+    phi = phi / phi.max()
+    return rho, phi
+```
+
+### A Two-State Example: Boom and Recession
+
+Consider an economy that alternates between a **boom** state ($x_1$) and a
+**recession** state ($x_2$).
+
+```{code-cell} ipython3
+# Intensity matrix: boom <-> recession
+# Expected duration of boom = 1/lambda_1, recession = 1/lambda_2
+lambda_1 = 0.3   # rate of leaving boom
+lambda_2 = 0.5   # rate of leaving recession
+
+U = np.array([[-lambda_1,  lambda_1],
+              [ lambda_2, -lambda_2]])
+
+# Stochastic discount factor parameters
+# Higher discount rate in boom (asset prices high, SDF low)
+beta = np.array([0.05, 0.02])   # per-unit-time decay
+
+# No jump scaling in this example
+kappa_mat = np.zeros((2, 2))
+
+A = build_generator(U, beta, kappa_mat)
+rho, phi = principal_eigen(A)
+
+print("Generator matrix A:")
+print(np.round(A, 4))
+print(f"\nPrincipal eigenvalue ρ = {rho:.6f}")
+print(f"Principal eigenfunction φ = {phi}")
+print(f"\nInterpretation: long-run SDF decay rate = {rho:.4f} per unit time")
+```
+
+```{code-cell} ipython3
+# Verify: M_t φ = exp(ρt) φ  for t = 1, 2, 5
+for t in [1.0, 2.0, 5.0]:
+    Mt = expm(t * A)     # semigroup at time t
+    lhs = Mt @ phi
+    rhs = np.exp(rho * t) * phi
+    print(f"t={t}: max |M_t φ - exp(ρt)φ| = {np.max(np.abs(lhs - rhs)):.2e}")
+```
+
+```{code-cell} ipython3
+# Show long-run dominance: exp(-ρt) M_t ψ → φ ∫(ψ/φ) dς̂
+# for any ψ
+
+# Compute twisted stationary distribution ς̂ via M̂_t = exp(-ρt) M_t φ(X_t)/φ(X_0)
+# The generator of M̂ is: Â_ij = (1/φ_i) A_ij φ_j  (similarity transform)
+phi_diag_inv = np.diag(1.0 / phi)
+phi_diag     = np.diag(phi)
+A_hat = phi_diag_inv @ A @ phi_diag - rho * np.eye(2)
+
+# Stationary distribution of Â: solve π A_hat = 0, sum π = 1
+# (left eigenvector corresponding to eigenvalue 0)
+evals, evecs = eig(A_hat.T)
+idx0 = np.argmin(np.abs(evals.real))
+varsigma_hat = evecs[:, idx0].real
+varsigma_hat = np.abs(varsigma_hat) / np.abs(varsigma_hat).sum()
+
+print("Twisted stationary distribution ς̂:")
+print(f"  Boom:      {varsigma_hat[0]:.4f}")
+print(f"  Recession: {varsigma_hat[1]:.4f}")
+
+# Test convergence for ψ = [1, 2]
+psi = np.array([1.0, 2.0])
+limit_theoretical = phi * np.sum((psi / phi) * varsigma_hat)
+
+for t in [5, 20, 50, 100]:
+    Mt = expm(t * A)
+    approx = np.exp(-rho * t) * Mt @ psi
+    print(f"t={t:3d}: exp(-ρt)M_t ψ = {approx}, theoretical limit = {limit_theoretical}")
+```
+
+### Impact of Jump Scaling
+
+Now introduce jump scaling: the SDF **jumps up** (positive surprise) when
+transitioning from recession to boom, and jumps down otherwise.
+
+```{code-cell} ipython3
+# kappa_mat[j, i] = kappa(x_j, x_i): jump when going from state i to state j
+kappa_mat2 = np.array([[0.0,  0.3],   # boom <- recession: positive jump
+                        [-0.2, 0.0]]) # recession <- boom: negative jump
+
+A2 = build_generator(U, beta, kappa_mat2)
+rho2, phi2 = principal_eigen(A2)
+
+print(f"Without jumps: ρ = {rho:.6f}")
+print(f"With jumps:    ρ = {rho2:.6f}")
+print(f"\nPrincipal eigenfunctions:")
+print(f"  φ (no jumps):   {phi}")
+print(f"  φ (with jumps): {phi2}")
+```
+
+```{code-cell} ipython3
+# Plot how ρ varies with jump size κ_21 (recession -> boom jump)
+kappa_values = np.linspace(-0.5, 0.5, 100)
+rho_values = []
+
+for kappa_21 in kappa_values:
+    kmat = np.array([[0.0, kappa_21],
+                     [-0.2, 0.0]])
+    Ak = build_generator(U, beta, kmat)
+    rk, _ = principal_eigen(Ak)
+    rho_values.append(rk)
+
+fig, ax = plt.subplots()
+ax.plot(kappa_values, rho_values, 'b-', lw=2)
+ax.axhline(rho, color='r', ls='--', label=f'ρ (no jumps) = {rho:.4f}')
+ax.axvline(0, color='k', ls=':', lw=0.8)
+ax.set_xlabel('Jump size κ (recession → boom)')
+ax.set_ylabel('Principal eigenvalue ρ')
+ax.set_title('Long-run growth rate as a function of jump scaling')
+ax.legend()
+plt.tight_layout()
+plt.show()
+```
+
+## Continuous-State Affine Example
+
+{cite}`HansenScheinkman2009` present a Markov diffusion with two components:
+a **Feller square-root process** $X^f$ (stochastic volatility) and an
+**Ornstein–Uhlenbeck process** $X^o$ (predictable growth).
+
+### State Dynamics
+
+$$
+dX^f_t = \xi_f(\bar{x}_f - X^f_t)\,dt + \sqrt{X^f_t}\,\sigma_f\,dB^f_t,
+\qquad dX^o_t = \xi_o(\bar{x}_o - X^o_t)\,dt + \sigma_o\,dB^o_t.
+$$
+
+Parameter restrictions: $\xi_f, \bar{x}_f > 0$, $2\xi_f \bar{x}_f \ge \sigma_f^2$.
+
+### Multiplicative Functional
+
+Consider a multiplicative functional $M = e^A$ with additive functional
+
+$$
+A_t = \bar{\beta} t
+     + \int_0^t \beta_f X^f_s\,ds
+     + \int_0^t \beta_o X^o_s\,ds
+     + \int_0^t \sqrt{X^f_s}\,\gamma_f\,dB^f_s
+     + \int_0^t \gamma_o\,dB^o_s.
+$$
+
+### Affine Eigenfunction
+
+Guess an eigenfunction of the form $\phi(x^f, x^o) = e^{c_f x^f + c_o x^o}$.
+
+Substituting into the eigenvalue equation $\mathbb{A}\phi = \rho\phi$ and
+collecting coefficients yields two equations:
+
+**Coefficient of $x^f$:**
+
+$$
+0 = \beta_f + \frac{\gamma_f^2}{2} + c_f(\gamma_f \sigma_f - \xi_f)
+    + c_f^2 \frac{\sigma_f^2}{2}
+$$
+
+This is a **quadratic** in $c_f$:
+
+$$
+c_f = \frac{(\xi_f - \gamma_f\sigma_f) \pm
+\sqrt{(\xi_f - \gamma_f\sigma_f)^2 - \sigma_f^2(2\beta_f + \gamma_f^2)}}{\sigma_f^2}
+$$
+
+**Coefficient of $x^o$:**
+
+$$
+0 = \beta_o - c_o \xi_o \implies c_o = \frac{\beta_o}{\xi_o}
+$$
+
+**Eigenvalue:**
+
+$$
+\rho = \bar{\beta} + \frac{\gamma_o^2}{2} + c_f \xi_f \bar{x}_f
+       + c_o(\xi_o \bar{x}_o + \gamma_o\sigma_o) + c_o^2\frac{\sigma_o^2}{2}
+$$
+
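+The **correct** root for $c_f$ is the one that implies mean reversion in the
+twisted process (so the distorted $X^f$ remains stationary): the root with the minus sign, for which the coefficient on $x^f$ in the twisted drift equals minus the square root of the discriminant.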
+
+```{code-cell} ipython3
+def solve_affine_eigenfunction(params):
+    """
+    Compute the exponential-affine principal eigenfunction
+    phi(x^f, x^o) = exp(cf * x^f + co * x^o) and its eigenvalue rho
+    for the Hansen-Scheinkman diffusion example.
+
+    Parameters
+    ----------
+    params : dict with keys
+        xi_f, xbar_f, sigma_f : square-root process parameters
+        xi_o, xbar_o, sigma_o : OU process parameters
+        beta_bar, beta_f, beta_o : drift loadings
+        gamma_f, gamma_o : diffusion loadings
+
+    Returns
+    -------
+    cf, co : eigenfunction coefficients on x^f and x^o
+    rho : eigenvalue
+
+    Raises
+    ------
+    ValueError : if the quadratic equation for cf has no real root
+    """
+    xi_f, xbar_f, sigma_f = (params[k] for k in ('xi_f', 'xbar_f', 'sigma_f'))
+    xi_o, xbar_o, sigma_o = (params[k] for k in ('xi_o', 'xbar_o', 'sigma_o'))
+    beta_bar, beta_f, beta_o = (params[k] for k in ('beta_bar', 'beta_f', 'beta_o'))
+    gamma_f, gamma_o = params['gamma_f'], params['gamma_o']
+
+    # The x^o condition is linear and pins down co directly
+    co = beta_o / xi_o
+
+    # The x^f condition is quadratic in cf; it has real roots only when
+    # the discriminant is non-negative
+    discriminant = (xi_f - gamma_f * sigma_f)**2 - sigma_f**2 * (2*beta_f + gamma_f**2)
+    if discriminant < 0:
+        raise ValueError("No real solution for c_f; check parameters.")
+
+    root = np.sqrt(discriminant)
+    cf_plus  = ((xi_f - gamma_f*sigma_f) + root) / sigma_f**2
+    cf_minus = ((xi_f - gamma_f*sigma_f) - root) / sigma_f**2
+
+    # Coefficient on x^f in the twisted drift of X^f:
+    #     -xi_f + sigma_f * (gamma_f + cf * sigma_f)
+    # A negative value means the twisted X^f is mean reverting.
+    drift_plus, drift_minus = (
+        -(xi_f - sigma_f * (gamma_f + c * sigma_f)) for c in (cf_plus, cf_minus)
+    )
+
+    # Try the minus root first, then the plus root; if neither gives a
+    # negative coefficient, keep the root that is smaller in absolute value.
+    if drift_minus < 0:
+        cf = cf_minus
+    elif drift_plus < 0:
+        cf = cf_plus
+    elif abs(cf_minus) < abs(cf_plus):
+        cf = cf_minus
+    else:
+        cf = cf_plus
+
+    # Eigenvalue: the constant term of the eigenvalue equation
+    rho = (beta_bar
+           + gamma_o**2 / 2
+           + cf * xi_f * xbar_f
+           + co * (xi_o * xbar_o + gamma_o * sigma_o)
+           + co**2 * sigma_o**2 / 2)
+
+    return cf, co, rho
+```
+
+```{code-cell} ipython3
+# Breeden (1979) consumption-based model parameters
+# (Section 3.3 and 8.1 of Hansen-Scheinkman 2009)
+params_breeden = {
+    'xi_f'  : 2.5,    # mean reversion speed of X^f
+    'xbar_f': 0.04,   # long-run mean of X^f (average variance)
+    'sigma_f': -0.1,  # volatility of X^f (negative by convention)
+    'xi_o'  : 0.1,    # mean reversion speed of X^o
+    'xbar_o': 0.02,   # long-run mean of X^o (average growth)
+    'sigma_o': 0.02,  # volatility of X^o
+    # Stochastic discount factor loadings (Breeden CRRA with risk aversion a)
+    'a'     : 5.0,    # risk aversion
+    'theta_f': 1.0,   # consumption-volatility loading
+    'theta_o': 1.0,   # consumption-growth loading
+}
+
+a  = params_breeden['a']
+theta_f = params_breeden['theta_f']
+theta_o = params_breeden['theta_o']
+sigma_f = params_breeden['sigma_f']
+sigma_o = params_breeden['sigma_o']
+xi_f    = params_breeden['xi_f']
+xi_o    = params_breeden['xi_o']
+
+# For the Breeden SDF: S_t = exp(A^s_t), the loadings set below are
+#   beta_bar = -b (subjective discount rate), beta_f = 0,
+#   beta_o   = -a * xbar_o (marked approximate below)
+#   gamma_f  = -a * theta_f (the sqrt(x^f) factor enters via A_t itself)
+#   gamma_o  = -a * theta_o
+b = 0.03   # subjective discount rate
+
+params_sdf = {
+    'xi_f'   : xi_f,
+    'xbar_f' : params_breeden['xbar_f'],
+    'sigma_f': sigma_f,
+    'xi_o'   : xi_o,
+    'xbar_o' : params_breeden['xbar_o'],
+    'sigma_o': sigma_o,
+    'beta_bar': -b,
+    'beta_f'  : 0.0,       # no x^f level loading in SDF
+    'beta_o'  : -a * params_breeden['xbar_o'],   # approximate
+    'gamma_f' : -a * theta_f,
+    'gamma_o' : -a * theta_o,
+}
+
+cf, co, rho = solve_affine_eigenfunction(params_sdf)
+
+print("Affine eigenfunction φ(x^f, x^o) = exp(c_f x^f + c_o x^o)")
+print(f"  c_f = {cf:.6f}")
+print(f"  c_o = {co:.6f}")
+print(f"\nPrincipal eigenvalue ρ = {rho:.6f}")
+print(f"\nInterpretation:")
+print(f"  Long-run SDF growth rate = {rho:.4f}")
+print(f"  Long-run risk-free rate ≈ {-rho:.4f}")
+```
+
+### Sensitivity to Risk Aversion
+
+A key result of {cite}`HansenScheinkman2009` is that the eigenvalue $\rho$
+encodes the long-run risk adjustment.
+
+We can trace out a **long-run
+risk-return frontier** by varying risk exposure.
+
+```{code-cell} ipython3
+# Vary risk aversion and trace the long-run eigenvalue
+a_values = np.linspace(0.5, 10.0, 50)
+rho_values = []
+
+for a_val in a_values:
+    p = dict(params_sdf)   # copy
+    p['beta_o']  = -a_val * params_breeden['xbar_o']
+    p['gamma_f'] = -a_val * theta_f
+    p['gamma_o'] = -a_val * theta_o
+    try:
+        _, _, rho_val = solve_affine_eigenfunction(p)
+        rho_values.append(rho_val)
+    except ValueError:
+        rho_values.append(np.nan)
+
+fig, ax = plt.subplots()
+ax.plot(a_values, rho_values, 'b-', lw=2)
+ax.set_xlabel('Risk aversion $a$')
+ax.set_ylabel('Principal eigenvalue $\\rho$')
+ax.set_title('Long-run decay rate of SDF vs. risk aversion')
+ax.axhline(0, color='k', ls=':', lw=0.8)
+plt.tight_layout()
+plt.show()
+```
+
+## The Multiplicative Decomposition in the Diffusion Example
+
+Given the affine eigenfunction, we can explicitly construct the martingale
+component $\hat{M}$ and illustrate the decomposition.
+
+The martingale $\hat{M}_t = e^{\hat{A}_t}$ where
+
+$$
+\hat{A}_t = \int_0^t \sqrt{X^f_s}(\gamma_f + c_f\sigma_f)\,dB^f_s
+           + \int_0^t (\gamma_o + c_o\sigma_o)\,dB^o_s
+           - \frac{(\gamma_f + c_f\sigma_f)^2}{2}\int_0^t X^f_s\,ds
+           - \frac{(\gamma_o + c_o\sigma_o)^2}{2} t.
+$$
+
+The **twisted drift** for $X^f$ under the $\hat{M}$-measure is
+
+$$
+\xi_f(\bar{x}_f - x^f) + x^f \sigma_f(\gamma_f + c_f\sigma_f),
+$$
+
+and for $X^o$:
+
+$$
+\xi_o(\bar{x}_o - x^o) + \sigma_o(\gamma_o + c_o\sigma_o).
+$$
+
+```{code-cell} ipython3
+def simulate_diffusion(params_sdf, T=50.0, dt=0.01, seed=42):
+    """
+    Simulate the Hansen-Scheinkman affine diffusion with an Euler scheme and
+    build the multiplicative decomposition M_t = exp(ρt) M̂_t φ(X0)/φ(X_t).
+
+    Parameters
+    ----------
+    params_sdf : dict — same keys as solve_affine_eigenfunction expects
+    T : float — time horizon
+    dt : float — Euler step size
+    seed : int — seed passed to np.random.default_rng
+
+    Returns
+    -------
+    times : array of time points
+    Xf, Xo : state paths (X^f is floored at 1e-8)
+    Mt : M_t = exp(A_t) path
+    Mt_hat : M̂_t path (martingale component)
+    phi_ratio : φ(X0)/φ(X_t) path (transient component)
+    rho, cf, co : eigenvalue and eigenfunction coefficients
+    """
+    rng = np.random.default_rng(seed)
+    cf, co, rho = solve_affine_eigenfunction(params_sdf)
+
+    xi_f, xbar_f, sigma_f = (params_sdf[k] for k in ('xi_f', 'xbar_f', 'sigma_f'))
+    xi_o, xbar_o, sigma_o = (params_sdf[k] for k in ('xi_o', 'xbar_o', 'sigma_o'))
+    beta_bar, beta_f, beta_o = (params_sdf[k] for k in ('beta_bar', 'beta_f', 'beta_o'))
+    gamma_f, gamma_o = params_sdf['gamma_f'], params_sdf['gamma_o']
+
+    # round() before int(): T / dt can land just below an integer in floats
+    n_steps = int(round(T / dt))
+    times = np.linspace(0, T, n_steps + 1)
+
+    # Initialize at long-run means
+    Xf = np.zeros(n_steps + 1)
+    Xo = np.zeros(n_steps + 1)
+    Xf[0] = xbar_f
+    Xo[0] = xbar_o
+
+    # Additive functionals accumulated along the path
+    A = np.zeros(n_steps + 1)      # log M_t
+    A_hat = np.zeros(n_steps + 1)  # log M̂_t
+
+    for i in range(n_steps):
+        xf = max(Xf[i], 1e-8)
+        xo = Xo[i]
+
+        dBf = rng.standard_normal() * np.sqrt(dt)
+        dBo = rng.standard_normal() * np.sqrt(dt)
+
+        # State evolution; the floor keeps sqrt(X^f) well defined
+        Xf[i+1] = max(xf + xi_f * (xbar_f - xf) * dt
+                      + np.sqrt(xf) * sigma_f * dBf, 1e-8)
+        Xo[i+1] = xo + xi_o * (xbar_o - xo) * dt + sigma_o * dBo
+
+        # Increment of A_t = log M_t.  A is the additive functional itself,
+        # so no Itô correction belongs here; adding one breaks the identity
+        # M_t = exp(ρt) M̂_t φ(X0)/φ(X_t).
+        dA = ((beta_bar + beta_f * xf + beta_o * xo) * dt
+              + np.sqrt(xf) * gamma_f * dBf
+              + gamma_o * dBo)
+        A[i+1] = A[i] + dA
+
+        # Increment of log M̂_t
+        dA_hat = (np.sqrt(xf) * (gamma_f + cf * sigma_f) * dBf
+                  + (gamma_o + co * sigma_o) * dBo
+                  - 0.5 * ((gamma_f + cf * sigma_f)**2 * xf
+                           + (gamma_o + co * sigma_o)**2) * dt)
+        A_hat[i+1] = A_hat[i] + dA_hat
+
+    phi0 = np.exp(cf * Xf[0] + co * Xo[0])
+    phi_t = np.exp(cf * Xf + co * Xo)
+
+    Mt     = np.exp(A)
+    Mt_hat = np.exp(A_hat)
+    phi_ratio = phi0 / phi_t
+
+    return times, Xf, Xo, Mt, Mt_hat, phi_ratio, rho, cf, co
+
+
+times, Xf, Xo, Mt, Mt_hat, phi_ratio, rho, cf, co = simulate_diffusion(
+    params_sdf, T=30.0, dt=0.01
+)
+
+print(f"ρ = {rho:.6f},  c_f = {cf:.4f},  c_o = {co:.4f}")
+```
+
+```{code-cell} ipython3
+# Plot the three components of the decomposition
+fig, axes = plt.subplots(2, 2, figsize=(12, 8))
+
+ax = axes[0, 0]
+ax.plot(times, Xf, 'b-', lw=1)
+ax.set_title('$X^f_t$ (stochastic volatility)')
+ax.set_xlabel('$t$')
+
+ax = axes[0, 1]
+ax.plot(times, Xo, 'g-', lw=1)
+ax.set_title('$X^o_t$ (predictable growth)')
+ax.set_xlabel('$t$')
+
+ax = axes[1, 0]
+ax.plot(times, Mt, 'b-', lw=1.5, label='$M_t$')
+ax.plot(times, np.exp(rho * times) * Mt_hat * phi_ratio, 'r--',
+        lw=1, label='$e^{\\rho t}\\hat{M}_t\\phi(X_0)/\\phi(X_t)$')
+ax.set_title('Decomposition check: $M_t = e^{\\rho t}\\hat{M}_t \\phi(X_0)/\\phi(X_t)$')
+ax.set_xlabel('$t$')
+ax.legend(fontsize=9)
+
+ax = axes[1, 1]
+ax.plot(times, np.exp(rho * times), 'k-', lw=1.5, label=f'$e^{{\\rho t}}$, ρ={rho:.4f}')
+ax.plot(times, Mt_hat, 'b-', lw=1, alpha=0.7, label='$\\hat{M}_t$ (martingale)')
+ax.plot(times, phi_ratio, 'r-', lw=1, alpha=0.7, label='$\\phi(X_0)/\\phi(X_t)$ (transient)')
+ax.set_title('Three components of $M_t$')
+ax.set_xlabel('$t$')
+ax.legend(fontsize=9)
+
+plt.suptitle('Multiplicative Decomposition of SDF', fontsize=13, y=1.01)
+plt.tight_layout()
+plt.show()
+```
+
+## Long-Run Risk-Return Trade-offs
+
+### The Short-Run (Local) Trade-off
+
+From Corollary 3.1 of {cite}`HansenScheinkman2009`, the instantaneous
+required expected rate of return for a portfolio with Brownian exposure
+$\gamma_v$ to the SDF with Brownian component $\gamma_s$ is
+
+$$
+\varepsilon_v = -\beta_s - \gamma_v \cdot \gamma_s - \frac{|\gamma_s|^2}{2}.
+$$
+
+The vector $-\gamma_s$ contains the **local (instantaneous) risk prices**.
+
+### The Long-Run Trade-off via Changing Cash Flows
+
+For a cash flow $D_t = G_t \psi(X_t) D_0$ with growth functional $G =
+e^{A^g}$, the **long-run risk-adjusted return** is $-\rho + \delta$, where:
+
+- $\delta$ is the expected growth rate of $G$, and
+- $\rho$ is the principal eigenvalue of the semigroup built from $M = GS$.
+
+```{admonition} Long-Run Risk Price Formula
+For the affine diffusion with an Ornstein–Uhlenbeck growth predictor $X^o$,
+the long-run risk price for exposure $\gamma^g_o$ to the $B^o$ shock is
+
+$$
+\frac{d(-\rho + \delta)}{d\gamma^g_o} = -\gamma^s_o - \frac{\beta^s_o}{\xi_o}\,\sigma_o.
+$$
+
+The term $\beta^s_o / \xi_o$ captures the **persistence effect**: a shock to
+$X^o$ reverberates over a horizon of order $1/\xi_o$.
+
+The more persistent
+the growth process, the larger the long-run risk price relative to the
+local risk price $-\gamma^s_o$.
+```
+
+```{code-cell} ipython3
+def long_run_risk_return(gamma_g_o_values, params_sdf, delta=0.02):
+    """
+    Long-run risk-adjusted return -ρ+δ for each cash-flow exposure
+    γ^g_o to the B^o shock, where ρ is the eigenvalue for M = GS.
+
+    M = GS uses gamma_o = gamma_g_o + gamma_s_o and
+    beta_bar = beta_bar_s + delta - gamma_g_o**2 / 2, so that G has
+    expected growth rate delta; beta_o = beta_s_o is unchanged.
+    Returns an array; an entry is NaN if its solve raises ValueError.
+    """
+    # Result unused: raises ValueError early if the baseline SDF has no root
+    solve_affine_eigenfunction(params_sdf)
+    gamma_s_o = params_sdf['gamma_o']
+
+    returns = []
+    for gamma_g_o in gamma_g_o_values:
+        p = dict(params_sdf)
+        p['gamma_o'] = gamma_s_o + gamma_g_o   # combined loading
+        p['beta_bar'] = params_sdf['beta_bar'] + delta - 0.5 * gamma_g_o**2
+        try:
+            _, _, rho_val = solve_affine_eigenfunction(p)
+            returns.append(-rho_val + delta)
+        except ValueError:
+            returns.append(np.nan)
+    return np.array(returns)
+
+
+gamma_g_o_vals = np.linspace(-0.5, 0.5, 100)
+ret_vals = long_run_risk_return(gamma_g_o_vals, params_sdf)
+
+# Local risk price: ∂ε_v/∂γ_v = -γ_s_o (constant)
+gamma_s_o = params_sdf['gamma_o']
+local_slope = -gamma_s_o
+local_return = -params_sdf['beta_bar'] + local_slope * gamma_g_o_vals
+
+fig, ax = plt.subplots()
+ax.plot(gamma_g_o_vals, ret_vals, 'b-', lw=2, label='Long-run return $-\\rho+\\delta$')
+ax.plot(gamma_g_o_vals, local_return, 'r--', lw=2, label='Local return approximation')
+ax.set_xlabel('Cash-flow risk exposure $\\gamma^g_o$')
+ax.set_ylabel('Required rate of return')
+ax.set_title('Long-Run vs. Local Risk-Return Trade-off ($B^o$ exposure)')
+ax.legend()
+plt.tight_layout()
+plt.show()
+```
+
+```{code-cell} ipython3
+# Quantify the long-run risk price vs local risk price
+xi_o    = params_sdf['xi_o']
+sigma_o = params_sdf['sigma_o']
+beta_s_o = params_sdf['beta_o']
+
+local_price   = -gamma_s_o
+lr_price      = -gamma_s_o - (beta_s_o / xi_o) * sigma_o
+
+print("Risk prices for B^o exposure:")
+print(f"  Local (instantaneous) risk price: {local_price:.4f}")
+print(f"  Long-run risk price:              {lr_price:.4f}")
+print(f"  Persistence amplification factor: {(beta_s_o/xi_o)*sigma_o:.4f}")
+print(f"  (= β^s_o/ξ_o × σ_o, captures reverberation over horizon 1/ξ_o={1/xi_o:.1f})")
+```
+
+## Perron–Frobenius Theory and the Finite-State Case
+
+The long-run dominance result (Proposition 7.1 of {cite}`HansenScheinkman2009`)
+is the continuous-time, general Markov generalization of classical
+Perron–Frobenius theory.
+
+Let us illustrate this with a
+three-state Markov chain.
+
+```{code-cell} ipython3
+# Three-state example: expansion, normal, contraction
+N = 3
+state_names = ['Expansion', 'Normal', 'Contraction']
+
+# Intensity matrix
+U3 = np.array([[-0.4,  0.3,  0.1],
+               [ 0.2, -0.5,  0.3],
+               [ 0.1,  0.2, -0.3]])
+
+# Discount rates (higher in expansion = rich economy)
+beta3 = np.array([0.06, 0.04, 0.01])
+
+# No jumps
+kappa3 = np.zeros((3, 3))
+A3 = build_generator(U3, beta3, kappa3)
+rho3, phi3 = principal_eigen(A3)
+
+print("Three-state Markov chain")
+print(f"\nGenerator matrix A:")
+print(np.round(A3, 3))
+print(f"\nPrincipal eigenvalue ρ = {rho3:.6f}")
+print(f"Principal eigenfunction φ = {phi3}")
+
+# Verify Perron-Frobenius dominance
+eigenvalues, _ = eig(A3)
+real_eigs = sorted(eigenvalues.real, reverse=True)
+print(f"\nAll eigenvalues (real parts): {[f'{e:.4f}' for e in real_eigs]}")
+print("ρ is strictly largest: confirms long-run dominance")
+```
+
+```{code-cell} ipython3
+# Demonstrate long-run dominance: exp(-ρt) M_t ψ → φ ∫(ψ/φ) dς̂
+# for three different initial functions ψ
+
+# Compute twisted stationary distribution
+phi3_diag_inv = np.diag(1.0 / phi3)
+phi3_diag     = np.diag(phi3)
+A3_hat = phi3_diag_inv @ A3 @ phi3_diag - rho3 * np.eye(3)
+
+evals3, evecs3 = eig(A3_hat.T)
+idx0 = np.argmin(np.abs(evals3.real))
+varsigma3 = evecs3[:, idx0].real
+varsigma3 = np.abs(varsigma3) / np.abs(varsigma3).sum()
+
+psi_functions = {
+    'ψ = [1, 0, 0]': np.array([1.0, 0.0, 0.0]),
+    'ψ = [0, 1, 0]': np.array([0.0, 1.0, 0.0]),
+    'ψ = [1, 2, 3]': np.array([1.0, 2.0, 3.0]),
+}
+
+t_grid = np.linspace(0, 30, 200)
+fig, axes = plt.subplots(1, 3, figsize=(14, 4))
+
+for ax, (label, psi) in zip(axes, psi_functions.items()):
+    limit = phi3 * np.sum((psi / phi3) * varsigma3)
+
+    for state_idx, color in enumerate(['b', 'g', 'r']):
+        vals = []
+        for t in t_grid:
+            Mt = expm(t * A3)
+            approx = np.exp(-rho3 * t) * (Mt @ psi)
+            vals.append(approx[state_idx])
+        ax.plot(t_grid, vals, color=color, lw=1.5, alpha=0.7,
+                label=f'State {state_idx+1}')
+        ax.axhline(limit[state_idx], color=color, ls='--', lw=0.8)
+
+    ax.set_title(label)
+    ax.set_xlabel('$t$')
+    ax.set_ylabel('$e^{-\\rho t}\\mathbb{M}_t\\psi$')
+
+axes[0].legend(fontsize=9)
+fig.suptitle('Long-Run Dominance: $e^{-\\rho t}\\mathbb{M}_t\\psi \\to \\phi\\int(\\psi/\\phi)\\,d\\hat{\\varsigma}$\n'
+             '(dashed lines = theoretical limits)', fontsize=11)
+plt.tight_layout()
+plt.show()
+```
+
+## Summary
+
+This lecture has illustrated the main ideas of {cite}`HansenScheinkman2009`:
+
+1. **Multiplicative functionals** and their associated semigroups are the
+   natural language for intertemporal asset pricing.
+
+2. The **principal eigenvalue** $\rho$ and **eigenfunction** $\phi$ of the
+   semigroup generator provide the long-run risk-return relationship:
+   $\rho$ is the asymptotic growth (or decay) rate and $\phi$ determines
+   the limiting state dependence.
+
+3. The **multiplicative decomposition**
+   $M_t = e^{\rho t}\hat{M}_t(\phi(X_0)/\phi(X_t))$
+   separates permanent ($e^{\rho t}\hat{M}_t$) from transient
+   ($\phi(X_0)/\phi(X_t)$) components.
+
+4. In finite-state chains, this is exactly **Perron–Frobenius theory**.
+
+5. For the affine diffusion example, the eigenfunction is exponential in
+   the state, and the eigenvalue formula reveals how **persistence**
+   amplifies long-run risk prices beyond their local counterparts.
+
+## Exercises
+
+```{exercise}
+:label: lrr_ex1
+
+Consider a two-state Markov chain with intensity matrix
+
+$$
+\mathbb{U} = \begin{pmatrix} -\lambda & \lambda \\ \mu & -\mu \end{pmatrix}
+$$
+
+and a multiplicative functional with decay rates $\beta_1 > 0$ in state 1
+and $\beta_2 = 0$ in state 2, and no jump scaling.
+
+(a) Write down the generator matrix $\mathbb{A}$.
+
+(b) Find the principal eigenvalue $\rho$ in terms of $\lambda$, $\mu$,
+    and $\beta_1$.
+
+(c) Verify numerically with $\lambda = 0.4$, $\mu = 0.6$, $\beta_1 = 0.05$
+    that your formula matches the output of `principal_eigen`.
+
+(d) Show that $\rho$ lies strictly between $-\beta_1$ and $0$.
+```
+
+```{solution-start} lrr_ex1
+:class: dropdown
+```
+
+**(a)** The generator matrix is
+
+$$
+\mathbb{A} = \begin{pmatrix} -\lambda - \beta_1 & \lambda \\ \mu & -\mu \end{pmatrix}
+$$
+
+**(b)** The eigenvalues solve $\det(\mathbb{A} - \rho I) = 0$:
+
+$$
+(-\lambda - \beta_1 - \rho)(-\mu - \rho) - \lambda\mu = 0
+$$
+
+Expanding:
+
+$$
+\rho^2 + (\lambda + \mu + \beta_1)\rho + \mu\beta_1 = 0
+$$
+
+The principal eigenvalue is the larger root:
+
+$$
+\rho = \frac{-(\lambda + \mu + \beta_1) + \sqrt{(\lambda + \mu + \beta_1)^2 - 4\mu\beta_1}}{2}
+$$
+
+**(c)** Numerical verification:
+
+```{code-cell} ipython3
+lam, mu_val, b1 = 0.4, 0.6, 0.05
+
+# Analytical formula
+disc = (lam + mu_val + b1)**2 - 4 * mu_val * b1
+rho_analytical = (-( lam + mu_val + b1) + np.sqrt(disc)) / 2
+
+# Numerical
+U_ex = np.array([[-lam, lam], [mu_val, -mu_val]])
+beta_ex = np.array([b1, 0.0])
+kappa_ex = np.zeros((2, 2))
+A_ex = build_generator(U_ex, beta_ex, kappa_ex)
+rho_numerical, phi_ex = principal_eigen(A_ex)
+
+print(f"Analytical ρ = {rho_analytical:.8f}")
+print(f"Numerical  ρ = {rho_numerical:.8f}")
+print(f"Difference   = {abs(rho_analytical - rho_numerical):.2e}")
+```
+
+**(d)** Let $f(\rho) = \rho^2 + (\lambda+\mu+\beta_1)\rho + \mu\beta_1$ denote the quadratic:
+
+- At $\rho = 0$: $f(0) = \mu\beta_1 > 0$.
+- At $\rho = -\beta_1$: $f(-\beta_1) = \beta_1^2 - (\lambda+\mu+\beta_1)\beta_1 + \mu\beta_1
+  = -\lambda\beta_1 < 0$.
+- At $\rho = -(\lambda+\mu+\beta_1)$: $f = \mu\beta_1 > 0$.
+
+Since the parabola opens upward and has two real roots summing to $-(\lambda+\mu+\beta_1) < 0$
+with product $\mu\beta_1 > 0$, both roots are negative.  The larger root $\rho$ satisfies
+$-\beta_1 < \rho < 0$ because:
+- $f(-\beta_1) < 0$, so $-\beta_1$ lies strictly between the two roots; hence $\rho > -\beta_1$.
+- Both roots are negative, so $\rho < 0$ (consistent with $f(0) = \mu\beta_1 > 0$).
+
+```{code-cell} ipython3
+print(f"ρ = {rho_numerical:.6f}")
+print(f"-β₁ = {-b1:.6f}")
+print(f"0 = 0")
+print(f"Is -β₁ < ρ < 0? {-b1 < rho_numerical < 0}")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: lrr_ex2
+
+**Long-run vs. short-run risk prices in the affine model.**
+
+Using the `solve_affine_eigenfunction` function, compute both local and
+long-run risk prices for varying levels of the mean-reversion parameter
+$\xi_o$ of the Ornstein–Uhlenbeck predictor $X^o$.
+
+Specifically, set $\xi_o \in \{0.05, 0.1, 0.2, 0.5, 1.0, 5.0\}$ and for
+each value:
+
+(a) Compute the local risk price for $B^o$ exposure: $-\gamma^s_o$.
+
+(b) Compute the long-run risk price formula:
+    $-\gamma^s_o - (\beta^s_o/\xi_o)\sigma_o$.
+
+(c) Plot both as functions of $\xi_o$ on the same axes.
+
+(d) Explain intuitively why the two prices converge as $\xi_o \to \infty$.
+```
+
+```{solution-start} lrr_ex2
+:class: dropdown
+```
+
+```{code-cell} ipython3
+xi_o_values = np.array([0.05, 0.1, 0.2, 0.5, 1.0, 5.0])
+
+gamma_s_o  = params_sdf['gamma_o']
+beta_s_o   = params_sdf['beta_o']
+sigma_o_val = params_sdf['sigma_o']
+
+local_price_val = -gamma_s_o   # constant, independent of ξ_o
+
+lr_prices = []
+for xi_o_val in xi_o_values:
+    lr_price_val = -gamma_s_o - (beta_s_o / xi_o_val) * sigma_o_val
+    lr_prices.append(lr_price_val)
+
+fig, ax = plt.subplots()
+ax.axhline(local_price_val, color='r', ls='--', lw=2, label='Local risk price $-\\gamma^s_o$')
+ax.plot(xi_o_values, lr_prices, 'bo-', lw=2, ms=8, label='Long-run risk price')
+ax.set_xlabel('Mean-reversion speed $\\xi_o$')
+ax.set_ylabel('Risk price')
+ax.set_title('Local vs. Long-Run Risk Prices for $B^o$ Exposure')
+ax.legend()
+ax.set_xscale('log')
+plt.tight_layout()
+plt.show()
+
+print(f"Local risk price (all ξ_o): {local_price_val:.4f}")
+print("\nξ_o  |  Long-run risk price")
+for xi_o_val, lr in zip(xi_o_values, lr_prices):
+    print(f"{xi_o_val:.2f}  |  {lr:.4f}")
+```
+
+**(d)** As $\xi_o \to \infty$, shocks to $X^o$ dissipate extremely quickly
+(the process reverts to its mean almost instantaneously).  A shock today
+has no lasting effect, so there is no "reverberation" to price.  The
+persistence amplification term $\beta^s_o \sigma_o / \xi_o \to 0$, and the
+long-run price converges to the local price $-\gamma^s_o$.
+
+```{solution-end}
+```
+
+```{exercise}
+:label: lrr_ex3
+
+**Numerically illustrate long-run dominance for the three-state chain.**
+
+Using the three-state example from the lecture (stored in `U3`, `beta3`,
+`A3`, `rho3`, `phi3`, `varsigma3`):
+
+(a) For a generic initial function $\psi = [3, 1, 2]$, compute the
+    theoretical long-run limit $\phi \int (\psi/\phi)\,d\hat{\varsigma}$
+    and the path $t \mapsto e^{-\rho t}\mathbb{M}_t\psi$ for each state.
+
+(b) Plot the convergence speed: compute
+
+    $$
+    \text{error}(t) = \max_i \left|e^{-\rho t}(\mathbb{M}_t \psi)_i
+                            - \phi_i \int \frac{\psi}{\phi}\,d\hat{\varsigma}\right|
+    $$
+
+    and plot $\log(\text{error}(t))$ vs $t$.  What is the approximate
+    rate of convergence?  Compare to the **spectral gap**
+    $\rho - \rho_2$, where $\rho_2$ is the second-largest real part among the eigenvalues of $\mathbb{A}$.
+
+(c) How does the convergence rate change if you make the chain more
+    "sluggish" by scaling the intensity matrix as
+    $\mathbb{U} \leftarrow 0.1 \times \mathbb{U}_3$?
+```
+
+```{solution-start} lrr_ex3
+:class: dropdown
+```
+
+```{code-cell} ipython3
+# (a) Theoretical limit and convergence
+psi_ex3 = np.array([3.0, 1.0, 2.0])
+limit_ex3 = phi3 * np.sum((psi_ex3 / phi3) * varsigma3)
+
+print("Theoretical long-run limit φ ∫(ψ/φ) dς̂:")
+for i, (s, lim) in enumerate(zip(state_names, limit_ex3)):
+    print(f"  {s}: {lim:.6f}")
+
+# (b) Convergence speed
+t_fine = np.linspace(0.01, 40, 300)
+errors = []
+
+for t in t_fine:
+    Mt = expm(t * A3)
+    approx = np.exp(-rho3 * t) * (Mt @ psi_ex3)
+    errors.append(np.max(np.abs(approx - limit_ex3)))
+
+# Spectral gap
+evals_A3 = sorted(np.linalg.eigvals(A3).real, reverse=True)
+spectral_gap = evals_A3[0] - evals_A3[1]
+print(f"\nSpectral gap ρ₁ - ρ₂ = {spectral_gap:.4f}")
+print(f"Expected convergence rate ≈ {spectral_gap:.4f}")
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+axes[0].semilogy(t_fine, errors, 'b-', lw=2)
+axes[0].set_xlabel('$t$')
+axes[0].set_ylabel('$\\log(\\text{error})$')
+axes[0].set_title('Convergence of $e^{-\\rho t}\\mathbb{M}_t\\psi$ to limit')
+
+# Overlay fitted exponential decay
+t_fit = t_fine[t_fine > 2]
+err_fit = [errors[i] for i, t in enumerate(t_fine) if t > 2]
+log_err = np.log(np.maximum(err_fit, 1e-15))
+t_fit_arr = np.array(t_fit)
+slope = np.polyfit(t_fit_arr, log_err, 1)[0]
+axes[0].plot(t_fine, np.exp(np.log(errors[0]) + slope * t_fine), 'r--',
+             label=f'Fitted rate ≈ {abs(slope):.4f}')
+axes[0].axhline(1e-10, color='k', ls=':', lw=0.8)
+axes[0].legend()
+
+# (c) Sluggish chain
+U3_slow = 0.1 * U3
+A3_slow = build_generator(U3_slow, beta3, kappa3)
+rho3_slow, phi3_slow = principal_eigen(A3_slow)
+
+evals_slow = sorted(np.linalg.eigvals(A3_slow).real, reverse=True)
+gap_slow = evals_slow[0] - evals_slow[1]
+
+errors_slow = []
+for t in t_fine:
+    Mt = expm(t * A3_slow)
+    limit_slow = phi3_slow * np.sum((psi_ex3 / phi3_slow) * varsigma3)
+    approx = np.exp(-rho3_slow * t) * (Mt @ psi_ex3)
+    errors_slow.append(np.max(np.abs(approx - limit_slow)))
+
+axes[1].semilogy(t_fine, errors, 'b-', lw=2, label=f'Original (gap={spectral_gap:.3f})')
+axes[1].semilogy(t_fine, errors_slow, 'r-', lw=2, label=f'Sluggish (gap={gap_slow:.3f})')
+axes[1].set_xlabel('$t$')
+axes[1].set_ylabel('$\\log(\\text{error})$')
+axes[1].set_title('Effect of chain speed on convergence')
+axes[1].legend()
+
+plt.tight_layout()
+plt.show()
+
+print(f"\nOriginal chain:  spectral gap = {spectral_gap:.4f}, fitted rate = {abs(slope):.4f}")
+print(f"Sluggish chain:  spectral gap = {gap_slow:.4f}")
+print("Convergence is slower when the chain is sluggish (smaller spectral gap)")
+```
+
+```{solution-end}
+```

From fb0f7365fce8f9e452dfcd7c0871cc8f297bac6b Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Tue, 19 May 2026 21:23:06 +0800
Subject: [PATCH 04/25] updates

---
 lectures/_config.yml                |    1 -
 lectures/_static/quant-econ.bib     |  140 +-
 lectures/long_run_risk_operator.bib |  131 --
 lectures/long_run_risk_operator.md  | 1915 +++++++++++++++------------
 4 files changed, 1052 insertions(+), 1135 deletions(-)
 delete mode 100644 lectures/long_run_risk_operator.bib

diff --git a/lectures/_config.yml b/lectures/_config.yml
index ef184ed1e..92c4f0129 100644
--- a/lectures/_config.yml
+++ b/lectures/_config.yml
@@ -23,7 +23,6 @@ execute:
 
 bibtex_bibfiles:
    - _static/quant-econ.bib
-   - long_run_risk_operator.bib
 
 html:
   baseurl: https://python.quantecon.org/
diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 24f480231..19bbd8f79 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -203,7 +203,8 @@ @article{Epstein_Zin1989
   volume    = {57},
   number    = {4},
   pages     = {937--969},
-  year      = {1989}
+  year      = {1989},
+  doi       = {10.2307/1913778}
 }
 
 @article{Epstein_Zin1991,
@@ -292,7 +293,8 @@ @article{Kreps_Porteus1978
   volume    = {46},
   number    = {1},
   pages     = {185--200},
-  year      = {1978}
+  year      = {1978},
+  doi       = {10.2307/1913656}
 }
 
 @article{Lucas_Stokey1984,
@@ -2133,17 +2135,6 @@ @book{Lucas1987
   publisher = {Oxford Blackwell}
 }
 
-@article{Hans_Scheink_2009,
-  author  = {Lars Peter Hansen and Jose A. Scheinkman},
-  title   = {Long-Term Risk: An Operator Approach},
-  journal = {Econometrica},
-  year    = {2009},
-  volume  = {77},
-  number  = {1},
-  pages   = {177-234},
-  month   = {01}
-}
-
 @book{Whittle1963,
   title     = {Prediction and regulation by linear least-square methods},
   author    = {Whittle, Peter},
@@ -3110,29 +3101,26 @@ @article{Schelling1969
 }
 
 @article{Bansal_Yaron_2004,
-  author   = {Ravi Bansal and Amir Yaron},
-  title    = {{Risks for the Long Run: A Potential Resolution of Asset Pricing Puzzles}},
-  journal  = {Journal of Finance},
-  year     = 2004,
-  volume   = {59},
-  number   = {4},
-  pages    = {1481-1509},
-  month    = {08},
-  keywords = {},
-  doi      = {},
-  abstract = { We model consumption and dividend growth rates as containing (1) a small long-run predictable component, and (2) fluctuating economic uncertainty (consumption volatility). These dynamics, for which we provide empirical support, in conjunction with Epstein and Zin's (1989) preferences, can explain key asset markets phenomena. In our economy, financial markets dislike economic uncertainty and better long-run growth prospects raise equity prices. The model can justify the equity premium, the risk-free rate, and the volatility of the market return, risk-free rate, and the price-dividend ratio. As in the data, dividend yields predict returns and the volatility of returns is time-varying. Copyright 2004 by The American Finance Association.},
-  url      = {https://ideas.repec.org/a/bla/jfinan/v59y2004i4p1481-1509.html}
+  author  = {Bansal, Ravi and Yaron, Amir},
+  title   = {Risks for the Long Run: A Potential Resolution of Asset Pricing Puzzles},
+  journal = {Journal of Finance},
+  year    = {2004},
+  volume  = {59},
+  number  = {4},
+  pages   = {1481--1509},
+  doi     = {10.1111/j.1540-6261.2004.00670.x}
 }
 
 @article{hansen2008consumption,
-  title     = {Consumption strikes back? Measuring long-run risk},
-  author    = {Hansen, Lars Peter and Heaton, John C and Li, Nan},
-  journal   = {Journal of Political economy},
+  title     = {Consumption Strikes Back? Measuring Long-Run Risk},
+  author    = {Hansen, Lars Peter and Heaton, John C. and Li, Nan},
+  journal   = {Journal of Political Economy},
   volume    = {116},
   number    = {2},
   pages     = {260--302},
   year      = {2008},
-  publisher = {The University of Chicago Press}
+  publisher = {The University of Chicago Press},
+  doi       = {10.1086/588200}
 }
 
 @article{Hansen_2007,
@@ -4038,29 +4026,6 @@ @article{Kobayashi1977
   pages   = {32--43}
 }
 
-@article{HansenHeatonLi2008,
-  author    = {Hansen, Lars Peter and Heaton, John C. and Li, Nan},
-  title     = {Consumption Strikes Back? Measuring Long-Run Risk},
-  journal   = {Journal of Political Economy},
-  year      = {2008},
-  volume    = {116},
-  number    = {2},
-  pages     = {260--302},
-  doi       = {10.1086/588200}
-}
-
-@article{BansalYaron2004,
-  author    = {Bansal, Ravi and Yaron, Amir},
-  title     = {Risks for the Long Run: A Potential Resolution of Asset
-               Pricing Puzzles},
-  journal   = {Journal of Finance},
-  year      = {2004},
-  volume    = {59},
-  number    = {4},
-  pages     = {1481--1509},
-  doi       = {10.1111/j.1540-6261.2004.00670.x}
-}
-
 @article{Breeden1979,
   author    = {Breeden, Douglas T.},
   title     = {An Intertemporal Asset Pricing Model with Stochastic
@@ -4072,74 +4037,3 @@ @article{Breeden1979
   pages     = {265--296},
   doi       = {10.1016/0304-405X(79)90016-3}
 }
-
-@article{KrepsPorteus1978,
-  author    = {Kreps, David M. and Porteus, Evan L.},
-  title     = {Temporal Resolution of Uncertainty and Dynamic Choice Theory},
-  journal   = {Econometrica},
-  year      = {1978},
-  volume    = {46},
-  number    = {1},
-  pages     = {185--200},
-  doi       = {10.2307/1913656}
-}
-
-@article{EpsteinZin1989,
-  author    = {Epstein, Larry G. and Zin, Stanley E.},
-  title     = {Substitution, Risk Aversion, and the Temporal Behavior of
-               Consumption and Asset Returns: A Theoretical Framework},
-  journal   = {Econometrica},
-  year      = {1989},
-  volume    = {57},
-  number    = {4},
-  pages     = {937--969},
-  doi       = {10.2307/1913778}
-}
-
-@article{AndersonHansenSargent2003,
-  author    = {Anderson, Evan W. and Hansen, Lars Peter and Sargent, Thomas J.},
-  title     = {A Quartet of Semigroups for Model Specification, Robustness,
-               Prices of Risk, and Model Detection},
-  journal   = {Journal of the European Economic Association},
-  year      = {2003},
-  volume    = {1},
-  number    = {1},
-  pages     = {68--123},
-  doi       = {10.1162/154247603322256774}
-}
-
-@article{KontoyiannisM2003,
-  author    = {Kontoyiannis, Ioannis and Meyn, Sean P.},
-  title     = {Spectral Theory and Limit Theorems for Geometrically
-               Ergodic {Markov} Processes},
-  journal   = {Annals of Applied Probability},
-  year      = {2003},
-  volume    = {13},
-  number    = {1},
-  pages     = {304--362},
-  doi       = {10.1214/aoap/1042765670}
-}
-
-@article{HansenScheinkman1995,
-  author    = {Hansen, Lars Peter and Scheinkman, Jos{\'e} A.},
-  title     = {Back to the Future: Generating Moment Implications for
-               Continuous-Time {Markov} Processes},
-  journal   = {Econometrica},
-  year      = {1995},
-  volume    = {63},
-  number    = {4},
-  pages     = {767--804},
-  doi       = {10.2307/2171800}
-}
-
-@article{LettauWachter2007,
-  author    = {Lettau, Martin and Wachter, Jessica A.},
-  title     = {Why Is Long-Horizon Equity Less Risky? {A} Duration-Based
-               Explanation of the Value Premium},
-  journal   = {Journal of Finance},
-  year      = {2007},
-  volume    = {62},
-  number    = {1},
-  pages     = {55--92},
-  doi       = {10.1111/j.1540-6261.2007.01203.x}
-}
diff --git a/lectures/long_run_risk_operator.bib b/lectures/long_run_risk_operator.bib
deleted file mode 100644
index 64bd0f101..000000000
--- a/lectures/long_run_risk_operator.bib
+++ /dev/null
@@ -1,131 +0,0 @@
-% BibTeX references for long_run_risk_operator.md
-% References NOT already in quant-econ.bib
-
-@article{HansenScheinkman2009,
-  author    = {Hansen, Lars Peter and Scheinkman, Jos{\'e} A.},
-  title     = {Long-Term Risk: An Operator Approach},
-  journal   = {Econometrica},
-  year      = {2009},
-  volume    = {77},
-  number    = {1},
-  pages     = {177--234},
-  doi       = {10.3982/ECTA6761}
-}
-
-@article{AlvarezJermann2005,
-  author    = {Alvarez, Fernando and Jermann, Urban J.},
-  title     = {Using Asset Prices to Measure the Persistence in the
-               Marginal Utility of Wealth},
-  journal   = {Econometrica},
-  year      = {2005},
-  volume    = {73},
-  number    = {6},
-  pages     = {1977--2016},
-  doi       = {10.1111/j.1468-0262.2005.00643.x}
-}
-
-@article{HansenHeatonLi2008,
-  author    = {Hansen, Lars Peter and Heaton, John C. and Li, Nan},
-  title     = {Consumption Strikes Back? Measuring Long-Run Risk},
-  journal   = {Journal of Political Economy},
-  year      = {2008},
-  volume    = {116},
-  number    = {2},
-  pages     = {260--302},
-  doi       = {10.1086/588200}
-}
-
-@article{BansalYaron2004,
-  author    = {Bansal, Ravi and Yaron, Amir},
-  title     = {Risks for the Long Run: A Potential Resolution of Asset
-               Pricing Puzzles},
-  journal   = {Journal of Finance},
-  year      = {2004},
-  volume    = {59},
-  number    = {4},
-  pages     = {1481--1509},
-  doi       = {10.1111/j.1540-6261.2004.00670.x}
-}
-
-@article{Breeden1979,
-  author    = {Breeden, Douglas T.},
-  title     = {An Intertemporal Asset Pricing Model with Stochastic
-               Consumption and Investment Opportunities},
-  journal   = {Journal of Financial Economics},
-  year      = {1979},
-  volume    = {7},
-  number    = {3},
-  pages     = {265--296},
-  doi       = {10.1016/0304-405X(79)90016-3}
-}
-
-@article{KrepsPorteus1978,
-  author    = {Kreps, David M. and Porteus, Evan L.},
-  title     = {Temporal Resolution of Uncertainty and Dynamic Choice Theory},
-  journal   = {Econometrica},
-  year      = {1978},
-  volume    = {46},
-  number    = {1},
-  pages     = {185--200},
-  doi       = {10.2307/1913656}
-}
-
-@article{EpsteinZin1989,
-  author    = {Epstein, Larry G. and Zin, Stanley E.},
-  title     = {Substitution, Risk Aversion, and the Temporal Behavior of
-               Consumption and Asset Returns: A Theoretical Framework},
-  journal   = {Econometrica},
-  year      = {1989},
-  volume    = {57},
-  number    = {4},
-  pages     = {937--969},
-  doi       = {10.2307/1913778}
-}
-
-@article{AndersonHansenSargent2003,
-  author    = {Anderson, Evan W. and Hansen, Lars Peter and Sargent, Thomas J.},
-  title     = {A Quartet of Semigroups for Model Specification, Robustness,
-               Prices of Risk, and Model Detection},
-  journal   = {Journal of the European Economic Association},
-  year      = {2003},
-  volume    = {1},
-  number    = {1},
-  pages     = {68--123},
-  doi       = {10.1162/154247603322256774}
-}
-
-@article{KontoyiannisM2003,
-  author    = {Kontoyiannis, Ioannis and Meyn, Sean P.},
-  title     = {Spectral Theory and Limit Theorems for Geometrically
-               Ergodic {Markov} Processes},
-  journal   = {Annals of Applied Probability},
-  year      = {2003},
-  volume    = {13},
-  number    = {1},
-  pages     = {304--362},
-  doi       = {10.1214/aoap/1042765670}
-}
-
-@article{HansenScheinkman1995,
-  author    = {Hansen, Lars Peter and Scheinkman, Jos{\'e} A.},
-  title     = {Back to the Future: Generating Moment Implications for
-               Continuous-Time {Markov} Processes},
-  journal   = {Econometrica},
-  year      = {1995},
-  volume    = {63},
-  number    = {4},
-  pages     = {767--804},
-  doi       = {10.2307/2171800}
-}
-
-@article{LettauWachter2007,
-  author    = {Lettau, Martin and Wachter, Jessica A.},
-  title     = {Why Is Long-Horizon Equity Less Risky? {A} Duration-Based
-               Explanation of the Value Premium},
-  journal   = {Journal of Finance},
-  year      = {2007},
-  volume    = {62},
-  number    = {1},
-  pages     = {55--92},
-  doi       = {10.1111/j.1540-6261.2007.01203.x}
-}
diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index d098eeb7e..d3e284111 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -20,7 +20,7 @@ kernelspec:
 </div>
 ```
 
-# Long-Run Risk: An Operator Approach
+# Long-Term Risk: An Operator Approach
 
 ```{contents} Contents
 :depth: 2
@@ -28,1022 +28,1214 @@ kernelspec:
 
 ## Overview
 
-This lecture presents key ideas from {cite}`HansenScheinkman2009`, which develops
-an analytical structure that reveals the long-run risk-return relationship for
-nonlinear continuous-time Markov environments.
+This lecture studies the operator approach to long-term risk developed by
+{cite:t}`HansenScheinkman2009`.
 
-The core insight is that to understand how risky assets are priced over *long*
-horizons — not just instantaneously — we need tools that reach beyond local
-stochastic calculus.
+The paper asks how asset-pricing risk adjustments behave when the payoff
+horizon becomes large.
 
-The paper's main device is a **multiplicative
-decomposition** of a positive stochastic process $\{M_t\}$ into three
-components:
+Local continuous-time asset pricing tells us how expected returns compensate
+investors for instantaneous exposure to Brownian and jump shocks.
+
+Hansen and Scheinkman instead focus on valuation operators indexed by the
+time between the valuation date and the payoff date.
+
+These operators form a *semigroup*.
+
+The central object is a positive multiplicative functional $\{M_t\}_{t \geq 0}$,
+such as a stochastic discount factor, a cumulated return, a stochastic growth
+functional, or a product of discounting and growth.
+
+Under suitable conditions, $M$ admits the factorization
 
 $$
-M_t = e^{\rho t} \hat{M}_t \frac{\phi(X_0)}{\phi(X_t)}
-$$
+    M_t
+    =
+    \exp(\rho t) \hat M_t
+    \frac{\phi(X_0)}{\phi(X_t)} ,
+$$ (eq:hs-factorization)
 
 where
 
-- $e^{\rho t}$ is a deterministic exponential trend governed by an **eigenvalue** $\rho$,
-- $\hat{M}_t$ is a **martingale** that encodes a change of probability measure, and
-- $\phi(X_0)/\phi(X_t)$ is a **transient** (stationary) component built from
-  the **principal eigenfunction** $\phi$ of an operator associated with $M$.
+* $\rho$ is a principal eigenvalue,
+* $\phi$ is a strictly positive principal eigenfunction,
+* $\hat M$ is a martingale used to change probability measure, and
+* $\phi(X_0)/\phi(X_t)$ is a transient state-dependent component.
 
-This factorization is the continuous-time, nonlinear Markov generalization of
-the Perron–Frobenius theorem for positive matrices, and it plays the same role
-that the dominant eigenvalue plays in linear systems: it governs long-run
-growth rates.
+This is the Hansen-Scheinkman factorization.
 
-**What you will learn:**
+It generalizes the Perron-Frobenius decomposition of a positive matrix to
+continuous-time Markov valuation problems.
 
-- What a *multiplicative functional* is and why semigroups arise naturally in
-  asset pricing.
-- How to find the *principal eigenfunction* $\phi$ and eigenvalue $\rho$ for
-  a given semigroup.
-- How the eigenvalue $\rho$ encodes long-run risk-adjusted discount rates.
-- How to compute these objects numerically for a finite-state Markov chain and
-  for a continuous diffusion.
-- How the long-run risk-return trade-off differs from its familiar short-run
-  (local) counterpart.
+For long horizons, the scalar $\rho$ controls the exponential growth or decay
+rate of the relevant valuation semigroup, while $\phi$ controls the limiting
+dependence on the current Markov state.
 
-```{note}
-This lecture focuses on discrete-state and affine (Gaussian / square-root)
-continuous-state examples that admit closed-form or easily-computed
-eigenfunctions. 
+{cite:t}`AlvarezJermann2005` used a related permanent-transitory decomposition
+for stochastic discount factors.
 
-The general theory in {cite}`HansenScheinkman2009` handles
-far more general nonlinear Markov environments.
-```
+{cite:t}`HansenScheinkman2009` link this decomposition to principal
+eigenfunctions and use it to characterize long-run risk-return trade-offs.
+
+This lecture covers
 
-Let's start by importing the Python tools we will use.
+* multiplicative functionals and valuation semigroups,
+* the extended generator associated with a multiplicative functional,
+* principal eigenfunctions and the Hansen-Scheinkman factorization,
+* a finite-state example where the analysis reduces to Perron-Frobenius theory,
+* the affine diffusion example from the paper, and
+* long-run risk prices for persistent growth shocks.
+
+We start with imports.
 
 ```{code-cell} ipython3
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy.linalg import eig, expm
-from scipy.optimize import fsolve
-import warnings
-warnings.filterwarnings('ignore')
 
-plt.rcParams.update({'figure.figsize': (10, 6), 'font.size': 12})
+plt.rcParams.update({
+    "figure.figsize": (10, 6),
+    "font.size": 12
+})
 ```
 
-## Multiplicative Functionals and Semigroups
+## Multiplicative Functionals
 
-### The Asset-Pricing Setup
+Let $\{X_t : t \geq 0\}$ be a continuous-time Markov process with state space
+$\mathcal D_0$.
 
-Fix a continuous-time Markov process $\{X_t : t \ge 0\}$ on a state space
-$\mathcal{D}_0 \subset \mathbb{R}^n$.  A **stochastic discount factor (SDF)**
-process $\{S_t : t \ge 0\}$ prices all assets: the date-$0$ price of a
-payoff $\Pi_t$ at date $t$ is
+Let $\mathcal F_t$ be the filtration generated by the history of $X$.
+
+An adapted functional $\{M_t\}$ is **multiplicative** if $M_0 = 1$ and
 
 $$
-E[S_t \Pi_t \mid \mathcal{F}_0].
-$$
+    M_{t+u} = M_u(\theta_t) M_t ,
+    \qquad t, u \geq 0,
+$$ (eq:multiplicative)
 
-The key structural property of $S$ is **temporal consistency**: if we may
-trade at an intermediate date $\tau \le t$, the date-$\tau$ price of the
-payoff $\Pi_t$ must equal
+where $\theta_t$ shifts the underlying Markov path forward by $t$ units.
 
-$$
-E\!\left[\frac{S_t}{S_\tau} \Pi_t \;\Big|\; \mathcal{F}_\tau\right].
-$$
+For example, if $S_t$ is a stochastic discount factor, then
+$S_{t+u}/S_t$ is the date-$t$ discount factor for payoffs at date $t+u$.
 
-When prices depend only on the current Markov state, this temporal
-consistency forces $S$ to satisfy a **multiplicative property**.
+The Markov version of this intertemporal consistency condition is exactly
+{eq}`eq:multiplicative`.
 
-### Multiplicative Functionals
+When $M_t > 0$, we can write $M_t = \exp(A_t)$.
 
-```{admonition} Definition (Multiplicative Functional)
-A functional $\{M_t : t \ge 0\}$ adapted to the filtration generated by $X$
-is **multiplicative** if $M_0 = 1$ and
+Then $A$ is additive:
 
 $$
-M_{t+u} = M_u(\theta_t) \cdot M_t \qquad \forall\, t, u \ge 0,
+    A_{t+u} = A_u(\theta_t) + A_t .
 $$
 
-where $\theta_t$ is the shift operator on the Markov process.
-```
+For the jump-diffusion setting in {cite:t}`HansenScheinkman2009`, a useful
+parameterization is
 
-Equivalently, if $A_t = \log M_t$ then $A$ is **additive**:
-$A_0 = 0$ and $A_{t+u} = A_u(\theta_t) + A_t$.
+$$
+\begin{aligned}
+    A_t
+    &=
+    \int_0^t \beta(X_s) ds
+    + \int_0^t \gamma(X_{s-})^\top dB_s
+    + \sum_{0 \leq s \leq t} \kappa(X_s, X_{s-}) .
+\end{aligned}
+$$ (eq:additive-functional)
 
-For a diffusion with Brownian motion $B$ and jump compensator $\eta$, the
-general additive functional takes the form
+The functions $(\beta, \gamma, \kappa)$ control drift, Brownian exposure, and
+jump scaling.
 
-$$
-A_t = \int_0^t \beta(X_u)\,du + \int_0^t \gamma(X_{u-})'\,dB_u
-      + \sum_{0 \le u \le t} \kappa(X_u, X_{u-}),
-$$
+In this notation, $\beta$ is allowed to be positive or negative.
 
-so $M_t = e^{A_t}$ is parameterized by the triple $(\beta, \gamma, \kappa)$.
+For instance, a pure discount factor with short rate $r(X_t)$ has
+$\beta(x) = -r(x)$.
 
-### The Semigroup
+## Semigroups
 
-Given a multiplicative functional $M$, the family of operators
+A multiplicative functional defines a family of linear operators
 
 $$
-\mathbb{M}_t \psi(x) = E[M_t \psi(X_t) \mid X_0 = x]
-$$
+    \mathbb M_t \psi(x)
+    =
+    E\left[M_t \psi(X_t) \mid X_0 = x\right].
+$$ (eq:m-semigroup)
 
-forms a **semigroup**: $\mathbb{M}_0 = \mathbb{I}$ and
-$\mathbb{M}_{t+u} = \mathbb{M}_t \mathbb{M}_u$.
+These operators form a semigroup:
 
-The semigroup property manifests itself here as  the **iterated-values property** in
-asset pricing — it holds because of frictionless trading at intermediate
-dates.
+$$
+    \mathbb M_0 = I,
+    \qquad
+    \mathbb M_{t+u} = \mathbb M_t \mathbb M_u .
+$$
 
-Table I of {cite}`HansenScheinkman2009` lists four important semigroups:
+The proof is just iterated expectations plus the multiplicative property of
+$M$.
 
-| Object | Multiplicative Functional | Semigroup |
-|---|---|---|
-| Stochastic discount factor | $S$ | $\{\mathbb{S}_t\}$ |
-| Cumulated return | $V$ | $\{\mathbb{V}_t\}$ |
-| Stochastic growth | $G$ | $\{\mathbb{G}_t\}$ |
-| Valuation with growth | $Q = GS$ | $\{\mathbb{Q}_t\}$ |
+The paper uses several multiplicative functionals, summarized as follows.
 
-## The Generator and Its Eigenvalue Problem
+| Object | Multiplicative functional | Semigroup |
+|---|---:|---:|
+| stochastic discount factor | $S$ | $\{\mathbb S_t\}$ |
+| cumulated return | $V$ | $\{\mathbb V_t\}$ |
+| stochastic growth | $G$ | $\{\mathbb G_t\}$ |
+| valuation with stochastic growth | $Q = GS$ | $\{\mathbb Q_t\}$ |
 
-### The Extended Generator
+The last case is central for long-run cash-flow pricing.
 
-The **extended generator** $\mathbb{A}$ of $M$ is defined by: a Borel
-function $\psi$ belongs to the domain of $\mathbb{A}$ if there exists a
-Borel function $\chi$ such that
+If a cash flow is
 
 $$
-N_t = M_t \psi(X_t) - \psi(X_0) - \int_0^t M_s \chi(X_s)\,ds
+    D_t = D_0 G_t \psi(X_t),
 $$
 
-is a local martingale.  We then write $\chi = \mathbb{A}\psi$.
-
-For a diffusion parameterized by $(\eta, \xi, \Gamma)$ and a multiplicative
-functional $M$ parameterized by $(\beta, \gamma, \kappa)$, the generator
-takes the form
+then its date-$0$ value is
 
 $$
-\mathbb{A}\phi(x)
-= \underbrace{\left[\xi(x) + \Gamma(x)\gamma(x)\right] \cdot
-              \frac{\partial \phi(x)}{\partial x}}_{\text{drift (twisted)}}
-+ \underbrace{\frac{1}{2}
-  \operatorname{tr}\!\left[\Sigma(x)\frac{\partial^2\phi(x)}{\partial x \partial x'}\right]}_{\text{diffusion}}
-+ \underbrace{\int [\phi(y) - \phi(x)]\,
-              e^{\kappa(y,x)}\eta(dy \mid x)}_{\text{jumps}}
-+ \underbrace{\left[\beta(x) + \frac{|\gamma(x)|^2}{2}
-  + \int (e^{\kappa(y,x)} - 1)\,\eta(dy\mid x)\right]\phi(x)}_{\text{level}}
+    D_0 \mathbb Q_t \psi(X_0),
+    \qquad
+    \mathbb Q_t \psi(x)
+    =
+    E\left[G_t S_t \psi(X_t) \mid X_0=x\right].
 $$
 
-where $\Sigma = \Gamma\Gamma'$.
+The long-horizon behavior of $\mathbb Q_t$ tells us how current prices value
+cash-flow growth risk that materializes far in the future.
+
+## The Generator
 
-### The Principal Eigenvalue Problem
+The **extended generator** associated with $M$ is the local object that
+corresponds to the semigroup $\{\mathbb M_t\}$.
 
-```{admonition} Definition (Principal Eigenfunction)
-A strictly positive Borel function $\phi$ is a **principal eigenfunction**
-of $\mathbb{A}$ with **eigenvalue** $\rho$ if
+A Borel function $\psi$ belongs to the domain of the generator $\mathbb A$ if
+there is a Borel function $\chi$ such that
 
 $$
-\mathbb{A}\phi = \rho\,\phi.
+    N_t
+    =
+    M_t \psi(X_t)
+    - \psi(X_0)
+    - \int_0^t M_s \chi(X_s) ds
 $$
-```
 
-This is a key equation.
+is a local martingale.
+
+We then write $\mathbb A \psi = \chi$.
 
-Equivalently (and more computationally useful),
-$\phi$ solves the **principal eigenvalue problem** for the semigroup:
+Suppose the Markov state satisfies
 
 $$
-\mathbb{M}_t \phi = e^{\rho t} \phi \qquad \forall\, t \ge 0.
+    dX_t^c = \xi(X_t)dt + \Gamma(X_t)dB_t
 $$
 
-### The Multiplicative Decomposition
+between jumps, let $\Sigma = \Gamma \Gamma^\top$, and let
+$\eta(dy \mid x)$ denote the jump compensator.
 
-Once we have a principal eigenfunction $\phi$ with eigenvalue $\rho$, we
-obtain the **multiplicative factorization** {cite}`HansenScheinkman2009`:
+If $M=\exp(A)$ is parameterized by $(\beta,\gamma,\kappa)$ as in
+{eq}`eq:additive-functional`, then, for smooth $\phi$,
 
 $$
-M_t = e^{\rho t} \,\hat{M}_t\, \frac{\phi(X_0)}{\phi(X_t)},
-$$
+\begin{aligned}
+\mathbb A \phi(x)
+&=
+\left[\xi(x)+\Gamma(x)\gamma(x)\right]^\top
+    \frac{\partial \phi(x)}{\partial x}
+\\
+&\quad
++ \frac{1}{2}
+  \operatorname{trace}\left[
+    \Sigma(x)
+    \frac{\partial^2\phi(x)}{\partial x \partial x^\top}
+  \right]
+\\
+&\quad
++ \int
+    [\phi(y)-\phi(x)]
+    \exp[\kappa(y,x)] \eta(dy \mid x)
+\\
+&\quad
++ \left[
+    \beta(x)
+    + \frac{\gamma(x)^\top \gamma(x)}{2}
+    + \int
+        \left(\exp[\kappa(y,x)]-1\right)\eta(dy \mid x)
+  \right]\phi(x).
+\end{aligned}
+$$ (eq:extended-generator)
+
+This formula is useful because it converts a long-horizon pricing problem into
+an eigenvalue problem for a local generator.
+
+```{note}
+When $M \equiv 1$, {eq}`eq:extended-generator` reduces to the generator of
+the Markov process $X$.
+
+When $M=S$ is a stochastic discount factor, the extra terms encode local
+prices of Brownian and jump risk.
+```
 
-where the **martingale component** is
+## Principal Eigenfunctions
+
+A Borel function $\phi$ is an eigenfunction of $\mathbb A$ with eigenvalue
+$\rho$ if
 
 $$
-\hat{M}_t = e^{-\rho t} M_t \frac{\phi(X_t)}{\phi(X_0)}.
+    \mathbb A \phi = \rho \phi .
+$$ (eq:generator-eigen)
+
+A **principal eigenfunction** is an eigenfunction that is strictly positive.
+
+If $\phi > 0$ solves {eq}`eq:generator-eigen`, then
+
 $$
+    \hat M_t
+    =
+    \exp(-\rho t) M_t
+    \frac{\phi(X_t)}{\phi(X_0)}
+$$ (eq:mhat)
 
-```{note}
-{cite}`AlvarezJermann2005` proposed a multiplicative decomposition of the SDF into a permanent martingale component and a transitory component. 
-{cite}`HansenScheinkman2009` established the connection to principal
-eigenfunctions and proved existence and uniqueness results.
-```
+is a local martingale.
 
-### Long-Run Dominance
+When $\hat M$ is a martingale, it defines a new probability measure and gives
+the factorization {eq}`eq:hs-factorization`.
 
-Proposition 7.1 of {cite}`HansenScheinkman2009` establishes that, under
-appropriate stability conditions,
+It also gives the semigroup eigenvalue equation
 
 $$
-\lim_{t\to\infty} e^{-\rho t} \mathbb{M}_t \psi
-= \phi \int \frac{\psi}{\phi}\,d\hat{\varsigma},
+    \mathbb M_t \phi = \exp(\rho t)\phi,
+    \qquad t \geq 0.
+$$ (eq:semigroup-eigen)
+
+If $X$ is stochastically stable under the $\hat M$-twisted measure,
+Proposition 7.1 of {cite:t}`HansenScheinkman2009` gives the long-run
+approximation
+
 $$
+    \lim_{t \to \infty}
+    \exp(-\rho t)\mathbb M_t \psi
+    =
+    \phi
+    \int \frac{\psi}{\phi} d\hat\varsigma ,
+$$ (eq:long-run-limit)
 
-where $\hat{\varsigma}$ is the stationary distribution of the **twisted**
-(i.e., $\hat{M}$-distorted) Markov process.
+where $\hat\varsigma$ is the stationary distribution of the twisted Markov
+process.
 
-This is the long-run counterpart of the Perron–Frobenius theorem: $\rho$
-governs the exponential growth (or decay) rate of the semigroup, and $\phi$
-determines the limiting state dependence.
+This is the formal sense in which $\rho$ is the long-run growth rate and
+$\phi$ is the long-run state dependence.
 
-## Finite-State Markov Chain: The Matrix Case
+```{note}
+Positive eigenfunctions need not be unique in general state spaces.
 
-The continuous-time theory is cleanest when the state space is finite.
-This section works through the finite-state case in detail — it is exactly
-the Perron–Frobenius theorem for non-negative matrices.
+The eigenfunction used for long-run approximation must generate a martingale
+and a stochastically stable twisted process.
 
-### Intensity Matrix and Multiplicative Functional
+Proposition 7.2 of {cite:t}`HansenScheinkman2009` shows that these stability
+requirements select the relevant eigenfunction up to scale.
+```
 
-Let $X$ be a continuous-time Markov chain with $N$ states
-$\{x_1, \ldots, x_N\}$ and **intensity matrix** $\mathbb{U}$
-(with $u_{ij} \ge 0$ for $i \ne j$ and $u_{ii} = -\sum_{j \ne i} u_{ij}$).
+## A Finite-State Markov Chain
 
-A multiplicative functional is parameterized by
+We first study a finite-state chain, where the analysis is exactly
+Perron-Frobenius theory.
 
-- a decay rate $\beta_i \ge 0$ in state $x_i$, and
-- a jump scaling $e^{\kappa(x_j, x_i)}$ when jumping from $x_i$ to $x_j$.
+Let $X$ take values in $\{x_1,\ldots,x_N\}$ and let $U$ be its intensity
+matrix.
 
-The **generator matrix** $\mathbb{A}$ for the multiplicative semigroup has
-entries
+Thus $u_{ij} \geq 0$ for $i \neq j$ and each row of $U$ sums to zero.
+
+Let the multiplicative functional have
+
+* a discount or decay rate $r_i$ in state $i$, and
+* a jump multiplier $\exp[\kappa(x_j,x_i)]$ when the state jumps from $i$ to
+  $j$.
+
+The generator matrix $A$ for the multiplicative semigroup is
 
 $$
-a_{ij} = \begin{cases}
-u_{ii} - \beta_i & \text{if } i = j, \\
-u_{ij}\, e^{\kappa(x_j,\, x_i)} & \text{if } i \ne j.
-\end{cases}
-$$
+    a_{ij}
+    =
+    \begin{cases}
+        u_{ii} - r_i, & i=j, \\
+        u_{ij}\exp[\kappa(x_j,x_i)], & i \neq j .
+    \end{cases}
+$$ (eq:finite-a)
 
-The semigroup is $\mathbb{M}_t = e^{t\mathbb{A}}$ (matrix exponential).
+The semigroup is
 
-### Finding the Principal Eigenvalue
+$$
+    \mathbb M_t = \exp(tA).
+$$
 
-The principal eigenvalue $\rho$ is the **largest real eigenvalue** of
-$\mathbb{A}$, and the principal eigenfunction $\phi$ is the corresponding
-strictly positive (Perron) eigenvector.
+For an irreducible chain with strictly positive jump multipliers, the
+principal eigenvalue is the eigenvalue of $A$ with largest real part; it is real.
+
+The associated right eigenvector is strictly positive.
 
 ```{code-cell} ipython3
-def build_generator(U, beta, kappa_mat):
+def build_generator(U, r, kappa):
     """
-    Build the generator matrix A for the multiplicative semigroup.
+    Build the generator matrix for a finite-state multiplicative semigroup.
 
     Parameters
     ----------
-    U : (N, N) array — intensity matrix of X
-    beta : (N,) array — discount rates in each state
-    kappa_mat : (N, N) array — kappa[j, i] = kappa(x_j, x_i)
+    U : array_like, shape (N, N)
+        Intensity matrix of the Markov chain.
+    r : array_like, shape (N,)
+        State-dependent decay rates.
+    kappa : array_like, shape (N, N)
+        kappa[j, i] is the log jump multiplier for a transition i -> j.
 
     Returns
     -------
-    A : (N, N) generator matrix
+    A : ndarray, shape (N, N)
+        Generator of the multiplicative semigroup.
     """
+    U = np.asarray(U, dtype=float)
+    r = np.asarray(r, dtype=float)
+    kappa = np.asarray(kappa, dtype=float)
+
     N = U.shape[0]
-    A = np.zeros((N, N))
+    A = np.empty_like(U)
+
     for i in range(N):
         for j in range(N):
             if i == j:
-                A[i, i] = U[i, i] - beta[i]
+                A[i, i] = U[i, i] - r[i]
             else:
-                A[i, j] = U[i, j] * np.exp(kappa_mat[j, i])
+                A[i, j] = U[i, j] * np.exp(kappa[j, i])
+
     return A
 
 
-def principal_eigen(A):
+def principal_eigenpair(A):
+    """
+    Compute the Perron eigenvalue and positive right eigenvector.
     """
-    Return the largest real eigenvalue and corresponding positive eigenvector.
+    vals, vecs = eig(A)
+    idx = np.argmax(vals.real)
 
-    Parameters
-    ----------
-    A : (N, N) array
+    rho = vals[idx].real
+    phi = vecs[:, idx].real
 
-    Returns
-    -------
-    rho : float — principal eigenvalue
-    phi : (N,) array — principal eigenfunction (positive, normalized)
-    """
-    eigenvalues, eigenvectors = eig(A)
-    # Keep only real eigenvalues
-    real_mask = np.abs(eigenvalues.imag) < 1e-10
-    real_eigs = eigenvalues[real_mask].real
-    real_vecs = eigenvectors[:, real_mask].real
-
-    # Largest real eigenvalue
-    idx = np.argmax(real_eigs)
-    rho = real_eigs[idx]
-    phi = real_vecs[:, idx]
-
-    # Make positive and normalize
-    if phi.min() < 0:
+    if phi.sum() < 0:
         phi = -phi
-    phi = phi / phi.max()
+
+    # Remove tiny numerical sign errors.
+    if np.any(phi <= 0):
+        phi = np.abs(phi)
+
+    phi = phi / phi.mean()
     return rho, phi
+
+
+def twisted_generator(A, rho, phi):
+    """
+    Generator of the Markov process under the twisted measure.
+    """
+    D = np.diag(phi)
+    D_inv = np.diag(1 / phi)
+    return D_inv @ A @ D - rho * np.eye(len(phi))
+
+
+def stationary_distribution(Q):
+    """
+    Stationary distribution pi for a finite-state intensity matrix Q.
+    """
+    vals, vecs = eig(Q.T)
+    idx = np.argmin(np.abs(vals))
+    pi = vecs[:, idx].real
+
+    if pi.sum() < 0:
+        pi = -pi
+
+    pi = np.maximum(pi, 0)
+    return pi / pi.sum()
 ```
 
-### A Two-State Example: Boom and Recession
+### Two States
 
-Consider an economy that alternates between a **boom** state ($x_1$) and a
-**recession** state ($x_2$).
+Consider a boom-recession economy.
+
+The boom state switches to recession at rate $\lambda_1$, while recession
+switches to boom at rate $\lambda_2$.
 
 ```{code-cell} ipython3
-# Intensity matrix: boom <-> recession
-# Expected duration of boom = 1/lambda_1, recession = 1/lambda_2
-lambda_1 = 0.3   # rate of leaving boom
-lambda_2 = 0.5   # rate of leaving recession
+lambda_1 = 0.30
+lambda_2 = 0.50
 
 U = np.array([[-lambda_1,  lambda_1],
               [ lambda_2, -lambda_2]])
 
-# Stochastic discount factor parameters
-# Higher discount rate in boom (asset prices high, SDF low)
-beta = np.array([0.05, 0.02])   # per-unit-time decay
-
-# No jump scaling in this example
-kappa_mat = np.zeros((2, 2))
+r = np.array([0.05, 0.02])
+kappa = np.zeros((2, 2))
 
-A = build_generator(U, beta, kappa_mat)
-rho, phi = principal_eigen(A)
+A = build_generator(U, r, kappa)
+rho, phi = principal_eigenpair(A)
 
-print("Generator matrix A:")
+print("A =")
 print(np.round(A, 4))
-print(f"\nPrincipal eigenvalue ρ = {rho:.6f}")
-print(f"Principal eigenfunction φ = {phi}")
-print(f"\nInterpretation: long-run SDF decay rate = {rho:.4f} per unit time")
+print(f"\nrho = {rho:.6f}")
+print(f"phi = {phi}")
+print(f"long-run zero-coupon yield = {-rho:.4f}")
 ```
 
+We can verify the eigenvalue equation
+$\mathbb M_t \phi = \exp(\rho t)\phi$.
+
 ```{code-cell} ipython3
-# Verify: M_t φ = exp(ρt) φ  for t = 1, 2, 5
-for t in [1.0, 2.0, 5.0]:
-    Mt = expm(t * A)     # semigroup at time t
-    lhs = Mt @ phi
+for t in [1.0, 5.0, 25.0]:
+    lhs = expm(t * A) @ phi
     rhs = np.exp(rho * t) * phi
-    print(f"t={t}: max |M_t φ - exp(ρt)φ| = {np.max(np.abs(lhs - rhs)):.2e}")
+    err = np.max(np.abs(lhs - rhs))
+    print(f"t = {t:4.1f}, error = {err:.2e}")
 ```
 
+Next we compute the twisted generator and the stationary distribution
+$\hat\varsigma$ under the twisted probability measure.
+
+```{code-cell} ipython3
+A_hat = twisted_generator(A, rho, phi)
+varsigma_hat = stationary_distribution(A_hat)
+
+print("twisted generator row sums:")
+print(np.round(A_hat.sum(axis=1), 12))
+
+print("\ntwisted stationary distribution:")
+print(f"  boom      {varsigma_hat[0]:.4f}")
+print(f"  recession {varsigma_hat[1]:.4f}")
+```
+
+For any payoff function $\psi$, the limit in {eq}`eq:long-run-limit` is
+
+$$
+    \phi
+    \sum_i \frac{\psi_i}{\phi_i}\hat\varsigma_i .
+$$
+
 ```{code-cell} ipython3
-# Show long-run dominance: exp(-ρt) M_t ψ → φ ∫(ψ/φ) dς̂
-# for any ψ
-
-# Compute twisted stationary distribution ς̂ via M̂_t = exp(-ρt) M_t φ(X_t)/φ(X_0)
-# The generator of M̂ is: Â_ij = (1/φ_i) A_ij φ_j  (similarity transform)
-phi_diag_inv = np.diag(1.0 / phi)
-phi_diag     = np.diag(phi)
-A_hat = phi_diag_inv @ A @ phi_diag - rho * np.eye(2)
-
-# Stationary distribution of Â: solve π A_hat = 0, sum π = 1
-# (left eigenvector corresponding to eigenvalue 0)
-evals, evecs = eig(A_hat.T)
-idx0 = np.argmin(np.abs(evals.real))
-varsigma_hat = evecs[:, idx0].real
-varsigma_hat = np.abs(varsigma_hat) / np.abs(varsigma_hat).sum()
-
-print("Twisted stationary distribution ς̂:")
-print(f"  Boom:      {varsigma_hat[0]:.4f}")
-print(f"  Recession: {varsigma_hat[1]:.4f}")
-
-# Test convergence for ψ = [1, 2]
 psi = np.array([1.0, 2.0])
-limit_theoretical = phi * np.sum((psi / phi) * varsigma_hat)
+limit = phi * np.sum((psi / phi) * varsigma_hat)
+
+for t in [1, 5, 20, 80]:
+    approx = np.exp(-rho * t) * expm(t * A) @ psi
+    print(f"t = {t:2d}, normalized value = {approx}")
 
-for t in [5, 20, 50, 100]:
-    Mt = expm(t * A)
-    approx = np.exp(-rho * t) * Mt @ psi
-    print(f"t={t:3d}: exp(-ρt)M_t ψ = {approx}, theoretical limit = {limit_theoretical}")
+print("\nlimit =", limit)
 ```
 
-### Impact of Jump Scaling
+### Jump Scaling
 
-Now introduce jump scaling: the SDF **jumps up** (positive surprise) when
-transitioning from recession to boom, and jumps down otherwise.
+Now let the multiplicative functional jump when the Markov state changes.
+
+The matrix `kappa_jump` below says that the functional jumps up on a
+recession-to-boom transition and down on a boom-to-recession transition.
 
 ```{code-cell} ipython3
-# kappa_mat[j, i] = kappa(x_j, x_i): jump when going from state i to state j
-kappa_mat2 = np.array([[0.0,  0.3],   # boom <- recession: positive jump
-                        [-0.2, 0.0]]) # recession <- boom: negative jump
-
-A2 = build_generator(U, beta, kappa_mat2)
-rho2, phi2 = principal_eigen(A2)
-
-print(f"Without jumps: ρ = {rho:.6f}")
-print(f"With jumps:    ρ = {rho2:.6f}")
-print(f"\nPrincipal eigenfunctions:")
-print(f"  φ (no jumps):   {phi}")
-print(f"  φ (with jumps): {phi2}")
+kappa_jump = np.array([[0.0,  0.30],
+                       [-0.20, 0.0]])
+
+A_jump = build_generator(U, r, kappa_jump)
+rho_jump, phi_jump = principal_eigenpair(A_jump)
+
+print(f"rho without jump scaling = {rho:.6f}")
+print(f"rho with jump scaling    = {rho_jump:.6f}")
+print("\nphi with jump scaling:")
+print(phi_jump)
 ```
 
 ```{code-cell} ipython3
-# Plot how ρ varies with jump size κ_21 (recession -> boom jump)
-kappa_values = np.linspace(-0.5, 0.5, 100)
-rho_values = []
+kappa_grid = np.linspace(-0.5, 0.5, 100)
+rho_grid = np.empty_like(kappa_grid)
 
-for kappa_21 in kappa_values:
-    kmat = np.array([[0.0, kappa_21],
-                     [-0.2, 0.0]])
-    Ak = build_generator(U, beta, kmat)
-    rk, _ = principal_eigen(Ak)
-    rho_values.append(rk)
+for n, k in enumerate(kappa_grid):
+    kappa_temp = np.array([[0.0, k],
+                           [-0.2, 0.0]])
+    A_temp = build_generator(U, r, kappa_temp)
+    rho_grid[n], _ = principal_eigenpair(A_temp)
 
 fig, ax = plt.subplots()
-ax.plot(kappa_values, rho_values, 'b-', lw=2)
-ax.axhline(rho, color='r', ls='--', label=f'ρ (no jumps) = {rho:.4f}')
-ax.axvline(0, color='k', ls=':', lw=0.8)
-ax.set_xlabel('Jump size κ (recession → boom)')
-ax.set_ylabel('Principal eigenvalue ρ')
-ax.set_title('Long-run growth rate as a function of jump scaling')
-ax.legend()
-plt.tight_layout()
+ax.plot(kappa_grid, rho_grid, lw=2)
+ax.axhline(rho, color="black", ls="--", lw=1)
+ax.axvline(0, color="black", ls=":", lw=1)
+ax.set_xlabel("jump log multiplier for recession to boom")
+ax.set_ylabel("principal eigenvalue")
+ax.set_title("Jump Scaling and the Long-Run Growth Rate")
 plt.show()
 ```
 
-## Continuous-State Affine Example
+## The Affine Diffusion Example
 
-{cite}`HansenScheinkman2009` present a Markov diffusion with two components:
-a **Feller square-root process** $X^f$ (stochastic volatility) and an
-**Ornstein–Uhlenbeck process** $X^o$ (predictable growth).
+We now turn to the continuous-state example in {cite:t}`HansenScheinkman2009`.
 
-### State Dynamics
+The state has two independent components.
 
-$$
-dX^f_t = \xi_f(\bar{x}_f - X^f_t)\,dt + \sqrt{X^f_t}\,\sigma_f\,dB^f_t,
-\qquad dX^o_t = \xi_o(\bar{x}_o - X^o_t)\,dt + \sigma_o\,dB^o_t.
-$$
+The first is a Feller square-root process $X^f$, used to model stochastic
+volatility.
 
-Parameter restrictions: $\xi_f, \bar{x}_f > 0$, $2\xi_f \bar{x}_f \ge \sigma_f^2$.
+The second is an Ornstein-Uhlenbeck process $X^o$, used to model predictable
+growth.
 
-### Multiplicative Functional
+$$
+\begin{aligned}
+dX_t^f
+&=
+\xi_f(\bar x_f - X_t^f)dt
++ \sqrt{X_t^f}\sigma_f dB_t^f,
+\\
+dX_t^o
+&=
+\xi_o(\bar x_o - X_t^o)dt
++ \sigma_o dB_t^o.
+\end{aligned}
+$$ (eq:affine-state)
+
+The paper normalizes $\sigma_o > 0$ and $\sigma_f < 0$.
+
+The sign of $\sigma_f$ is a convention that makes a positive $B^f$ shock
+reduce volatility.
+
+Consider a multiplicative functional $M=\exp(A)$ with
 
-Consider a multiplicative functional $M = e^A$ with additive functional
+$$
+\begin{aligned}
+A_t
+&=
+\bar\beta t
++ \int_0^t \beta_f X_s^f ds
++ \int_0^t \beta_o X_s^o ds
+\\
+&\quad
++ \int_0^t \sqrt{X_s^f}\gamma_f dB_s^f
++ \int_0^t \gamma_o dB_s^o .
+\end{aligned}
+$$ (eq:affine-additive)
+
+Guess an exponential-affine eigenfunction
 
 $$
-A_t = \bar{\beta} t
-     + \int_0^t \beta_f X^f_s\,ds
-     + \int_0^t \beta_o X^o_s\,ds
-     + \int_0^t \sqrt{X^f_s}\,\gamma_f\,dB^f_s
-     + \int_0^t \gamma_o\,dB^o_s.
+    \phi(x^f,x^o) = \exp(c_f x^f + c_o x^o).
 $$
 
-### Affine Eigenfunction
+Substitution into $\mathbb A\phi=\rho\phi$ gives
 
-Guess an eigenfunction of the form $\phi(x^f, x^o) = e^{c_f x^f + c_o x^o}$.
-
-Substituting into the eigenvalue equation $\mathbb{A}\phi = \rho\phi$ and
-collecting coefficients yields two equations:
+$$
+0
+=
+\beta_f
++ \frac{\gamma_f^2}{2}
++ c_f(\gamma_f\sigma_f-\xi_f)
++ c_f^2\frac{\sigma_f^2}{2},
+$$ (eq:cf-eq)
 
-**Coefficient of $x^f$:**
+and
 
 $$
-0 = \beta_f + \frac{\gamma_f^2}{2} + c_f(\gamma_f \sigma_f - \xi_f)
-    + c_f^2 \frac{\sigma_f^2}{2}
-$$
+    c_o = \frac{\beta_o}{\xi_o}.
+$$ (eq:co-eq)
 
-This is a **quadratic** in $c_f$:
+The two candidate values for $c_f$ are
 
 $$
-c_f = \frac{(\xi_f - \gamma_f\sigma_f) \pm
-\sqrt{(\xi_f - \gamma_f\sigma_f)^2 - \sigma_f^2(2\beta_f + \gamma_f^2)}}{\sigma_f^2}
+c_f
+=
+\frac{
+    \xi_f-\gamma_f\sigma_f
+    \pm
+    \sqrt{
+        (\xi_f-\gamma_f\sigma_f)^2
+        - \sigma_f^2(2\beta_f+\gamma_f^2)
+    }
+}{\sigma_f^2}.
+$$ (eq:cf-roots)
+
+The eigenvalue is
+
 $$
+\rho
+=
+\bar\beta
++ \frac{\gamma_o^2}{2}
++ c_f \xi_f \bar x_f
++ c_o(\xi_o\bar x_o+\gamma_o\sigma_o)
++ c_o^2 \frac{\sigma_o^2}{2}.
+$$ (eq:affine-rho)
+
+The relevant root is the one that keeps the twisted $X^f$ process mean
+reverting.
 
-**Coefficient of $x^o$:**
+Under the twisted measure, the drift of $X^f$ is
 
 $$
-0 = \beta_o - c_o \xi_o \implies c_o = \frac{\beta_o}{\xi_o}
+    \xi_f(\bar x_f - x^f)
+    + x^f\sigma_f(\gamma_f+c_f\sigma_f).
 $$
 
-**Eigenvalue:**
+Hence the mean-reversion coefficient is
 
 $$
-\rho = \bar{\beta} + \frac{\gamma_o^2}{2} + c_f \xi_f \bar{x}_f
-       + c_o(\xi_o \bar{x}_o + \gamma_o\sigma_o) + c_o^2\frac{\sigma_o^2}{2}
+    \xi_f - \sigma_f(\gamma_f+c_f\sigma_f),
 $$
 
-The **correct** root for $c_f$ is the one that implies mean reversion in the
-twisted process (so the distorted $X^f$ remains stationary).
+which must be positive.
 
 ```{code-cell} ipython3
 def solve_affine_eigenfunction(params):
     """
-    Solve for the affine eigenfunction of the Hansen-Scheinkman
-    diffusion example.
-
-    Parameters
-    ----------
-    params : dict with keys
-        xi_f, xbar_f, sigma_f : square-root process parameters
-        xi_o, xbar_o, sigma_o : OU process parameters
-        beta_bar, beta_f, beta_o : drift loadings
-        gamma_f, gamma_o : diffusion loadings
-
-    Returns
-    -------
-    cf, co, rho : eigenfunction coefficients and eigenvalue
-    rho_check : sign of drift coefficient in twisted X^f (must be negative)
+    Solve the affine eigenvalue problem from Hansen and Scheinkman.
     """
-    xi_f   = params['xi_f']
-    xbar_f = params['xbar_f']
-    sigma_f = params['sigma_f']
-    xi_o   = params['xi_o']
-    xbar_o = params['xbar_o']
-    sigma_o = params['sigma_o']
-    beta_bar = params['beta_bar']
-    beta_f  = params['beta_f']
-    beta_o  = params['beta_o']
-    gamma_f = params['gamma_f']
-    gamma_o = params['gamma_o']
-
-    # co from linear equation
+    xi_f = params["xi_f"]
+    xbar_f = params["xbar_f"]
+    sigma_f = params["sigma_f"]
+    xi_o = params["xi_o"]
+    xbar_o = params["xbar_o"]
+    sigma_o = params["sigma_o"]
+    beta_bar = params["beta_bar"]
+    beta_f = params["beta_f"]
+    beta_o = params["beta_o"]
+    gamma_f = params["gamma_f"]
+    gamma_o = params["gamma_o"]
+
     co = beta_o / xi_o
 
-    # cf from quadratic equation
-    discriminant = (xi_f - gamma_f * sigma_f)**2 - sigma_f**2 * (2*beta_f + gamma_f**2)
-    if discriminant < 0:
-        raise ValueError("No real solution for c_f; check parameters.")
-
-    cf_plus  = ((xi_f - gamma_f*sigma_f) + np.sqrt(discriminant)) / sigma_f**2
-    cf_minus = ((xi_f - gamma_f*sigma_f) - np.sqrt(discriminant)) / sigma_f**2
-
-    # Select root giving mean reversion in twisted X^f
-    # Twisted drift coefficient on x^f: xi_f - sigma_f^2 * cf
-    # (must be negative for stationarity)
-    def twisted_drift_coef(cf):
-        return -(xi_f - sigma_f * (gamma_f + cf * sigma_f))
-
-    drift_plus  = twisted_drift_coef(cf_plus)
-    drift_minus = twisted_drift_coef(cf_minus)
-
-    # Choose cf so that twisted process is mean reverting (drift_coef < 0)
-    if drift_minus < 0:
-        cf = cf_minus
-    elif drift_plus < 0:
-        cf = cf_plus
-    else:
-        # Pick the one with smaller |cf|
-        cf = cf_minus if abs(cf_minus) < abs(cf_plus) else cf_plus
-
-    # Eigenvalue
+    disc = ((xi_f - gamma_f * sigma_f) ** 2
+            - sigma_f ** 2 * (2 * beta_f + gamma_f ** 2))
+
+    if disc < 0:
+        raise ValueError("No real affine eigenfunction for these parameters.")
+
+    root = np.sqrt(disc)
+    cf_plus = ((xi_f - gamma_f * sigma_f) + root) / sigma_f ** 2
+    cf_minus = ((xi_f - gamma_f * sigma_f) - root) / sigma_f ** 2
+
+    def mean_reversion(cf):
+        return xi_f - sigma_f * (gamma_f + cf * sigma_f)
+
+    candidates = [(cf_minus, mean_reversion(cf_minus)),
+                  (cf_plus, mean_reversion(cf_plus))]
+
+    stationary = [(cf, mr) for cf, mr in candidates if mr > 0]
+
+    if not stationary:
+        raise ValueError("Neither root gives a stationary twisted process.")
+
+    cf, mr = stationary[0]
+
     rho = (beta_bar
-           + gamma_o**2 / 2
+           + gamma_o ** 2 / 2
            + cf * xi_f * xbar_f
            + co * (xi_o * xbar_o + gamma_o * sigma_o)
-           + co**2 * sigma_o**2 / 2)
+           + co ** 2 * sigma_o ** 2 / 2)
 
-    return cf, co, rho
+    return cf, co, rho, mr
 ```
 
+### A Breeden SDF
+
+{cite:t}`Breeden1979` studies a consumption-based continuous-time asset
+pricing model.
+
+In the present state specification, suppose log consumption satisfies
+
+$$
+    dc_t
+    =
+    X_t^o dt
+    + \sqrt{X_t^f}\vartheta_f dB_t^f
+    + \vartheta_o dB_t^o .
+$$
+
+With time-separable CRRA utility, risk-aversion coefficient $a$, and
+subjective discount rate $b$, the stochastic discount factor is
+
+$$
+    S_t
+    =
+    \exp(-bt-a(c_t-c_0)).
+$$
+
+Thus it has the affine parameters
+
+$$
+    \bar\beta^s = -b,
+    \qquad
+    \beta_f^s = 0,
+    \qquad
+    \beta_o^s = -a,
+    \qquad
+    \gamma_f^s = -a\vartheta_f,
+    \qquad
+    \gamma_o^s = -a\vartheta_o .
+$$ (eq:breeden-sdf-params)
+
+Recursive preferences of {cite:t}`Kreps_Porteus1978` and
+{cite:t}`Epstein_Zin1989`, used in long-run risk models such as
+{cite:t}`Bansal_Yaron_2004`, add forward-looking terms to the SDF.
+
+The operator calculations below are the same once the parameters
+$(\bar\beta,\beta_f,\beta_o,\gamma_f,\gamma_o)$ are specified.
+
 ```{code-cell} ipython3
-# Breeden (1979) consumption-based model parameters
-# (Section 3.3 and 8.1 of Hansen-Scheinkman 2009)
-params_breeden = {
-    'xi_f'  : 2.5,    # mean reversion speed of X^f
-    'xbar_f': 0.04,   # long-run mean of X^f (average variance)
-    'sigma_f': -0.1,  # volatility of X^f (negative by convention)
-    'xi_o'  : 0.1,    # mean reversion speed of X^o
-    'xbar_o': 0.02,   # long-run mean of X^o (average growth)
-    'sigma_o': 0.02,  # volatility of X^o
-    # Stochastic discount factor loadings (Breeden CRRA with risk aversion a)
-    'a'     : 5.0,    # risk aversion
-    'theta_f': 1.0,   # consumption-volatility loading
-    'theta_o': 1.0,   # consumption-growth loading
+params_state = {
+    "xi_f": 0.70,
+    "xbar_f": 0.04,
+    "sigma_f": -0.20,
+    "xi_o": 0.50,
+    "xbar_o": 0.02,
+    "sigma_o": 0.01,
 }
 
-a  = params_breeden['a']
-theta_f = params_breeden['theta_f']
-theta_o = params_breeden['theta_o']
-sigma_f = params_breeden['sigma_f']
-sigma_o = params_breeden['sigma_o']
-xi_f    = params_breeden['xi_f']
-xi_o    = params_breeden['xi_o']
-
-# For the Breeden SDF: S_t = exp(A^s_t) where
-#   beta_bar = -b (subjective discount rate)
-#   beta_f   = -a * beta_o_in_sdf (risk aversion x volatility loading)
-#   gamma_f  = -a * sqrt(x^f) * theta_f  => loading
-#   gamma_o  = -a * theta_o
-b = 0.03   # subjective discount rate
+a = 4.0
+b = 0.03
+theta_f = 0.06
+theta_o = 0.02
 
 params_sdf = {
-    'xi_f'   : xi_f,
-    'xbar_f' : params_breeden['xbar_f'],
-    'sigma_f': sigma_f,
-    'xi_o'   : xi_o,
-    'xbar_o' : params_breeden['xbar_o'],
-    'sigma_o': sigma_o,
-    'beta_bar': -b,
-    'beta_f'  : 0.0,       # no x^f level loading in SDF
-    'beta_o'  : -a * params_breeden['xbar_o'],   # approximate
-    'gamma_f' : -a * theta_f,
-    'gamma_o' : -a * theta_o,
+    **params_state,
+    "beta_bar": -b,
+    "beta_f": 0.0,
+    "beta_o": -a,
+    "gamma_f": -a * theta_f,
+    "gamma_o": -a * theta_o,
 }
 
-cf, co, rho = solve_affine_eigenfunction(params_sdf)
+cf_s, co_s, rho_s, mr_s = solve_affine_eigenfunction(params_sdf)
 
-print("Affine eigenfunction φ(x^f, x^o) = exp(c_f x^f + c_o x^o)")
-print(f"  c_f = {cf:.6f}")
-print(f"  c_o = {co:.6f}")
-print(f"\nPrincipal eigenvalue ρ = {rho:.6f}")
-print(f"\nInterpretation:")
-print(f"  Long-run SDF growth rate = {rho:.4f}")
-print(f"  Long-run risk-free rate ≈ {-rho:.4f}")
+print("principal eigenfunction phi(xf, xo) = exp(cf xf + co xo)")
+print(f"cf = {cf_s:.6f}")
+print(f"co = {co_s:.6f}")
+print(f"rho = {rho_s:.6f}")
+print(f"twisted mean-reversion coefficient for Xf = {mr_s:.6f}")
+print(f"long-run zero-coupon yield = {-rho_s:.4f}")
 ```
 
-### Sensitivity to Risk Aversion
-
-A key result of {cite}`HansenScheinkman2009` is that the eigenvalue $\rho$
-encodes the long-run risk adjustment.
-
-We can trace out a **long-run
-risk-return frontier** by varying risk exposure.
+The rejected root for $c_f$ would make the twisted volatility process
+explosive rather than stationary.
 
 ```{code-cell} ipython3
-# Vary risk aversion and trace the long-run eigenvalue
-a_values = np.linspace(0.5, 10.0, 50)
-rho_values = []
-
-for a_val in a_values:
-    p = dict(params_sdf)   # copy
-    p['beta_o']  = -a_val * params_breeden['xbar_o']
-    p['gamma_f'] = -a_val * theta_f
-    p['gamma_o'] = -a_val * theta_o
-    try:
-        _, _, rho_val = solve_affine_eigenfunction(p)
-        rho_values.append(rho_val)
-    except ValueError:
-        rho_values.append(np.nan)
-
-fig, ax = plt.subplots()
-ax.plot(a_values, rho_values, 'b-', lw=2)
-ax.set_xlabel('Risk aversion $a$')
-ax.set_ylabel('Principal eigenvalue $\\rho$')
-ax.set_title('Long-run decay rate of SDF vs. risk aversion')
-ax.axhline(0, color='k', ls=':', lw=0.8)
-plt.tight_layout()
-plt.show()
+xi_f = params_sdf["xi_f"]
+sigma_f = params_sdf["sigma_f"]
+gamma_f = params_sdf["gamma_f"]
+beta_f = params_sdf["beta_f"]
+
+disc = ((xi_f - gamma_f * sigma_f) ** 2
+        - sigma_f ** 2 * (2 * beta_f + gamma_f ** 2))
+root = np.sqrt(disc)
+
+cf_candidates = np.array([
+    ((xi_f - gamma_f * sigma_f) - root) / sigma_f ** 2,
+    ((xi_f - gamma_f * sigma_f) + root) / sigma_f ** 2
+])
+
+for cf in cf_candidates:
+    mr = xi_f - sigma_f * (gamma_f + cf * sigma_f)
+    print(f"cf = {cf:8.4f}, twisted mean reversion = {mr:8.4f}")
 ```
 
-## The Multiplicative Decomposition in the Diffusion Example
+### The Martingale Component
 
-Given the affine eigenfunction, we can explicitly construct the martingale
-component $\hat{M}$ and illustrate the decomposition.
-
-The martingale $\hat{M}_t = e^{\hat{A}_t}$ where
+For the affine example, the logarithm of the martingale component $\hat M$ is
 
 $$
-\hat{A}_t = \int_0^t \sqrt{X^f_s}(\gamma_f + c_f\sigma_f)\,dB^f_s
-           + \int_0^t (\gamma_o + c_o\sigma_o)\,dB^o_s
-           - \frac{(\gamma_f + c_f\sigma_f)^2}{2}\int_0^t X^f_s\,ds
-           - \frac{(\gamma_o + c_o\sigma_o)^2}{2} t.
+\begin{aligned}
+\hat A_t
+&=
+\int_0^t
+    \sqrt{X_s^f}(\gamma_f+c_f\sigma_f)dB_s^f
++ \int_0^t
+    (\gamma_o+c_o\sigma_o)dB_s^o
+\\
+&\quad
+- \frac{1}{2}
+  \int_0^t
+    X_s^f(\gamma_f+c_f\sigma_f)^2 ds
+- \frac{1}{2}
+  \int_0^t
+    (\gamma_o+c_o\sigma_o)^2 ds .
+\end{aligned}
 $$
 
-The **twisted drift** for $X^f$ under the $\hat{M}$-measure is
+The corresponding drifts under the twisted measure are
 
 $$
-\xi_f(\bar{x}_f - x^f) + x^f \sigma_f(\gamma_f + c_f\sigma_f),
+\begin{aligned}
+dX_t^f:
+\quad&
+\xi_f(\bar x_f-X_t^f)
++ X_t^f\sigma_f(\gamma_f+c_f\sigma_f),
+\\
+dX_t^o:
+\quad&
+\xi_o(\bar x_o-X_t^o)
++ \sigma_o(\gamma_o+c_o\sigma_o).
+\end{aligned}
 $$
 
-and for $X^o$:
-
-$$
-\xi_o(\bar{x}_o - x^o) + \sigma_o(\gamma_o + c_o\sigma_o).
-$$
+The code below simulates the state and constructs the three factors in
+{eq}`eq:hs-factorization`.
 
 ```{code-cell} ipython3
-def simulate_diffusion(params_sdf, T=50.0, dt=0.01, seed=42):
+def simulate_states(params, T=40.0, dt=0.01, seed=1234):
     """
-    Simulate the Hansen-Scheinkman affine diffusion and
-    the multiplicative decomposition M_t = exp(ρt) M̂_t φ(X0)/φ(X_t).
-
-    Returns
-    -------
-    times : array of time points
-    Xf, Xo : state paths
-    Mt : M_t path
-    Mt_hat : M̂_t path (martingale component)
-    phi_ratio : φ(X0)/φ(X_t) path (transient component)
-    rho : eigenvalue
+    Euler simulation of the affine state process.
     """
     rng = np.random.default_rng(seed)
-    cf, co, rho = solve_affine_eigenfunction(params_sdf)
 
-    xi_f   = params_sdf['xi_f']
-    xbar_f = params_sdf['xbar_f']
-    sigma_f = params_sdf['sigma_f']
-    xi_o   = params_sdf['xi_o']
-    xbar_o = params_sdf['xbar_o']
-    sigma_o = params_sdf['sigma_o']
-    beta_bar = params_sdf['beta_bar']
-    beta_f   = params_sdf['beta_f']
-    beta_o   = params_sdf['beta_o']
-    gamma_f  = params_sdf['gamma_f']
-    gamma_o  = params_sdf['gamma_o']
+    n = int(T / dt)
+    t = np.linspace(0, T, n + 1)
+    Xf = np.empty(n + 1)
+    Xo = np.empty(n + 1)
 
-    n_steps = int(T / dt)
-    times = np.linspace(0, T, n_steps + 1)
+    Xf[0] = params["xbar_f"]
+    Xo[0] = params["xbar_o"]
 
-    # Initialize at long-run means
-    Xf = np.zeros(n_steps + 1)
-    Xo = np.zeros(n_steps + 1)
-    Xf[0] = xbar_f
-    Xo[0] = xbar_o
+    for k in range(n):
+        xf = max(Xf[k], 1e-10)
+        xo = Xo[k]
 
-    # Additive functional A_t (log M_t)
-    A = np.zeros(n_steps + 1)
-    A_hat = np.zeros(n_steps + 1)  # log M̂_t
+        dBf = rng.normal() * np.sqrt(dt)
+        dBo = rng.normal() * np.sqrt(dt)
 
-    for i in range(n_steps):
-        xf = max(Xf[i], 1e-8)
-        xo = Xo[i]
+        Xf[k + 1] = (xf
+                     + params["xi_f"] * (params["xbar_f"] - xf) * dt
+                     + np.sqrt(xf) * params["sigma_f"] * dBf)
+        Xf[k + 1] = max(Xf[k + 1], 1e-10)
 
-        dBf = rng.standard_normal() * np.sqrt(dt)
-        dBo = rng.standard_normal() * np.sqrt(dt)
+        Xo[k + 1] = (xo
+                     + params["xi_o"] * (params["xbar_o"] - xo) * dt
+                     + params["sigma_o"] * dBo)
 
-        # State evolution
-        Xf[i+1] = max(xf + xi_f * (xbar_f - xf) * dt
-                      + np.sqrt(xf) * sigma_f * dBf, 1e-8)
-        Xo[i+1] = xo + xi_o * (xbar_o - xo) * dt + sigma_o * dBo
+    return t, Xf, Xo
 
-        # Additive functional increment
-        dA = (beta_bar + beta_f * xf + beta_o * xo) * dt \
-             + np.sqrt(xf) * gamma_f * dBf \
-             + gamma_o * dBo \
-             + 0.5 * (gamma_f**2 * xf + gamma_o**2) * dt  # Ito correction
 
-        A[i+1] = A[i] + dA
+def additive_log_M(params, t, Xf, Xo, seed=1234):
+    """
+    Recompute the Brownian increments used in simulate_states and construct A_t.
+    """
+    rng = np.random.default_rng(seed)
+    dt = t[1] - t[0]
+    A = np.zeros_like(t)
+
+    for k in range(len(t) - 1):
+        xf = max(Xf[k], 1e-10)
+        xo = Xo[k]
 
-        # Martingale component increment
-        dA_hat = (np.sqrt(xf) * (gamma_f + cf * sigma_f) * dBf
-                  + (gamma_o + co * sigma_o) * dBo
-                  - 0.5 * ((gamma_f + cf * sigma_f)**2 * xf
-                           + (gamma_o + co * sigma_o)**2) * dt)
+        dBf = rng.normal() * np.sqrt(dt)
+        dBo = rng.normal() * np.sqrt(dt)
 
-        A_hat[i+1] = A_hat[i] + dA_hat
+        drift = (params["beta_bar"]
+                 + params["beta_f"] * xf
+                 + params["beta_o"] * xo)
 
-    phi0 = np.exp(cf * Xf[0] + co * Xo[0])
-    phi_t = np.exp(cf * Xf + co * Xo)
+        shock = (np.sqrt(xf) * params["gamma_f"] * dBf
+                 + params["gamma_o"] * dBo)
+
+        A[k + 1] = A[k] + drift * dt + shock
+
+    return A
 
-    Mt     = np.exp(A)
-    Mt_hat = np.exp(A_hat)
-    phi_ratio = phi0 / phi_t
 
-    return times, Xf, Xo, Mt, Mt_hat, phi_ratio, rho, cf, co
+t, Xf, Xo = simulate_states(params_sdf)
+A_log = additive_log_M(params_sdf, t, Xf, Xo)
 
+phi0 = np.exp(cf_s * Xf[0] + co_s * Xo[0])
+phit = np.exp(cf_s * Xf + co_s * Xo)
 
-times, Xf, Xo, Mt, Mt_hat, phi_ratio, rho, cf, co = simulate_diffusion(
-    params_sdf, T=30.0, dt=0.01
-)
+M = np.exp(A_log)
+M_hat = np.exp(-rho_s * t) * M * phit / phi0
+transient = phi0 / phit
 
-print(f"ρ = {rho:.6f},  c_f = {cf:.4f},  c_o = {co:.4f}")
+identity_error = np.max(np.abs(M - np.exp(rho_s * t) * M_hat * transient))
+print(f"maximum factorization error = {identity_error:.2e}")
 ```
 
 ```{code-cell} ipython3
-# Plot the three components of the decomposition
 fig, axes = plt.subplots(2, 2, figsize=(12, 8))
 
-ax = axes[0, 0]
-ax.plot(times, Xf, 'b-', lw=1)
-ax.set_title('$X^f_t$ (stochastic volatility)')
-ax.set_xlabel('$t$')
-
-ax = axes[0, 1]
-ax.plot(times, Xo, 'g-', lw=1)
-ax.set_title('$X^o_t$ (predictable growth)')
-ax.set_xlabel('$t$')
-
-ax = axes[1, 0]
-ax.plot(times, Mt, 'b-', lw=1.5, label='$M_t$')
-ax.plot(times, np.exp(rho * times) * Mt_hat * phi_ratio, 'r--',
-        lw=1, label='$e^{\\rho t}\\hat{M}_t\\phi(X_0)/\\phi(X_t)$')
-ax.set_title('Decomposition check: $M_t = e^{\\rho t}\\hat{M}_t \\phi(X_0)/\\phi(X_t)$')
-ax.set_xlabel('$t$')
-ax.legend(fontsize=9)
-
-ax = axes[1, 1]
-ax.plot(times, np.exp(rho * times), 'k-', lw=1.5, label=f'$e^{{\\rho t}}$, ρ={rho:.4f}')
-ax.plot(times, Mt_hat, 'b-', lw=1, alpha=0.7, label='$\\hat{M}_t$ (martingale)')
-ax.plot(times, phi_ratio, 'r-', lw=1, alpha=0.7, label='$\\phi(X_0)/\\phi(X_t)$ (transient)')
-ax.set_title('Three components of $M_t$')
-ax.set_xlabel('$t$')
-ax.legend(fontsize=9)
-
-plt.suptitle('Multiplicative Decomposition of SDF', fontsize=13, y=1.01)
+axes[0, 0].plot(t, Xf)
+axes[0, 0].set_title("$X_t^f$")
+axes[0, 0].set_xlabel("$t$")
+
+axes[0, 1].plot(t, Xo)
+axes[0, 1].set_title("$X_t^o$")
+axes[0, 1].set_xlabel("$t$")
+
+axes[1, 0].plot(t, M, label="$M_t$")
+axes[1, 0].plot(t, np.exp(rho_s * t) * M_hat * transient,
+                "--", label="factorization")
+axes[1, 0].set_title("Multiplicative Factorization")
+axes[1, 0].set_xlabel("$t$")
+axes[1, 0].legend()
+
+axes[1, 1].plot(t, np.exp(rho_s * t), label="$\\exp(\\rho t)$")
+axes[1, 1].plot(t, M_hat, label="$\\hat M_t$", alpha=0.8)
+axes[1, 1].plot(t, transient, label="$\\phi(X_0)/\\phi(X_t)$", alpha=0.8)
+axes[1, 1].set_title("Three Components")
+axes[1, 1].set_xlabel("$t$")
+axes[1, 1].legend()
+
 plt.tight_layout()
 plt.show()
 ```
 
-## Long-Run Risk-Return Trade-offs
+## Long-Run Risk Prices
 
-### The Short-Run (Local) Trade-off
+Local continuous-time pricing is expressed through instantaneous risk prices.
 
-From Corollary 3.1 of {cite}`HansenScheinkman2009`, the instantaneous
-required expected rate of return for a portfolio with Brownian exposure
-$\gamma_v$ to the SDF with Brownian component $\gamma_s$ is
+Suppose the SDF has Brownian loading $\gamma^s$.
+
+For a valuation functional with Brownian exposure $\gamma^v$, Corollary 3.1 of
+{cite:t}`HansenScheinkman2009` gives the Brownian part of the local required
+expected return as
 
 $$
-\varepsilon_v = -\beta_s - \gamma_v \cdot \gamma_s - \frac{|\gamma_s|^2}{2}.
+    -\gamma^v \cdot \gamma^s .
 $$
 
-The vector $-\gamma_s$ contains the **local (instantaneous) risk prices**.
+Thus the local price of exposure to a Brownian shock is $-\gamma^s$.
 
-### The Long-Run Trade-off via Changing Cash Flows
+Long-run prices differ because a shock can move persistent state variables
+that influence future cash-flow growth or future discounting.
 
-For a cash flow $D_t = G_t \psi(X_t) D_0$ with growth functional $G =
-e^{A^g}$, the **long-run risk-adjusted return** is $-\rho + \delta$, where:
+In the affine model, the local price of exposure to $B^o$ is
 
-- $\delta$ is the expected growth rate of $G$, and
-- $\rho$ is the principal eigenvalue of the semigroup built from $M = GS$.
+$$
+    -\gamma_o^s .
+$$
 
-```{admonition} Long-Run Risk Price Formula
-For the affine diffusion with an Ornstein–Uhlenbeck growth predictor $X^o$,
-the long-run risk price for exposure $\gamma^g_o$ to the $B^o$ shock is
+The long-run price of exposure to $B^o$ in the cash-flow valuation problem is
 
 $$
-\frac{d\rho}{d\gamma^g_o} = -\gamma^s_o - \frac{\beta^s_o}{\xi_o}\,\sigma_o.
-$$
+    -\gamma_o^s
+    - \frac{\beta_o^s}{\xi_o}\sigma_o .
+$$ (eq:long-run-price-o)
+
+The second term is the persistence adjustment.
 
-The term $\beta^s_o / \xi_o$ captures the **persistence effect**: a shock to
-$X^o$ reverberates over a horizon of order $1/\xi_o$.
+A shock to $B^o$ moves the persistent growth predictor $X^o$.
 
-The more persistent
-the growth process, the larger the long-run risk price relative to the
-local risk price $-\gamma^s_o$.
+Because $X^o$ mean reverts at rate $\xi_o$, the cumulative effect of the shock
+is larger when $\xi_o$ is smaller.
+
+```{code-cell} ipython3
+gamma_s_o = params_sdf["gamma_o"]
+beta_s_o = params_sdf["beta_o"]
+xi_o = params_sdf["xi_o"]
+sigma_o = params_sdf["sigma_o"]
+
+local_price_o = -gamma_s_o
+long_run_price_o = -gamma_s_o - (beta_s_o / xi_o) * sigma_o
+
+print(f"local price of B^o exposure    = {local_price_o:.4f}")
+print(f"long-run price of B^o exposure = {long_run_price_o:.4f}")
 ```
 
+The next cell illustrates how persistence changes the wedge between local and
+long-run prices.
+
 ```{code-cell} ipython3
-def long_run_risk_return(gamma_g_o_values, params_sdf):
-    """
-    Compute the long-run risk-adjusted return -ρ+δ for varying
-    cash-flow exposure γ^g_o to the B^o shock.
+xi_o_grid = np.array([0.10, 0.20, 0.50, 1.00, 2.00, 5.00])
+local_grid = np.full_like(xi_o_grid, local_price_o)
+long_grid = -gamma_s_o - (beta_s_o / xi_o_grid) * sigma_o
+
+fig, ax = plt.subplots()
+ax.plot(xi_o_grid, local_grid, "--", lw=2, label="local")
+ax.plot(xi_o_grid, long_grid, "o-", lw=2, label="long-run")
+ax.set_xscale("log")
+ax.set_xlabel("mean-reversion speed $\\xi_o$")
+ax.set_ylabel("risk price")
+ax.set_title("Persistence and Long-Run Risk Prices")
+ax.legend()
+plt.show()
+```
+
+### Changing Cash-Flow Risk
+
+Let a cash-flow growth functional be
+
+$$
+\begin{aligned}
+A_t^g
+&=
+\delta t
++ \int_0^t \sqrt{X_s^f}\gamma_f^g dB_s^f
++ \int_0^t \gamma_o^g dB_s^o
+\\
+&\quad
+- \frac{1}{2}
+  \int_0^t
+    \left[
+        X_s^f(\gamma_f^g)^2 + (\gamma_o^g)^2
+    \right] ds .
+\end{aligned}
+$$ (eq:growth-functional)
+
+The last line makes $\exp(A_t^g-\delta t)$ a martingale.
+
+To price the cash flow $D_t=D_0G_t\psi(X_t)$, use the semigroup generated by
+$M=GS$.
+
+The combined affine parameters are
+
+$$
+\begin{aligned}
+\bar\beta &= \bar\beta^s+\delta-\frac{(\gamma_o^g)^2}{2},\\
+\beta_f &= \beta_f^s-\frac{(\gamma_f^g)^2}{2},\\
+\beta_o &= \beta_o^s,\\
+\gamma_f &= \gamma_f^s+\gamma_f^g,\\
+\gamma_o &= \gamma_o^s+\gamma_o^g.
+\end{aligned}
+$$
+
+Let $\rho$ be the principal eigenvalue of this $GS$ semigroup.
 
-    The combined multiplicative functional M = GS has loadings:
-        gamma_o = gamma_g_o + gamma_s_o
-        beta_o  = beta_s_o  (unchanged, since G is a martingale)
+Then $-\rho$ is the long-run decay rate in value, and
+
+$$
+    R_\infty = -\rho + \delta
+$$
+
+is the asymptotic required rate of return: the value decay rate $-\rho$ plus the cash-flow growth rate $\delta$.
+
+```{code-cell} ipython3
+def required_return_for_growth_exposure(gamma_g_o, gamma_g_f=0.0, delta=0.02):
+    """
+    Long-run required return -rho + delta for a cash-flow growth exposure.
     """
-    _, _, rho_s = solve_affine_eigenfunction(params_sdf)
-    gamma_s_o = params_sdf['gamma_o']
-    delta = 0.02   # assumed cash-flow growth rate
-
-    rho_vals = []
-    for gamma_g_o in gamma_g_o_values:
-        p = dict(params_sdf)
-        p['gamma_o'] = gamma_s_o + gamma_g_o   # combined loading
-        # beta_bar includes growth correction: delta - (gamma_g_o)^2/2
-        p['beta_bar'] = params_sdf['beta_bar'] + delta - 0.5 * gamma_g_o**2
-        try:
-            _, _, rho_val = solve_affine_eigenfunction(p)
-            rho_vals.append(-rho_val + delta)
-        except ValueError:
-            rho_vals.append(np.nan)
-    return np.array(rho_vals)
-
-
-gamma_g_o_vals = np.linspace(-0.5, 0.5, 100)
-ret_vals = long_run_risk_return(gamma_g_o_vals, params_sdf)
-
-# Local risk price: ∂ε_v/∂γ_v = -γ_s_o (constant)
-gamma_s_o = params_sdf['gamma_o']
-local_slope = -gamma_s_o
-local_return = -params_sdf['beta_bar'] + local_slope * gamma_g_o_vals
+    p = dict(params_sdf)
+    p["beta_bar"] = params_sdf["beta_bar"] + delta - 0.5 * gamma_g_o ** 2
+    p["beta_f"] = params_sdf["beta_f"] - 0.5 * gamma_g_f ** 2
+    p["beta_o"] = params_sdf["beta_o"]
+    p["gamma_f"] = params_sdf["gamma_f"] + gamma_g_f
+    p["gamma_o"] = params_sdf["gamma_o"] + gamma_g_o
+
+    _, _, rho, _ = solve_affine_eigenfunction(p)
+    return -rho + delta
+
+
+gamma_g_o_grid = np.linspace(-0.5, 0.5, 101)
+required_returns = np.array([
+    required_return_for_growth_exposure(g) for g in gamma_g_o_grid
+])
+
+local_line = (-params_sdf["beta_bar"]
+              + local_price_o * gamma_g_o_grid)
 
 fig, ax = plt.subplots()
-ax.plot(gamma_g_o_vals, ret_vals, 'b-', lw=2, label='Long-run return $-\\rho+\\delta$')
-ax.plot(gamma_g_o_vals, local_return, 'r--', lw=2, label='Local return approximation')
-ax.set_xlabel('Cash-flow risk exposure $\\gamma^g_o$')
-ax.set_ylabel('Required rate of return')
-ax.set_title('Long-Run vs. Local Risk-Return Trade-off ($B^o$ exposure)')
+ax.plot(gamma_g_o_grid, required_returns, lw=2,
+        label="long-run required return")
+ax.plot(gamma_g_o_grid, local_line, "--", lw=2,
+        label="local slope")
+ax.set_xlabel("cash-flow exposure $\\gamma_o^g$")
+ax.set_ylabel("rate of return")
+ax.set_title("Local and Long-Run Pricing of Persistent Growth Risk")
 ax.legend()
-plt.tight_layout()
 plt.show()
 ```
 
+The slope of the long-run line is the risk price in {eq}`eq:long-run-price-o`.
+
 ```{code-cell} ipython3
-# Quantify the long-run risk price vs local risk price
-xi_o    = params_sdf['xi_o']
-sigma_o = params_sdf['sigma_o']
-beta_s_o = params_sdf['beta_o']
-
-local_price   = -gamma_s_o
-lr_price      = -gamma_s_o - (beta_s_o / xi_o) * sigma_o
-
-print("Risk prices for B^o exposure:")
-print(f"  Local (instantaneous) risk price: {local_price:.4f}")
-print(f"  Long-run risk price:              {lr_price:.4f}")
-print(f"  Persistence amplification factor: {(beta_s_o/xi_o)*sigma_o:.4f}")
-print(f"  (= β^s_o/ξ_o × σ_o, captures reverberation over horizon 1/ξ_o={1/xi_o:.1f})")
+finite_difference = (
+    required_return_for_growth_exposure(0.001)
+    - required_return_for_growth_exposure(-0.001)
+) / 0.002
+
+print(f"finite-difference slope = {finite_difference:.6f}")
+print(f"formula                 = {long_run_price_o:.6f}")
 ```
 
-## Perron–Frobenius Theory and the Finite-State Case
+## Perron-Frobenius Dominance
 
-The long-run dominance result (Proposition 7.1 of {cite}`HansenScheinkman2009`)
-is the continuous-time, general Markov generalization of classical
-Perron–Frobenius theory.
+The finite-state examples make the limiting argument transparent.
 
-Let us illustrate this with a
-three-state Markov chain.
+Let us repeat the calculation for a three-state chain.
 
 ```{code-cell} ipython3
-# Three-state example: expansion, normal, contraction
-N = 3
-state_names = ['Expansion', 'Normal', 'Contraction']
+state_names = ["expansion", "normal", "contraction"]
 
-# Intensity matrix
-U3 = np.array([[-0.4,  0.3,  0.1],
-               [ 0.2, -0.5,  0.3],
-               [ 0.1,  0.2, -0.3]])
+U3 = np.array([[-0.40,  0.30,  0.10],
+               [ 0.20, -0.50,  0.30],
+               [ 0.10,  0.20, -0.30]])
 
-# Discount rates (higher in expansion = rich economy)
-beta3 = np.array([0.06, 0.04, 0.01])
-
-# No jumps
+r3 = np.array([0.06, 0.04, 0.01])
 kappa3 = np.zeros((3, 3))
-A3 = build_generator(U3, beta3, kappa3)
-rho3, phi3 = principal_eigen(A3)
-
-print("Three-state Markov chain")
-print(f"\nGenerator matrix A:")
-print(np.round(A3, 3))
-print(f"\nPrincipal eigenvalue ρ = {rho3:.6f}")
-print(f"Principal eigenfunction φ = {phi3}")
-
-# Verify Perron-Frobenius dominance
-eigenvalues, _ = eig(A3)
-real_eigs = sorted(eigenvalues.real, reverse=True)
-print(f"\nAll eigenvalues (real parts): {[f'{e:.4f}' for e in real_eigs]}")
-print("ρ is strictly largest: confirms long-run dominance")
+
+A3 = build_generator(U3, r3, kappa3)
+rho3, phi3 = principal_eigenpair(A3)
+A3_hat = twisted_generator(A3, rho3, phi3)
+varsigma3 = stationary_distribution(A3_hat)
+
+print(f"rho = {rho3:.6f}")
+print(f"phi = {phi3}")
+print(f"varsigma_hat = {varsigma3}")
+
+eigs3 = np.sort(eig(A3, right=False).real)[::-1]
+print("eigenvalues by real part:")
+print(np.round(eigs3, 6))
 ```
 
 ```{code-cell} ipython3
-# Demonstrate long-run dominance: exp(-ρt) M_t ψ → φ ∫(ψ/φ) dς̂
-# for three different initial functions ψ
-
-# Compute twisted stationary distribution
-phi3_diag_inv = np.diag(1.0 / phi3)
-phi3_diag     = np.diag(phi3)
-A3_hat = phi3_diag_inv @ A3 @ phi3_diag - rho3 * np.eye(3)
-
-evals3, evecs3 = eig(A3_hat.T)
-idx0 = np.argmin(np.abs(evals3.real))
-varsigma3 = evecs3[:, idx0].real
-varsigma3 = np.abs(varsigma3) / np.abs(varsigma3).sum()
-
-psi_functions = {
-    'ψ = [1, 0, 0]': np.array([1.0, 0.0, 0.0]),
-    'ψ = [0, 1, 0]': np.array([0.0, 1.0, 0.0]),
-    'ψ = [1, 2, 3]': np.array([1.0, 2.0, 3.0]),
+psi_list = {
+    "$\\psi=(1,0,0)$": np.array([1.0, 0.0, 0.0]),
+    "$\\psi=(0,1,0)$": np.array([0.0, 1.0, 0.0]),
+    "$\\psi=(1,2,3)$": np.array([1.0, 2.0, 3.0]),
 }
 
-t_grid = np.linspace(0, 30, 200)
+t_grid = np.linspace(0, 35, 220)
+colors = ["C0", "C1", "C2"]
+
 fig, axes = plt.subplots(1, 3, figsize=(14, 4))
 
-for ax, (label, psi) in zip(axes, psi_functions.items()):
+for ax, (label, psi) in zip(axes, psi_list.items()):
     limit = phi3 * np.sum((psi / phi3) * varsigma3)
 
-    for state_idx, color in enumerate(['b', 'g', 'r']):
-        vals = []
-        for t in t_grid:
-            Mt = expm(t * A3)
-            approx = np.exp(-rho3 * t) * (Mt @ psi)
-            vals.append(approx[state_idx])
-        ax.plot(t_grid, vals, color=color, lw=1.5, alpha=0.7,
-                label=f'State {state_idx+1}')
-        ax.axhline(limit[state_idx], color=color, ls='--', lw=0.8)
+    for i, color in enumerate(colors):
+        path = []
+        for t_val in t_grid:
+            value = np.exp(-rho3 * t_val) * expm(t_val * A3) @ psi
+            path.append(value[i])
+
+        ax.plot(t_grid, path, color=color, lw=1.5,
+                label=state_names[i])
+        ax.axhline(limit[i], color=color, ls="--", lw=1)
 
     ax.set_title(label)
-    ax.set_xlabel('$t$')
-    ax.set_ylabel('$e^{-\\rho t}\\mathbb{M}_t\\psi$')
+    ax.set_xlabel("$t$")
+    ax.set_ylabel("$\\exp(-\\rho t)\\mathbb{M}_t\\psi$")
 
-axes[0].legend(fontsize=9)
-fig.suptitle('Long-Run Dominance: $e^{-\\rho t}\\mathbb{M}_t\\psi \\to \\phi\\int(\\psi/\\phi)\\,d\\hat{\\varsigma}$\n'
-             '(dashed lines = theoretical limits)', fontsize=11)
+axes[0].legend()
 plt.tight_layout()
 plt.show()
 ```
 
 ## Summary
 
-This lecture has illustrated the main ideas of {cite}`HansenScheinkman2009`:
+The Hansen-Scheinkman approach studies long-run risk by studying
+positive eigenfunctions of valuation semigroups.
 
-1. **Multiplicative functionals** and their associated semigroups are the
-   natural language for intertemporal asset pricing.
+The main steps are:
 
-2. The **principal eigenvalue** $\rho$ and **eigenfunction** $\phi$ of the
-   semigroup generator provide the long-run risk-return relationship:
-   $\rho$ is the asymptotic growth (or decay) rate and $\phi$ determines
-   the limiting state dependence.
+1. Model discounting, growth, or cumulated returns by a positive
+   multiplicative functional $M$.
+2. Build the semigroup
+   $\mathbb M_t\psi(x)=E[M_t\psi(X_t)\mid X_0=x]$.
+3. Solve the principal eigenvalue problem
+   $\mathbb A\phi=\rho\phi$.
+4. Use the factorization
+   $M_t=\exp(\rho t)\hat M_t\phi(X_0)/\phi(X_t)$.
+5. Under the twisted probability measure induced by $\hat M$, use stability
+   to obtain long-run approximations of the form {eq}`eq:long-run-limit`.
 
-3. The **multiplicative decomposition**
-   $M_t = e^{\rho t}\hat{M}_t(\phi(X_0)/\phi(X_t))$
-   separates permanent ($e^{\rho t}\hat{M}_t$) from transient
-   ($\phi(X_0)/\phi(X_t)$) components.
+In finite-state problems, this is Perron-Frobenius theory.
 
-4. In finite-state chains, this is exactly **Perron–Frobenius theory**.
+In affine diffusion problems, exponential-affine eigenfunctions often produce
+closed-form formulas.
 
-5. For the affine diffusion example, the eigenfunction is exponential in
-   the state, and the eigenvalue formula reveals how **persistence**
-   amplifies long-run risk prices beyond their local counterparts.
+The long-run risk prices that emerge can differ sharply from local risk prices
+when shocks move persistent state variables.
+
+This persistence effect is the economic channel emphasized in long-run risk
+asset-pricing models and in the empirical work of {cite:t}`hansen2008consumption`.
 
 ## Exercises
 
@@ -1053,91 +1245,96 @@ This lecture has illustrated the main ideas of {cite}`HansenScheinkman2009`:
 Consider a two-state Markov chain with intensity matrix
 
 $$
-\mathbb{U} = \begin{pmatrix} -\lambda & \lambda \\ \mu & -\mu \end{pmatrix}
+U =
+\begin{pmatrix}
+    -\lambda & \lambda \\
+    \mu & -\mu
+\end{pmatrix}.
 $$
 
-and a multiplicative functional with decay rates $\beta_1 > 0$ in state 1
-and $\beta_2 = 0$ in state 2, and no jump scaling.
+Let the multiplicative functional have decay rate $r_1>0$ in state 1, decay
+rate $r_2=0$ in state 2, and no jump scaling.
 
-(a) Write down the generator matrix $\mathbb{A}$.
+(a) Write down the generator matrix $A$.
 
-(b) Find the principal eigenvalue $\rho$ in terms of $\lambda$, $\mu$,
-    and $\beta_1$.
+(b) Find the principal eigenvalue $\rho$ in terms of $\lambda$, $\mu$, and
+$r_1$.
 
-(c) Verify numerically with $\lambda = 0.4$, $\mu = 0.6$, $\beta_1 = 0.05$
-    that your formula matches the output of `principal_eigen`.
+(c) Verify numerically with $\lambda=0.4$, $\mu=0.6$, and $r_1=0.05$.
 
-(d) Show that $\rho$ lies strictly between $-\beta_1$ and $0$.
+(d) Show that $-r_1 < \rho < 0$.
 ```
 
 ```{solution-start} lrr_ex1
 :class: dropdown
 ```
 
-**(a)** The generator matrix is
+*(a)* The generator is
 
 $$
-\mathbb{A} = \begin{pmatrix} -\lambda - \beta_1 & \lambda \\ \mu & -\mu \end{pmatrix}
+A =
+\begin{pmatrix}
+    -\lambda-r_1 & \lambda \\
+    \mu & -\mu
+\end{pmatrix}.
 $$
 
-**(b)** The eigenvalues solve $\det(\mathbb{A} - \rho I) = 0$:
+*(b)* The characteristic equation is
 
 $$
-(-\lambda - \beta_1 - \rho)(-\mu - \rho) - \lambda\mu = 0
+    \rho^2 + (\lambda+\mu+r_1)\rho + \mu r_1 = 0.
 $$
 
-Expanding:
+Hence the principal eigenvalue is the larger root
 
 $$
-\rho^2 + (\lambda + \mu + \beta_1)\rho + \mu\beta_1 = 0
+\rho
+=
+\frac{
+    -(\lambda+\mu+r_1)
+    + \sqrt{(\lambda+\mu+r_1)^2 - 4\mu r_1}
+}{2}.
 $$
 
-The principal eigenvalue is the larger root:
-
-$$
-\rho = \frac{-(\lambda + \mu + \beta_1) + \sqrt{(\lambda + \mu + \beta_1)^2 - 4\mu\beta_1}}{2}
-$$
-
-**(c)** Numerical verification:
+*(c)* Numerical verification:
 
 ```{code-cell} ipython3
-lam, mu_val, b1 = 0.4, 0.6, 0.05
+lam, mu, r1 = 0.4, 0.6, 0.05
 
-# Analytical formula
-disc = (lam + mu_val + b1)**2 - 4 * mu_val * b1
-rho_analytical = (-( lam + mu_val + b1) + np.sqrt(disc)) / 2
+disc = (lam + mu + r1) ** 2 - 4 * mu * r1
+rho_formula = (-(lam + mu + r1) + np.sqrt(disc)) / 2
 
-# Numerical
-U_ex = np.array([[-lam, lam], [mu_val, -mu_val]])
-beta_ex = np.array([b1, 0.0])
+U_ex = np.array([[-lam, lam],
+                 [mu, -mu]])
+r_ex = np.array([r1, 0.0])
 kappa_ex = np.zeros((2, 2))
-A_ex = build_generator(U_ex, beta_ex, kappa_ex)
-rho_numerical, phi_ex = principal_eigen(A_ex)
 
-print(f"Analytical ρ = {rho_analytical:.8f}")
-print(f"Numerical  ρ = {rho_numerical:.8f}")
-print(f"Difference   = {abs(rho_analytical - rho_numerical):.2e}")
+A_ex = build_generator(U_ex, r_ex, kappa_ex)
+rho_numeric, phi_numeric = principal_eigenpair(A_ex)
+
+print(f"formula  rho = {rho_formula:.8f}")
+print(f"numeric  rho = {rho_numeric:.8f}")
+print(f"difference   = {abs(rho_formula-rho_numeric):.2e}")
 ```
 
-**(d)** From the quadratic $\rho^2 + (\lambda+\mu+\beta_1)\rho + \mu\beta_1 = 0$:
+*(d)* Let
+
+$$
+q(x)=x^2+(\lambda+\mu+r_1)x+\mu r_1.
+$$
 
-- At $\rho = 0$: LHS $= \mu\beta_1 > 0$.
-- At $\rho = -\beta_1$: LHS $= \beta_1^2 - \lambda\beta_1 = \beta_1(\beta_1 - \lambda)$,
-  which can be positive or negative.
-- At $\rho = -(\lambda+\mu+\beta_1)$: LHS $= \mu\beta_1 > 0$.
+Then $q(0)=\mu r_1>0$ and
 
-Since the parabola opens upward and has two real roots summing to $-(\lambda+\mu+\beta_1) < 0$
-with product $\mu\beta_1 > 0$, both roots are negative.  The larger root $\rho$ satisfies
-$-\beta_1 < \rho < 0$ because:
-- $\rho > -(\lambda+\mu+\beta_1) > -\beta_1 - (\lambda+\mu)$, so clearly $\rho > -\infty$.
-- Evaluating the quadratic at $\rho = 0$ gives $\mu\beta_1 > 0$, so 0 is above the right root.
+$$
+q(-r_1)
+=
+-\lambda r_1
+<0.
+$$
 
-```{code-cell} ipython3
-print(f"ρ = {rho_numerical:.6f}")
-print(f"-β₁ = {-b1:.6f}")
-print(f"0 = 0")
-print(f"Is -β₁ < ρ < 0? {-b1 < rho_numerical < 0}")
-```
+Since $q(-r_1)<0<q(0)$, one root lies in $(-r_1,0)$; because the parabola opens upward, the other root lies below $-r_1$.
+
+The principal eigenvalue is the larger root, so $-r_1<\rho<0$.
 
 ```{solution-end}
 ```
@@ -1145,23 +1342,29 @@ print(f"Is -β₁ < ρ < 0? {-b1 < rho_numerical < 0}")
 ```{exercise}
 :label: lrr_ex2
 
-**Long-run vs. short-run risk prices in the affine model.**
+In the affine model, compute the local and long-run prices of exposure to
+$B^o$ for
 
-Using the `solve_affine_eigenfunction` function, compute both local and
-long-run risk prices for varying levels of the mean-reversion parameter
-$\xi_o$ of the Ornstein–Uhlenbeck predictor $X^o$.
+$$
+    \xi_o \in \{0.1, 0.2, 0.5, 1, 2, 5\}.
+$$
 
-Specifically, set $\xi_o \in \{0.05, 0.1, 0.2, 0.5, 1.0, 5.0\}$ and for
-each value:
+Use the formulas
 
-(a) Compute the local risk price for $B^o$ exposure: $-\gamma^s_o$.
+$$
+    \text{local price} = -\gamma_o^s
+$$
 
-(b) Compute the long-run risk price formula:
-    $-\gamma^s_o - (\beta^s_o/\xi_o)\sigma_o$.
+and
 
-(c) Plot both as functions of $\xi_o$ on the same axes.
+$$
+    \text{long-run price}
+    =
+    -\gamma_o^s
+    - \frac{\beta_o^s}{\xi_o}\sigma_o .
+$$
 
-(d) Explain intuitively why the two prices converge as $\xi_o \to \infty$.
+Explain why the two prices converge as $\xi_o \to \infty$.
 ```
 
 ```{solution-start} lrr_ex2
@@ -1169,41 +1372,30 @@ each value:
 ```
 
 ```{code-cell} ipython3
-xi_o_values = np.array([0.05, 0.1, 0.2, 0.5, 1.0, 5.0])
-
-gamma_s_o  = params_sdf['gamma_o']
-beta_s_o   = params_sdf['beta_o']
-sigma_o_val = params_sdf['sigma_o']
+xi_vals = np.array([0.1, 0.2, 0.5, 1.0, 2.0, 5.0])
+local_vals = np.full_like(xi_vals, -params_sdf["gamma_o"])
+long_vals = (-params_sdf["gamma_o"]
+             - (params_sdf["beta_o"] / xi_vals) * params_sdf["sigma_o"])
 
-local_price_val = -gamma_s_o   # constant, independent of ξ_o
-
-lr_prices = []
-for xi_o_val in xi_o_values:
-    lr_price_val = -gamma_s_o - (beta_s_o / xi_o_val) * sigma_o_val
-    lr_prices.append(lr_price_val)
+for xi, lp, lrp in zip(xi_vals, local_vals, long_vals):
+    print(f"xi_o = {xi:3.1f}: local = {lp:.4f}, long-run = {lrp:.4f}")
 
 fig, ax = plt.subplots()
-ax.axhline(local_price_val, color='r', ls='--', lw=2, label='Local risk price $-\\gamma^s_o$')
-ax.plot(xi_o_values, lr_prices, 'bo-', lw=2, ms=8, label='Long-run risk price')
-ax.set_xlabel('Mean-reversion speed $\\xi_o$')
-ax.set_ylabel('Risk price')
-ax.set_title('Local vs. Long-Run Risk Prices for $B^o$ Exposure')
+ax.plot(xi_vals, local_vals, "--", lw=2, label="local")
+ax.plot(xi_vals, long_vals, "o-", lw=2, label="long-run")
+ax.set_xscale("log")
+ax.set_xlabel("$\\xi_o$")
+ax.set_ylabel("risk price")
 ax.legend()
-ax.set_xscale('log')
-plt.tight_layout()
 plt.show()
-
-print(f"Local risk price (all ξ_o): {local_price_val:.4f}")
-print("\nξ_o  |  Long-run risk price")
-for xi_o_val, lr in zip(xi_o_values, lr_prices):
-    print(f"{xi_o_val:.2f}  |  {lr:.4f}")
 ```
 
-**(d)** As $\xi_o \to \infty$, shocks to $X^o$ dissipate extremely quickly
-(the process reverts to its mean almost instantaneously).  A shock today
-has no lasting effect, so there is no "reverberation" to price.  The
-persistence amplification term $\beta^s_o \sigma_o / \xi_o \to 0$, and the
-long-run price converges to the local price $-\gamma^s_o$.
+As $\xi_o$ increases, $X^o$ mean reverts faster.
+
+A shock to $B^o$ then has a shorter-lived effect on future expected growth.
+
+The persistence term $(\beta_o^s/\xi_o)\sigma_o$ converges to zero, so the
+long-run price converges to the local price.
 
 ```{solution-end}
 ```
@@ -1211,29 +1403,29 @@ long-run price converges to the local price $-\gamma^s_o$.
 ```{exercise}
 :label: lrr_ex3
 
-**Numerically illustrate long-run dominance for the three-state chain.**
+Using the three-state example, let $\psi=(3,1,2)$.
 
-Using the three-state example from the lecture (stored in `U3`, `beta3`,
-`A3`, `rho3`, `phi3`, `varsigma3`):
+(a) Compute the theoretical limit
 
-(a) For a generic initial function $\psi = [3, 1, 2]$, compute the
-    theoretical long-run limit $\phi \int (\psi/\phi)\,d\hat{\varsigma}$
-    and the path $t \mapsto e^{-\rho t}\mathbb{M}_t\psi$ for each state.
+$$
+    \phi \sum_i \frac{\psi_i}{\phi_i}\hat\varsigma_i .
+$$
 
-(b) Plot the convergence speed: compute
+(b) Plot
 
-    $$
-    \text{error}(t) = \max_i \left|e^{-\rho_3 t}(\mathbb{M}_t \psi)_i
-                            - \phi_i \int \frac{\psi}{\phi}\,d\hat{\varsigma}\right|
-    $$
+$$
+    \max_i
+    \left|
+        \exp(-\rho t)(\mathbb M_t\psi)_i
+        -
+        \phi_i \sum_j \frac{\psi_j}{\phi_j}\hat\varsigma_j
+    \right|
+$$
 
-    and plot $\log(\text{error}(t))$ vs $t$.  What is the approximate
-    rate of convergence?  Compare to the **spectral gap**
-    $\rho_3 - \rho_2$ where $\rho_2$ is the second-largest real eigenvalue.
+on a logarithmic scale.
 
-(c) How does the convergence rate change if you make the chain more
-    "sluggish" by scaling the intensity matrix as
-    $\mathbb{U} \leftarrow 0.1 \times \mathbb{U}_3$?
+(c) Compare the convergence rate to the spectral gap between the largest and
+second-largest real parts of the eigenvalues of $A$.
 ```
 
 ```{solution-start} lrr_ex3
@@ -1241,76 +1433,39 @@ Using the three-state example from the lecture (stored in `U3`, `beta3`,
 ```
 
 ```{code-cell} ipython3
-# (a) Theoretical limit and convergence
-psi_ex3 = np.array([3.0, 1.0, 2.0])
-limit_ex3 = phi3 * np.sum((psi_ex3 / phi3) * varsigma3)
-
-print("Theoretical long-run limit φ ∫(ψ/φ) dς̂:")
-for i, (s, lim) in enumerate(zip(state_names, limit_ex3)):
-    print(f"  {s}: {lim:.6f}")
-
-# (b) Convergence speed
-t_fine = np.linspace(0.01, 40, 300)
-errors = []
-
-for t in t_fine:
-    Mt = expm(t * A3)
-    approx = np.exp(-rho3 * t) * (Mt @ psi_ex3)
-    errors.append(np.max(np.abs(approx - limit_ex3)))
-
-# Spectral gap
-evals_A3 = sorted(np.linalg.eigvals(A3).real, reverse=True)
-spectral_gap = evals_A3[0] - evals_A3[1]
-print(f"\nSpectral gap ρ₁ - ρ₂ = {spectral_gap:.4f}")
-print(f"Expected convergence rate ≈ {spectral_gap:.4f}")
-
-fig, axes = plt.subplots(1, 2, figsize=(12, 4))
-
-axes[0].semilogy(t_fine, errors, 'b-', lw=2)
-axes[0].set_xlabel('$t$')
-axes[0].set_ylabel('$\\log(\\text{error})$')
-axes[0].set_title('Convergence of $e^{-\\rho t}\\mathbb{M}_t\\psi$ to limit')
-
-# Overlay fitted exponential decay
-t_fit = t_fine[t_fine > 2]
-err_fit = [errors[i] for i, t in enumerate(t_fine) if t > 2]
-log_err = np.log(np.maximum(err_fit, 1e-15))
-t_fit_arr = np.array(t_fit)
-slope = np.polyfit(t_fit_arr, log_err, 1)[0]
-axes[0].plot(t_fine, np.exp(np.log(errors[0]) + slope * t_fine), 'r--',
-             label=f'Fitted rate ≈ {abs(slope):.4f}')
-axes[0].axhline(1e-10, color='k', ls=':', lw=0.8)
-axes[0].legend()
+psi = np.array([3.0, 1.0, 2.0])
+limit = phi3 * np.sum((psi / phi3) * varsigma3)
 
-# (c) Sluggish chain
-U3_slow = 0.1 * U3
-A3_slow = build_generator(U3_slow, beta3, kappa3)
-rho3_slow, phi3_slow = principal_eigen(A3_slow)
+print("limit:")
+for name, value in zip(state_names, limit):
+    print(f"  {name:11s} {value:.6f}")
 
-evals_slow = sorted(np.linalg.eigvals(A3_slow).real, reverse=True)
-gap_slow = evals_slow[0] - evals_slow[1]
+t_vals = np.linspace(0.1, 40, 300)
+errors = np.empty_like(t_vals)
 
-errors_slow = []
-for t in t_fine:
-    Mt = expm(t * A3_slow)
-    limit_slow = phi3_slow * np.sum((psi_ex3 / phi3_slow) * varsigma3)
-    approx = np.exp(-rho3_slow * t) * (Mt @ psi_ex3)
-    errors_slow.append(np.max(np.abs(approx - limit_slow)))
+for n, t_val in enumerate(t_vals):
+    approx = np.exp(-rho3 * t_val) * expm(t_val * A3) @ psi
+    errors[n] = np.max(np.abs(approx - limit))
 
-axes[1].semilogy(t_fine, errors, 'b-', lw=2, label=f'Original (gap={spectral_gap:.3f})')
-axes[1].semilogy(t_fine, errors_slow, 'r-', lw=2, label=f'Sluggish (gap={gap_slow:.3f})')
-axes[1].set_xlabel('$t$')
-axes[1].set_ylabel('$\\log(\\text{error})$')
-axes[1].set_title('Effect of chain speed on convergence')
-axes[1].legend()
+eigenvalues = eig(A3, right=False)
+real_parts = np.sort(eigenvalues.real)[::-1]
+gap = real_parts[0] - real_parts[1]
 
-plt.tight_layout()
+fig, ax = plt.subplots()
+ax.semilogy(t_vals, errors, lw=2)
+ax.set_xlabel("$t$")
+ax.set_ylabel("error")
+ax.set_title(f"Convergence to the Principal Eigenfunction, gap = {gap:.4f}")
 plt.show()
 
-print(f"\nOriginal chain:  spectral gap = {spectral_gap:.4f}, fitted rate = {abs(slope):.4f}")
-print(f"Sluggish chain:  spectral gap = {gap_slow:.4f}")
-print("Convergence is slower when the chain is sluggish (smaller spectral gap)")
+print(f"spectral gap = {gap:.6f}")
 ```
 
+The normalized semigroup converges at an exponential rate governed by the
+separation between the dominant eigenvalue and the remaining eigenvalues.
+
+In this finite-state example, that separation is the spectral gap computed
+above.
+
 ```{solution-end}
 ```

From 91cf7891551cb061747524ada8f24a03f70e9cd3 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Mon, 25 May 2026 14:09:17 +1000
Subject: [PATCH 05/25] updates

---
 lectures/long_run_risk_operator.md | 425 +++++++++++++++++++++++++----
 lectures/rational_learning_re.md   |  59 ++--
 2 files changed, 411 insertions(+), 73 deletions(-)

diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index d3e284111..3dbece188 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -28,19 +28,24 @@ kernelspec:
 
 ## Overview
 
-This lecture studies the operator approach to long-term risk developed by
+This lecture studies the operator approach to long-run risk developed by
 {cite:t}`HansenScheinkman2009`.
 
-The paper asks how asset-pricing risk adjustments behave when the payoff
-horizon becomes large.
-
 Local continuous-time asset pricing tells us how expected returns compensate
 investors for instantaneous exposure to Brownian and jump shocks.
 
-Hansen and Scheinkman instead focus on valuation operators indexed by the
-time between the valuation date and the payoff date.
+Driving the time interval to zero gives a clean limiting object, but it
+describes only the *short end* of the term structure of risk prices.
+
+Hansen and Scheinkman instead study the *long end*: what happens as the time
+between valuation and payoff grows large.
+
+The two ends are complementary — together they pin down the slope of the term
+structure of risk prices, and economic restrictions are often more reliable
+over long horizons than over instantaneous ones.
 
-These operators form a *semigroup*.
+The mathematical vehicle is a family of valuation operators indexed by horizon
+$t$, which form a *semigroup*.
 
 The central object is a positive multiplicative functional $\{M_t\}_{t \geq 0}$,
 such as a stochastic discount factor, a cumulated return, a stochastic growth
@@ -62,7 +67,12 @@ where
 * $\hat M$ is a martingale used to change probability measure, and
 * $\phi(X_0)/\phi(X_t)$ is a transient state-dependent component.
 
-This is the Hansen-Scheinkman factorization.
+```{prf:definition} Multiplicative Factorization
+:label: lrr-def-multiplicative-factorization
+
+A representation of the form {eq}`eq:hs-factorization` is called the
+**multiplicative factorization** associated with $(\rho,\phi,\hat M)$.
+```
 
 It generalizes the Perron-Frobenius decomposition of a positive matrix to
 continuous-time Markov valuation problems.
@@ -92,11 +102,6 @@ We start with imports.
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy.linalg import eig, expm
-
-plt.rcParams.update({
-    "figure.figsize": (10, 6),
-    "font.size": 12
-})
 ```
 
 ## Multiplicative Functionals
@@ -106,7 +111,21 @@ $\mathcal D_0$.
 
 Let $\mathcal F_t$ be the filtration generated by the history of $X$.
 
-An adapted functional $\{M_t\}$ is **multiplicative** if $M_0 = 1$ and
+```{prf:definition} Functional
+:label: lrr-def-functional
+
+A **functional** is a real-valued adapted process $\{M_t : t \geq 0\}$
+constructed from the history of $X$, so that $M_t$ is
+$\mathcal F_t$-measurable for each $t$.
+```
+
+The paper assumes that functionals have versions with right-continuous sample
+paths and left limits.
+
+```{prf:definition} Multiplicative Functional
+:label: lrr-def-multiplicative-functional
+
+A functional $\{M_t : t \geq 0\}$ is **multiplicative** if $M_0 = 1$ and
 
 $$
     M_{t+u} = M_u(\theta_t) M_t ,
@@ -114,6 +133,7 @@ $$
 $$ (eq:multiplicative)
 
 where $\theta_t$ shifts the underlying Markov path forward by $t$ units.
+```
 
 For example, if $S_t$ is a stochastic discount factor, then
 $S_{t+u}/S_t$ is the date-$t$ discount factor for payoffs at date $t+u$.
@@ -123,11 +143,19 @@ The Markov version of this intertemporal consistency condition is exactly
 
 When $M_t > 0$, we can write $M_t = \exp(A_t)$.
 
-Then $A$ is additive:
+The logarithm $A$ then satisfies the following additive property.
+
+```{prf:definition} Additive Functional
+:label: lrr-def-additive-functional
+
+A functional $\{A_t : t \geq 0\}$ is **additive** if $A_0 = 0$ and
 
 $$
     A_{t+u} = A_u(\theta_t) + A_t .
 $$
+```
+
+Exponentials of additive functionals are strictly positive multiplicative functionals.
 
 For the jump-diffusion setting in {cite:t}`HansenScheinkman2009`, a useful
 parameterization is
@@ -142,8 +170,8 @@ $$
 \end{aligned}
 $$ (eq:additive-functional)
 
-The functions $(\beta, \gamma, \kappa)$ control drift, Brownian exposure, and
-jump scaling.
+The functions $(\beta, \gamma, \kappa)$ are the drift, diffusion coefficient,
+and jump amplitudes.
 
 In this notation, $\beta$ is allowed to be positive or negative.
 
@@ -152,7 +180,27 @@ $\beta(x) = -r(x)$.
 
 ## Semigroups
 
-A multiplicative functional defines a family of linear operators
+The operator objects are formalized by the following semigroup definition.
+
+```{prf:definition} One-Parameter Semigroup
+:label: lrr-def-one-parameter-semigroup
+
+A family of linear operators $\{T_t : t \geq 0\}$ is a **one-parameter
+semigroup** if $T_0=I$ and $T_{t+s}=T_tT_s$ for all $s,t \geq 0$.
+```
+
+```{prf:definition} Positive Semigroup
+:label: lrr-def-positive-semigroup
+
+A semigroup $\{T_t : t \geq 0\}$ is **positive** if $T_t\psi \geq 0$
+whenever $\psi \geq 0$ and $t \geq 0$.
+```
+
+```{prf:definition} Multiplicative Semigroup
+:label: lrr-def-multiplicative-semigroup
+
+Given a multiplicative functional $M$, the associated **multiplicative
+semigroup** is the family of operators
 
 $$
     \mathbb M_t \psi(x)
@@ -167,11 +215,35 @@ $$
     \qquad
     \mathbb M_{t+u} = \mathbb M_t \mathbb M_u .
 $$
+```
 
 The proof is just iterated expectations plus the multiplicative property of
 $M$.
 
-The paper uses several multiplicative functionals, summarized as follows.
+Economically, the semigroup property is the Markov version of the law of
+iterated values: the date-$0$ price of a date-$(t+u)$ payoff equals the
+date-$0$ price of holding the date-$t$ price of that payoff.
+
+Multiplicativity of $M$ and the semigroup property of $\{\mathbb M_t\}$ are
+the same condition, expressed at the path level and the operator level
+respectively.
+
+We will see several multiplicative functionals, summarized as follows.
+
+```{prf:definition} Stochastic Discount Factor
+:label: lrr-def-stochastic-discount-factor
+
+A **stochastic discount factor** $S$ is a positive multiplicative functional
+for which $E[S_t Z_t \mid X_0=x]$ gives the date-$0$ value of a payoff $Z_t$
+delivered at date $t$.
+```
+
+```{prf:definition} Valuation Functional
+:label: lrr-def-valuation-functional
+
+Given a stochastic discount factor $S$, a **valuation functional** $V$ is a
+multiplicative functional such that $\{V_tS_t : t \geq 0\}$ is a martingale.
+```
 
 | Object | Multiplicative functional | Semigroup |
 |---|---:|---:|
@@ -182,6 +254,13 @@ The paper uses several multiplicative functionals, summarized as follows.
 
 The last case is central for long-run cash-flow pricing.
 
+```{prf:definition} Stochastic Growth Functional
+:label: lrr-def-stochastic-growth-functional
+
+A **stochastic growth functional** $G$ is a positive multiplicative functional
+that scales a cash flow between dates.
+```
+
 If a cash flow is
 
 $$
@@ -198,16 +277,171 @@ $$
     E\left[G_t S_t \psi(X_t) \mid X_0=x\right].
 $$
 
+```{prf:definition} Cash-Flow Valuation Semigroup
+:label: lrr-def-cash-flow-valuation-semigroup
+
+The **cash-flow valuation semigroup** is the multiplicative semigroup
+generated by $Q=GS$, where $G$ is a stochastic growth functional and $S$ is a
+stochastic discount factor.
+```
+
 The long-horizon behavior of $\mathbb Q_t$ tells us how current prices value
 cash-flow growth risk that materializes far in the future.
 
 ## The Generator
 
-The **extended generator** associated with $M$ is the local object that
-corresponds to the semigroup $\{\mathbb M_t\}$.
+The semigroup $\{\mathbb M_t\}_{t \geq 0}$ is a global object.
+
+For a fixed horizon $t$, the value
+
+$$
+    \mathbb M_t \psi(x)
+    =
+    E\left[M_t \psi(X_t) \mid X_0=x\right]
+$$
+
+averages over all paths from $0$ to $t$.
+
+But the long-horizon question asks what happens to this whole family of
+operators as $t$ becomes large.
+
+The role of the generator is to replace this horizon-indexed family by a
+single local object on the state space.
+
+The idea is easiest to see in discrete time.
+
+Suppose $X_n$ is Markov and $M_n$ is a multiplicative functional.
+
+Let the one-period valuation operator be
+
+$$
+    K\psi(x)
+    =
+    E\left[M_1 \psi(X_1) \mid X_0=x\right].
+$$
+
+Then the $n$-period valuation operator is just $K^n$.
+
+This is the same one-step-to-many-step logic used in the finite Markov chains
+lecture, where a transition matrix $P$ determines $n$-step probabilities
+through $P^n$.
+
+Here $K$ plays the role of the one-step matrix, except that it also includes
+the multiplicative payoff weight $M_1$.
+
+Thus a long-horizon problem is controlled by a single one-step object.
+
+If $K\phi = \lambda \phi$, then
+
+$$
+    K^n \phi = \lambda^n \phi,
+$$
+
+so the eigenvalue $\lambda$ gives the long-run geometric growth or decay rate,
+while $\phi$ describes the long-run dependence on the current state.
+
+The same one-step operator also tells us the predictable change in a weighted
+payoff.
+
+By multiplicativity,
+
+$$
+\begin{aligned}
+    E\left[
+        M_{n+1}\psi(X_{n+1}) - M_n \psi(X_n)
+        \mid \mathcal F_n
+    \right]
+    &=
+    M_n\left(K\psi(X_n)-\psi(X_n)\right).
+\end{aligned}
+$$
+
+Hence $K\psi-\psi$ is the predictable one-period rate of change of the
+weighted payoff, expressed as a function of the current state.
+
+Subtracting these predictable changes from the total change leaves a
+martingale:
 
-A Borel function $\psi$ belongs to the domain of the generator $\mathbb A$ if
-there is a Borel function $\chi$ such that
+$$
+    M_n\psi(X_n)
+    -
+    \psi(X_0)
+    -
+    \sum_{j=0}^{n-1}
+        M_j\left(K\psi(X_j)-\psi(X_j)\right).
+$$
+
+So in discrete time the operator $K-I$ does two things at once.
+
+It gives the predictable drift of $M_n\psi(X_n)$, and its eigenvalue problem
+
+$$
+    (K-I)\phi = (\lambda-1)\phi
+$$
+
+is equivalent to the long-horizon eigenvalue problem for $K^n$.
+
+Continuous time keeps the same logic, but there is no distinguished
+one-period step.
+
+For a small interval $h$,
+
+$$
+    \mathbb M_h\psi(x) - \psi(x)
+$$
+
+is the short-horizon predictable change in the weighted payoff starting from
+state $x$.
+
+The continuous-time analogue of $K-I$ is therefore the derivative of the
+semigroup at zero:
+
+$$
+    \mathbb M_h \psi(x)
+    \approx
+    \psi(x) + h \mathbb A\psi(x).
+$$
+
+When this derivative can be represented by a function of the current state, we
+call that function $\mathbb A\psi$.
+
+It is local because it records the instantaneous rate of change at $x$, rather
+than the value of an entire finite-horizon path integral.
+
+If $\mathbb A\phi=\rho\phi$, then
+$\mathbb M_t\phi=\exp(\rho t)\phi$, the continuous-time counterpart of
+$K^n\phi=\lambda^n\phi$.
+
+This is why the generator matters for long-run valuation: it converts the
+asymptotic study of $\mathbb M_t$ into an eigenvalue problem for a local
+operator.
+
+The derivative notation above is heuristic.
+
+For the Markov processes used by {cite:t}`HansenScheinkman2009`, it is more
+convenient to define the generator through the associated martingale
+decomposition, mirroring the discrete-time formula.
+
+We look for a function $\chi$ such that $M_t \chi(X_t)$ is the instantaneous
+predictable rate of change of $M_t \psi(X_t)$.
+
+In informal differential form,
+
+$$
+    E\bigl[ d\bigl(M_t \psi(X_t)\bigr) \,\bigm|\, \mathcal F_t \bigr]
+    \;=\;
+    M_t \chi(X_t)\, dt .
+$$
+
+The formal definition says that, after integrating this predictable rate along
+the path, the remaining part is martingale noise.
+
+```{prf:definition} Extended Generator
+:label: lrr-def-extended-generator
+
+A Borel function $\psi$ belongs to the domain of the **extended generator**
+$\mathbb A$ of the multiplicative functional $M$ if there is a Borel function
+$\chi$ such that
 
 $$
     N_t
@@ -219,7 +453,30 @@ $$
 
 is a local martingale.
 
-We then write $\mathbb A \psi = \chi$.
+In this case, the extended generator assigns $\chi$ to $\psi$, and we write
+$\mathbb A \psi = \chi$.
+```
+
+The three terms have the same roles as in the discrete-time decomposition:
+
+* $M_t \psi(X_t) - \psi(X_0)$ is the total change in the weighted payoff over
+  $[0, t]$.
+* $\int_0^t M_s \chi(X_s)\, ds$ accumulates the expected rate of change along
+  the path.
+* $N_t$ is the residual — mean-zero noise — and the martingale condition is
+  what forces $\chi$ to be the right rate.
+
+So $\mathbb A \psi(x) = \chi(x)$ is the instantaneous expected rate of change
+of $M_t \psi(X_t)$ when the current state is $x$.
+
+Two sanity checks connect this definition to familiar objects.
+
+When $M \equiv 1$, the condition reduces to Dynkin's formula for the standard
+Markov generator
+$\mathcal L \psi(x) = \lim_{t \downarrow 0} t^{-1}\bigl[E[\psi(X_t) \mid X_0=x] - \psi(x)\bigr]$.
+
+When $X$ is a jump diffusion, applying Itô's formula to $M_t \psi(X_t)$ produces
+a closed-form expression for $\mathbb A \psi$ given below.
 
 Suppose the Markov state satisfies
 
@@ -262,27 +519,33 @@ $$
 \end{aligned}
 $$ (eq:extended-generator)
 
-This formula is useful because it converts a long-horizon pricing problem into
-an eigenvalue problem for a local generator.
-
 ```{note}
-When $M \equiv 1$, {eq}`eq:extended-generator` reduces to the generator of
-the Markov process $X$.
-
-When $M=S$ is a stochastic discount factor, the extra terms encode local
-prices of Brownian and jump risk.
+When $M=S$ is a stochastic discount factor, the extra terms multiplying
+$\phi(x)$ encode local prices of Brownian and jump risk.
 ```
 
 ## Principal Eigenfunctions
 
-A Borel function $\phi$ is an eigenfunction of $\mathbb A$ with eigenvalue
-$\rho$ if
+With the local operator $\mathbb A$ in hand, the long-run question becomes:
+which positive payoffs grow at a constant proportional rate under the
+valuation semigroup?
+
+```{prf:definition} Eigenfunction of the Extended Generator
+:label: lrr-def-generator-eigenfunction
+
+A Borel function $\phi$ is an **eigenfunction** of $\mathbb A$ with
+eigenvalue $\rho$ if
 
 $$
     \mathbb A \phi = \rho \phi .
 $$ (eq:generator-eigen)
+```
 
-A **principal eigenfunction** is an eigenfunction that is strictly positive.
+```{prf:definition} Principal Eigenfunction
+:label: lrr-def-principal-eigenfunction
+
+A **principal eigenfunction** is an eigenfunction that is strictly positive on the state space.
+```
 
 If $\phi > 0$ solves {eq}`eq:generator-eigen`, then
 
@@ -295,8 +558,23 @@ $$ (eq:mhat)
 
 is a local martingale.
 
-When $\hat M$ is a martingale, it defines a new probability measure and gives
-the factorization {eq}`eq:hs-factorization`.
+```{prf:definition} Martingale Component and Twisted Measure
+:label: lrr-def-martingale-component
+
+When $\hat M$ in {eq}`eq:mhat` is a martingale, it is the **martingale
+component** associated with $(\rho,\phi)$ and defines the **twisted probability
+measure** by weighting date-$t$ events with $\hat M_t$.
+
+For $F \in \mathcal F_t$, the twisted conditional probability is
+
+$$
+    \widehat{\Pr}(F \mid X_0=x)
+    =
+    E[\hat M_t 1_F \mid X_0=x].
+$$
+```
+
+The martingale component also gives the factorization {eq}`eq:hs-factorization`.
 
 It also gives the semigroup eigenvalue equation
 
@@ -305,9 +583,33 @@ $$
     \qquad t \geq 0.
 $$ (eq:semigroup-eigen)
 
-Under stochastic stability restrictions under the $\hat M$-twisted measure,
-Proposition 7.1 of {cite:t}`HansenScheinkman2009` gives the long-run
-approximation
+```{prf:definition} Harris Recurrence
+:label: lrr-def-harris-recurrence
+
+A Markov process with stationary distribution $\hat\varsigma$ is **Harris
+recurrent** if every Borel set with positive $\hat\varsigma$ measure is visited
+for an infinite amount of time with probability one from every initial state.
+```
+
+```{prf:definition} Stochastically Stable Twisted Process
+:label: lrr-def-stochastic-stability
+
+The $\hat M$-twisted Markov process is **stochastically stable** if it has a
+stationary distribution $\hat\varsigma$ and is Harris recurrent under the
+probability measure induced by $\hat M$.
+```
+
+```{prf:definition} Irreducible Skeleton
+:label: lrr-def-irreducible-skeleton
+
+A discretely sampled skeleton $\{X_{\Delta j}: j=0,1,\ldots\}$ is
+**irreducible** relative to $\hat\varsigma$ if every Borel set with positive
+$\hat\varsigma$ measure is reached with positive probability from every
+initial state.
+```
+
+Under these stability restrictions, Proposition 7.1 of
+{cite:t}`HansenScheinkman2009` gives the long-run approximation
 
 $$
     \lim_{t \to \infty}
@@ -341,7 +643,13 @@ Perron-Frobenius theory.
 Let $X$ take values in $\{x_1,\ldots,x_N\}$ and let $U$ be its intensity
 matrix.
 
-Thus $u_{ij} \geq 0$ for $i \neq j$ and each row of $U$ sums to zero.
+```{prf:definition} Intensity Matrix
+:label: lrr-def-intensity-matrix
+
+An **intensity matrix** $U$ for a finite-state continuous-time Markov chain
+satisfies $u_{ij} \geq 0$ for $i \neq j$ and $\sum_j u_{ij}=0$ for each
+state $i$.
+```
 
 Let the multiplicative functional have
 
@@ -523,7 +831,7 @@ for t in [1, 5, 20, 80]:
 print("\nlimit =", limit)
 ```
 
-### Jump Scaling
+### Adding Jumps
 
 Now let the multiplicative functional jump when the Markov state changes.
 
@@ -537,9 +845,9 @@ kappa_jump = np.array([[0.0,  0.30],
 A_jump = build_generator(U, r, kappa_jump)
 rho_jump, phi_jump = principal_eigenpair(A_jump)
 
-print(f"rho without jump scaling = {rho:.6f}")
-print(f"rho with jump scaling    = {rho_jump:.6f}")
-print("\nphi with jump scaling:")
+print(f"rho without jumps = {rho:.6f}")
+print(f"rho with jumps    = {rho_jump:.6f}")
+print("\nphi with jumps:")
 print(phi_jump)
 ```
 
@@ -559,7 +867,7 @@ ax.axhline(rho, color="black", ls="--", lw=1)
 ax.axvline(0, color="black", ls=":", lw=1)
 ax.set_xlabel("jump log multiplier for recession to boom")
 ax.set_ylabel("principal eigenvalue")
-ax.set_title("Jump Scaling and the Long-Run Growth Rate")
+ax.set_title("Jumps and the Long-Run Growth Rate")
 plt.show()
 ```
 
@@ -616,6 +924,13 @@ $$
     \phi(x^f,x^o) = \exp(c_f x^f + c_o x^o).
 $$
 
+```{prf:definition} Exponential-Affine Eigenfunction
+:label: lrr-def-exponential-affine-eigenfunction
+
+An **exponential-affine eigenfunction** is a positive eigenfunction whose
+logarithm is affine in the state variables.
+```
+
 Substitution into $\mathbb A\phi=\rho\phi$ gives
 
 $$
@@ -983,6 +1298,13 @@ Local continuous-time pricing is expressed through instantaneous risk prices.
 
 Suppose the SDF has Brownian loading $\gamma^s$.
 
+```{prf:definition} Local Brownian Risk Price
+:label: lrr-def-local-brownian-risk-price
+
+The **local price** of exposure to a Brownian shock with SDF loading
+$\gamma_i^s$ is $-\gamma_i^s$.
+```
+
 For a valuation functional with Brownian exposure $\gamma^v$, Corollary 3.1 of
 {cite:t}`HansenScheinkman2009` gives the Brownian part of the local required
 expected return as
@@ -1009,6 +1331,15 @@ $$
     - \frac{\beta_o^s}{\xi_o}\sigma_o .
 $$ (eq:long-run-price-o)
 
+```{prf:definition} Long-Run Risk Price
+:label: lrr-def-long-run-risk-price
+
+In the cash-flow valuation problem, the **long-run risk price** for exposure
+$\gamma_i^g$ is the marginal change
+$\partial R_\infty / \partial \gamma_i^g$ in the asymptotic required return
+$R_\infty=-\rho+\delta$.
+```
+
 The second term is the persistence adjustment.
 
 A shock to $B^o$ moves the persistent growth predictor $X^o$.
@@ -1253,7 +1584,7 @@ U =
 $$
 
 Let the multiplicative functional have decay rate $r_1>0$ in state 1, decay
-rate $r_2=0$ in state 2, and no jump scaling.
+rate $r_2=0$ in state 2, and no jumps.
 
 (a) Write down the generator matrix $A$.
 
diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index 8ee4a4041..3b291b3f6 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -34,7 +34,7 @@ This lecture explores a classic question in economic theory: can agents **learn*
 
 In a rational expectations equilibrium, agents use market prices to make inferences about other agents' private information.
 
-Each agent knows the **statistical relationship** between prices and the underlying payoff-relevant variables — and that relationship is **correct** given the equilibrium.
+Each agent knows the **statistical relationship** between prices and the underlying payoff-relevant variables, and that relationship is **correct** given the equilibrium.
 
 But this raises a  question: where does that knowledge come from?
 
@@ -44,12 +44,12 @@ The key findings are:
 
 * In a benchmark example, a rational (Bayesian) uninformed agent **does learn** the equilibrium price function as data accumulate.
 * The beliefs of the uninformed agent converge (weakly) to a point mass at the true equilibrium parameter.
-* In more general economies, this convergence can fail — especially when **multiple equilibria** exist or when the uninformed agent's model is **misspecified**.
+* In more general economies, this convergence can fail, especially when **multiple equilibria** exist or when the uninformed agent's model is **misspecified**.
 
 This lecture presents the Bray–Kreps framework, works through their benchmark example in detail, and provides Python code to simulate Bayesian learning dynamics.
 
 
-This lecture describes  {cite}`BrayKreps1987`, Chapter 19 in *Advances in Economic Theory* (1987), which synthesizes earlier work by {cite}`Bray1982`, {cite}`BraySavin1984`, and the rational expectations literature of {cite}`Radner1979`, {cite}`grossman1976`, and {cite}`Jordan1982`.
+We focus on {cite}`BrayKreps1987`, Chapter 19 in *Arrow and the Ascent of Modern Economic Theory* (1987), which synthesizes earlier work by {cite}`Bray1982`, {cite}`BraySavin1984`, and the rational expectations literature of {cite}`Radner1979`, {cite}`grossman1976`, and {cite}`Jordan1982`.
 
 
 Let's start with the necessary imports.
@@ -82,6 +82,7 @@ There are two agents:
 ### Preferences
 
 Both agents have von Neumann–Morgenstern utility with coefficient of absolute risk tolerance equal to $2$.
+
 Agent $n \in \{I, U\}$ chooses holdings $x^n$ of the risky asset to maximize
 
 $$
@@ -120,7 +121,7 @@ $$
 a = 0, \qquad b = 1
 $$
 
-so that $p_t = r_t$ — the price fully reveals the fundamental.
+so that $p_t = r_t$ (i.e., the price fully reveals the fundamental).
 
 More generally, with parameters $(\theta^I, \theta^U)$ denoting risk tolerances and $\sigma^2$ the variance of $r_t$:
 
@@ -417,7 +418,7 @@ The convergence result above relies on several assumptions that may fail in rich
 
 ### 1. Multiple Equilibria
 
-When there are multiple rational expectations equilibria, the uninformed agent's beliefs may converge to the **wrong** equilibrium — one that is not the equilibrium that actually prevails.
+When there are multiple rational expectations equilibria, the uninformed agent's beliefs may converge to the **wrong** equilibrium.
 
 In the example with two potential equilibrium parameters $b_1^*$ and $b_2^*$, the agent's posterior mean can converge to either one depending on the history.
 
@@ -476,7 +477,7 @@ plt.show()
 
 As expected, agent $U$ learns the **correct** equilibrium as long as the model is correctly specified and the true equilibrium generates the data.
 
-The more subtle failure mode — identified by Bray and Kreps — arises when agents' learning rules themselves **change the equilibrium**, creating a feedback loop that may or may not converge.
+The more subtle failure mode, identified by Bray and Kreps, arises when agents' learning rules themselves **change the equilibrium**, creating a feedback loop that may or may not converge.
 
 ### 2. Self-Referential Learning Dynamics
 
@@ -485,7 +486,7 @@ But $\mu_t$ is updated based on past prices.
 
 This creates a **self-referential** system: beliefs drive prices, and prices update beliefs.
 
-{cite}`BrayKreps1987` show (their Proposition 2 and Section 5) that this feedback can lead to **non-stationary** dynamics and that convergence to the rational expectations equilibrium requires additional conditions — essentially that the economy "settles down" to a stationary relationship before agents learn the parameters of that relationship.
+{cite}`BrayKreps1987` show (their Proposition 2 and Section 5) that this feedback can lead to **non-stationary** dynamics and that convergence to the rational expectations equilibrium requires additional conditions. Essentially, the economy must "settle down" to a stationary relationship before agents learn the parameters of that relationship.
 
 The next section illustrates the self-referential dynamics.
 
@@ -579,7 +580,7 @@ $$
 This is the **Bayesian consistency** result: a rational Bayesian agent who assigns positive prior probability to the truth will eventually learn it.
 
 The key caveat: the agent must assign **positive prior probability** to the true data-generating process.
-If the agent's model is misspecified — if the true equilibrium is outside the support of the agent's prior — convergence to the truth is not guaranteed.
+If the agent's model is misspecified, that is, if the true equilibrium is outside the support of the agent's prior, convergence to the truth is not guaranteed.
 
 A corollary to this general result is that for the specific model described above, the uninformed agent's posterior on $b$ converges to the truth as long as the prior assigns positive density to a neighborhood of $b^*$.
 
@@ -587,7 +588,7 @@ A corollary to this general result is that for the specific model described abov
 
 Section 4 of {cite}`BrayKreps1987` specializes the convergence results to the context of rational expectations equilibria in markets.
 
-The main result (Proposition 3) states that even in large general-equilibrium economies with $N$ agents and $M$ assets, agents' beliefs converge weakly to a stationary rational expectations equilibrium — provided:
+The main result (Proposition 3) states that even in large general-equilibrium economies with $N$ agents and $M$ assets, agents' beliefs converge weakly to a stationary rational expectations equilibrium, provided that:
 
 1. Agents form **rational (Bayesian) forecasts** given their information.
 2. The equilibrium is **unique** (no multiplicity problem).
@@ -606,7 +607,7 @@ The proof involves three steps:
 
 * **Step 1A**: The conditional probability $P(A \mid H_t)$ forms a martingale with respect to $H_t$ (by the law of iterated expectations).
 * **Step 1B**: The martingale converges a.s. by Doob's martingale convergence theorem.
-* **Step 2**: The equilibrium price function — which maps $(p, \theta)$ space to prices — is continuous (under a linear model assumption).
+* **Step 2**: The equilibrium price function, which maps $(p, \theta)$ space to prices, is continuous (under a linear model assumption).
 * **Step 3–4**: By combining Step 1 and Step 2, the joint distribution of prices and beliefs converges.
 
 ## Obstacles to Convergence
@@ -621,7 +622,9 @@ A concrete example: suppose there are two spot market equilibria for some payoff
 
 The informed agents choose randomly among these each period (since they are indifferent).
 
-The uninformed agent's posterior mean can never converge to a single value — it will bounce between neighborhoods of $\theta_1$ and $\theta_2$.
+The uninformed agent's posterior mean can never converge to a single value.
+
+It will bounce between neighborhoods of $\theta_1$ and $\theta_2$.
 
 ### Obstacle 2: Non-Stationarity of Beliefs
 
@@ -633,7 +636,7 @@ This is a **philosophical problem** with the idea of learning in equilibrium: on
 
 ### Obstacle 3: Misspecified Models
 
-If $U$'s prior assigns zero probability to $b^*$ — that is, if $U$'s model is misspecified — then convergence to $b^*$ is impossible by Bayesian consistency.
+If $U$'s prior assigns zero probability to $b^*$, that is, if $U$'s model is misspecified, then convergence to $b^*$ is impossible by Bayesian consistency.
 
 {cite}`BrayKreps1987` note (p. 622) that this is a subtle but important caveat: convergence is guaranteed only when the "true $\theta$ may lie outside the set of states $\Omega$" to which the agent's prior assigns positive probability is not the case.
 
@@ -650,12 +653,14 @@ The uninformed agent knows the true structural form of the price function (that
 
 Because the true $b^*$ lies in the support of agent $U$'s prior, the agent's model is **correctly specified**.
 
-The Bayesian updating rule — standard Gaussian conjugate updating — is therefore fully rationalized: it is exactly what a rational agent with a correct model would do.
+The Bayesian updating rule is therefore fully rationalized: it is exactly what a rational agent with a correct model would do.
 
 Convergence of beliefs to $b^*$ then follows from the standard Bayesian consistency theorem (Proposition 2 of {cite}`BrayKreps1987`).
 
 **Learning *about* a rational expectations equilibrium** is a quite different enterprise.
-Here the agent does not know the statistical relationship between prices and fundamentals, and that relationship is itself an *endogenous* object — it is determined in equilibrium by the very beliefs the agent is trying to learn.
+Here the agent does not know the statistical relationship between prices and fundamentals, and that relationship is itself an *endogenous* object.
+
+It is determined in equilibrium by the very beliefs the agent is trying to learn.
 
 As Bray and Kreps put it (p. 601):
 
@@ -678,7 +683,9 @@ For that to be valid, the agent would need to know:
 
 But both of these are themselves functions of the equilibrium that agent $U$ is trying to learn.
 
-If $U$'s beliefs at date $t$ are $\mu_t \neq b^*$, then $U$'s model of the price process is **misspecified** — the prices generated in the economy reflect other agents' optimization given the *actual* (possibly non-stationary) beliefs of $U$, not the stationary REE price function that $U$ is treating as fixed.
+If $U$'s beliefs at date $t$ are $\mu_t \neq b^*$, then $U$'s model of the price process is **misspecified**.
+
+The prices generated in the economy reflect other agents' optimization given the *actual* (possibly non-stationary) beliefs of $U$, not the stationary REE price function that $U$ is treating as fixed.
 
 Thus, the agent's model can be correctly specified *only if* the economy is already at the rational expectations equilibrium.
 
@@ -692,13 +699,13 @@ And in their concluding section they observe that the rational-learning model is
 
 > *"...concerned with learning* within *and learning* about *an equilibrium, and then the sense of* rational learning *within* ... is equivalent to* rational learning about *in some sense other than as formally equivalent to* rational expectations equilibrium."*
 
-The distinction is that learning *within* an REE — our Bayesian model above — is consistent with full rationality because the agent's model is correct.
+The distinction is that learning *within* an REE, our Bayesian model above, is consistent with full rationality because the agent's model is correct.
 
 Learning *about* an REE, by contrast, requires the agent to use data generated by a **non-stationary** process as if it were generated by a stationary REE, which is a form of model misspecification that cannot be rationalized as Bayesian updating with a correct prior.
 
 ### The Role of "Irrational" Learning Algorithms
 
-This explains why the literature on learning *about* rational expectations equilibria — going back to {cite}`Bray1982` and {cite}`BraySavin1984`, and extended in the influential work of {cite}`MarcetSargent1989` — tends to rely on **ordinary least squares (OLS)** or other adaptive algorithms rather than Bayes' rule.
+This explains why the literature on learning *about* rational expectations equilibria --- going back to {cite}`Bray1982` and {cite}`BraySavin1984`, and extended in the influential work of {cite}`MarcetSargent1989` --- tends to rely on **ordinary least squares (OLS)** or other adaptive algorithms rather than Bayes' rule.
 
 ```{note}
 {cite}`MarcetSargent1989` use some theorems about stochastic approximation to extend some of Bray and 
@@ -709,9 +716,9 @@ In those models, agent $U$ runs a regression of observed prices on observed fund
 
 OLS is consistent and computationally tractable, but it is *not* the optimal rule for an agent who knows the true data-generating process.
 
-It is, as Bray and Kreps call it, a form of **"irrational" learning** — rational in the limited sense of using past data intelligently, but not derivable from Bayes' theorem applied to a correctly specified model.
+It is, as Bray and Kreps call it, a form of **"irrational" learning**: rational in the limited sense of using past data intelligently, but not derivable from Bayes' theorem applied to a correctly specified model.
 
-An OLS learner implicitly assumes the data-generating process is stationary — that the relationship between prices and fundamentals is the same in every period.
+An OLS learner implicitly assumes the data-generating process is stationary --- that is, that the relationship between prices and fundamentals is the same in every period.
 
 But during the learning transition, it is not: the price function shifts as beliefs shift.
 
@@ -721,28 +728,28 @@ This is a misspecification, and the resulting estimates are biased in finite sam
 
 Bray and Kreps note (pp. 598–599) that in the models studied by {cite}`Bray1982` and {cite}`BraySavin1984`:
 
-> *"Agents are doing Bayesian updating, but their model is, almost by construction, wrong — they are learning as if the environment were stationary when it is not."*
+> *"Agents are doing Bayesian updating, but their model is, almost by construction, wrong --- they are learning as if the environment were stationary when it is not."*
 
 There is a fundamental **epistemic tension** at the heart of learning about rational expectations equilibria:
 
-* A fully rational (Bayesian, correctly specified) learner can only apply Bayes' rule to a model whose structure is *already known* — but the structure of the REE is exactly what the agent is trying to learn.
+* A fully rational (Bayesian, correctly specified) learner can only apply Bayes' rule to a model whose structure is *already known*, but the structure of the REE is exactly what the agent is trying to learn.
 * A learner who uses an adaptive algorithm (OLS, least-mean-squares, etc.) can potentially converge to the REE, but only by using a rule that cannot be derived from Bayesian rationality applied to a correctly specified model.
 
 The benchmark model in this lecture avoids this tension by assumption: agent $U$ knows the structural form of the price function and needs only to learn one parameter.
 
-That is learning *within* an REE — a clean, tractable, and fully rational exercise — but it is also a special case that sidesteps the deeper difficulty of learning *about* an REE from scratch.
+That is learning *within* an REE, a clean, tractable, and fully rational exercise, but it is also a special case that sidesteps the deeper difficulty of learning *about* an REE from scratch.
 
 
 ## Summary
 
 This lecture has discussed  ideas from {cite}`BrayKreps1987`:
 
-1. **Rational expectations equilibria** require agents to know the statistical relationship between prices and fundamentals — but this knowledge is typically assumed, not derived.
+1. **Rational expectations equilibria** require agents to know the statistical relationship between prices and fundamentals, but this knowledge is typically assumed, not derived.
 
 2. **Rational learning** asks whether Bayesian agents can *learn* the REE from data.
    In a benchmark linear model, the answer is yes: the uninformed agent's posterior on the slope parameter $b^*$ converges almost surely to the truth.
 
-3. The convergence relies on **Bayesian consistency** — the uninformed agent accumulates sufficient information to identify $b^*$ from observed prices and returns.
+3. The convergence relies on **Bayesian consistency**: the uninformed agent accumulates sufficient information to identify $b^*$ from observed prices and returns.
 
 4. Convergence can **fail** when:
    - There are **multiple equilibria** and agents' learning rules interact with equilibrium selection.
@@ -754,11 +761,11 @@ This lecture has discussed  ideas from {cite}`BrayKreps1987`:
 6. **Learning *within* versus *about* an REE** is a crucial distinction.
    The benchmark model in this lecture exemplifies learning *within* an REE: agent $U$ knows the structural form of the price function and uses a correctly specified Bayesian model.
 
-   Learning *about* an REE — where the equilibrium price function is itself the unknown object — is fundamentally harder, because the data-generating process shifts as beliefs shift.
+   Learning *about* an REE, where the equilibrium price function is itself the unknown object, is fundamentally harder, because the data-generating process shifts as beliefs shift.
 
    This non-stationarity means that learning *about* an REE cannot in general be rationalized as Bayes' rule applied to a correctly specified model, which is why the literature on this topic relies on adaptive algorithms such as OLS rather than fully Bayesian updating.
 
-The broader message of Bray and Kreps is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle — and the conditions under which learning succeeds are more restrictive than they might appear.
+The broader message of Bray and Kreps is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle, and the conditions under which learning succeeds are more restrictive than they might appear.
 
 
 ## Exercises

From 16a6f9acb5ba29b6ed08d98262b9a6b375da0798 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Thu, 28 May 2026 15:52:32 +1000
Subject: [PATCH 06/25] updates

---
 lectures/long_run_risk_operator.md | 682 ++++++++++++++++++++---------
 1 file changed, 469 insertions(+), 213 deletions(-)

diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index 3dbece188..1b88162f7 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -20,7 +20,7 @@ kernelspec:
 </div>
 ```
 
-# Long-Term Risk: An Operator Approach
+# Long-term risk: an operator approach
 
 ```{contents} Contents
 :depth: 2
@@ -40,7 +40,7 @@ describes only the *short end* of the term structure of risk prices.
 Hansen and Scheinkman instead study the *long end*: what happens as the time
 between valuation and payoff grows large.
 
-The two ends are complementary — together they pin down the slope of the term
+The two ends are complementary --- together they pin down the slope of the term
 structure of risk prices, and economic restrictions are often more reliable
 over long horizons than over instantaneous ones.
 
@@ -93,7 +93,7 @@ This lecture covers
 * the extended generator associated with a multiplicative functional,
 * principal eigenfunctions and the Hansen-Scheinkman factorization,
 * a finite-state example where the analysis reduces to Perron-Frobenius theory,
-* the affine diffusion example from the paper, and
+* an affine diffusion example, and
 * long-run risk prices for persistent growth shocks.
 
 We start with imports.
@@ -104,7 +104,7 @@ import matplotlib.pyplot as plt
 from scipy.linalg import eig, expm
 ```
 
-## Multiplicative Functionals
+## Multiplicative functionals
 
 Let $\{X_t : t \geq 0\}$ be a continuous-time Markov process with state space
 $\mathcal D_0$.
@@ -119,8 +119,33 @@ constructed from the history of $X$, so that $M_t$ is
 $\mathcal F_t$-measurable for each $t$.
 ```
 
-The paper assumes that functionals have versions with right-continuous sample
-paths and left limits.
+We assume that functionals have versions with right-continuous sample
+paths and left limits, the **càdlàg** property.
+
+Concretely, for almost every $\omega$, the path $t \mapsto M_t(\omega)$ satisfies
+
+$$
+    \lim_{s \downarrow t} M_s(\omega) = M_t(\omega)
+    \quad \text{for all } t \geq 0,
+$$
+
+and the left limit
+
+$$
+    M_{t-}(\omega) := \lim_{s \uparrow t} M_s(\omega)
+$$
+
+exists and is finite for all $t > 0$.
+
+Paths may jump, but each jump is resolved at the jump time:
+$M_t = M_{t-} + \Delta M_t$ with $\Delta M_t := M_t - M_{t-}$.
+
+The word *version* means we are free to replace $M_t$ by any process
+$\tilde M_t$ with $\mathbb P(M_t = \tilde M_t) = 1$ for each $t$.
+
+Càdlàg paths give the joint measurability in $(\omega, t)$ that we need to
+integrate functionals against time, apply optional stopping, and pass to
+limits such as $\lim_{t \to \infty} t^{-1} \log M_t$ that appear later.
 
 ```{prf:definition} Multiplicative Functional
 :label: lrr-def-multiplicative-functional
@@ -157,8 +182,7 @@ $$
 
 Exponentials of additive functionals are strictly positive multiplicative functionals.
 
-For the jump-diffusion setting in {cite:t}`HansenScheinkman2009`, a useful
-parameterization is
+In a jump-diffusion setting, a useful parameterization is
 
 $$
 \begin{aligned}
@@ -180,7 +204,16 @@ $\beta(x) = -r(x)$.
 
 ## Semigroups
 
-The operator objects are formalized by the following semigroup definition.
+A multiplicative functional $M$ together with the Markov process $X$
+defines, for each horizon $t$, the valuation operator
+
+$$
+    \mathbb M_t \psi(x) = E\left[M_t \psi(X_t) \mid X_0 = x\right] .
+$$
+
+These operators inherit a clean composition rule from the multiplicative
+property of $M$, which makes the family $\{\mathbb M_t\}_{t \geq 0}$ a
+*semigroup*.
 
 ```{prf:definition} One-Parameter Semigroup
 :label: lrr-def-one-parameter-semigroup
@@ -217,18 +250,29 @@ $$
 $$
 ```
 
-The proof is just iterated expectations plus the multiplicative property of
-$M$.
+The semigroup property follows from iterated expectations and the
+multiplicative property of $M$.
+
+Economically, it is the Markov law of iterated values: the date-$0$ price of
+a date-$(t+u)$ payoff equals the date-$0$ price of holding the date-$t$
+price of that payoff.
 
-Economically, the semigroup property is the Markov version of the law of
-iterated values: the date-$0$ price of a date-$(t+u)$ payoff equals the
-date-$0$ price of holding the date-$t$ price of that payoff.
+Multiplicativity of $M$ (at the path level) and the semigroup property of
+$\{\mathbb M_t\}$ (at the operator level) are the same condition.
 
-Multiplicativity of $M$ and the semigroup property of $\{\mathbb M_t\}$ are
-the same condition, expressed at the path level and the operator level
-respectively.
+### Functionals we will use
 
-We will see several multiplicative functionals, summarized as follows.
+We work with four positive multiplicative functionals throughout the lecture.
+
+| Object | Multiplicative functional | Semigroup |
+|---|---:|---:|
+| stochastic discount factor | $S$ | $\{\mathbb S_t\}$ |
+| cumulated return | $V$ | $\{\mathbb V_t\}$ |
+| stochastic growth | $G$ | $\{\mathbb G_t\}$ |
+| valuation with stochastic growth | $Q = GS$ | $\{\mathbb Q_t\}$ |
+
+The first three are primitives; the last one combines $G$ and $S$ to value
+cash flows that both grow and require discounting.
 
 ```{prf:definition} Stochastic Discount Factor
 :label: lrr-def-stochastic-discount-factor
@@ -245,15 +289,6 @@ Given a stochastic discount factor $S$, a **valuation functional** $V$ is a
 multiplicative functional such that $\{V_tS_t : t \geq 0\}$ is a martingale.
 ```
 
-| Object | Multiplicative functional | Semigroup |
-|---|---:|---:|
-| stochastic discount factor | $S$ | $\{\mathbb S_t\}$ |
-| cumulated return | $V$ | $\{\mathbb V_t\}$ |
-| stochastic growth | $G$ | $\{\mathbb G_t\}$ |
-| valuation with stochastic growth | $Q = GS$ | $\{\mathbb Q_t\}$ |
-
-The last case is central for long-run cash-flow pricing.
-
 ```{prf:definition} Stochastic Growth Functional
 :label: lrr-def-stochastic-growth-functional
 
@@ -261,13 +296,8 @@ A **stochastic growth functional** $G$ is a positive multiplicative functional
 that scales a cash flow between dates.
 ```
 
-If a cash flow is
-
-$$
-    D_t = D_0 G_t \psi(X_t),
-$$
-
-then its date-$0$ value is
+To price a cash flow $D_t = D_0 G_t \psi(X_t)$, we discount with $S$ and grow
+with $G$, so its date-$0$ value is
 
 $$
     D_0 \mathbb Q_t \psi(X_0),
@@ -285,34 +315,34 @@ generated by $Q=GS$, where $G$ is stochastic growth and $S$ is the stochastic
 discount factor.
 ```
 
-The long-horizon behavior of $\mathbb Q_t$ tells us how current prices value
-cash-flow growth risk that materializes far in the future.
-
-## The Generator
+The long-horizon behaviour of $\mathbb Q_t$ is the central object of the
+lecture: it tells us how current prices value cash-flow growth risk that
+materializes far in the future.
 
-The semigroup $\{\mathbb M_t\}_{t \geq 0}$ is a global object.
+## The generator
 
-For a fixed horizon $t$, the value
+So far we have a family of operators $\{\mathbb M_t\}_{t \geq 0}$, one for each
+horizon $t$.
 
-$$
-    \mathbb M_t \psi(x)
-    =
-    E\left[M_t \psi(X_t) \mid X_0=x\right]
-$$
+That is more information than we can analyze directly --- and what we really
+want is the behaviour of $\mathbb M_t \psi$ as $t \to \infty$.
 
-averages over all paths from $0$ to $t$.
+The **generator** $\mathbb A$ compresses the entire semigroup into one
+time-independent operator on the state space.
 
-But the long-horizon question asks what happens to this whole family of
-operators as $t$ becomes large.
+It records the *instantaneous* rate of change of $M_t \psi(X_t)$, and its
+eigenvalues drive the long-run growth rate of $\mathbb M_t$.
 
-The role of the generator is to replace this horizon-indexed family by a
-single local object on the state space.
+That is what lets us turn an asymptotic question about a family of operators
+into a single eigenvalue problem.
 
-The idea is easiest to see in discrete time.
+### Discrete-time intuition
 
-Suppose $X_n$ is Markov and $M_n$ is a multiplicative functional.
+The role of the generator is easiest to see when time is discrete, so we
+build the picture there once and then carry it over.
 
-Let the one-period valuation operator be
+Let $X_n$ be Markov and $M_n$ a multiplicative functional, and define the
+one-period valuation operator
 
 $$
     K\psi(x)
@@ -320,121 +350,83 @@ $$
     E\left[M_1 \psi(X_1) \mid X_0=x\right].
 $$
 
-Then the $n$-period valuation operator is just $K^n$.
-
-This is the same one-step-to-many-step logic used in the finite Markov chains
-lecture, where a transition matrix $P$ determines $n$-step probabilities
-through $P^n$.
+Iterating gives the $n$-period operator $K^n$ --- exactly the logic by which a
+transition matrix $P$ produces $n$-step probabilities through $P^n$, except
+that $K$ also carries the payoff weight $M_1$.
 
-Here $K$ plays the role of the one-step matrix, except that it also includes
-the multiplicative payoff weight $M_1$.
+So one local object, $K$, controls the entire horizon-indexed family.
 
-Thus a long-horizon problem is controlled by a single one-step object.
-
-If $K\phi = \lambda \phi$, then
+It also controls long-run growth: if $K\phi = \lambda \phi$, then
 
 $$
-    K^n \phi = \lambda^n \phi,
+    K^n \phi = \lambda^n \phi ,
 $$
 
-so the eigenvalue $\lambda$ gives the long-run geometric growth or decay rate,
-while $\phi$ describes the long-run dependence on the current state.
+so $\lambda$ is the long-run geometric growth (or decay) rate and $\phi$ is
+the long-run dependence on the current state.
 
-The same one-step operator also tells us the predictable change in a weighted
+The same operator gives the predictable rate of change of the weighted
 payoff.
 
 By multiplicativity,
 
 $$
-\begin{aligned}
     E\left[
         M_{n+1}\psi(X_{n+1}) - M_n \psi(X_n)
         \mid \mathcal F_n
     \right]
-    &=
-    M_n\left(K\psi(X_n)-\psi(X_n)\right).
-\end{aligned}
+    =
+    M_n (K\psi - \psi)(X_n) ,
 $$
 
-Hence $K\psi-\psi$ is the predictable one-period rate of change of the
-weighted payoff, expressed as a function of the current state.
-
-Subtracting these predictable changes from the total change leaves a
-martingale:
+so subtracting the cumulative predictable change from the total change leaves
+a martingale:
 
 $$
     M_n\psi(X_n)
-    -
-    \psi(X_0)
-    -
-    \sum_{j=0}^{n-1}
-        M_j\left(K\psi(X_j)-\psi(X_j)\right).
-$$
-
-So in discrete time the operator $K-I$ does two things at once.
-
-It gives the predictable drift of $M_n\psi(X_n)$, and its eigenvalue problem
-
-$$
-    (K-I)\phi = (\lambda-1)\phi
+    - \psi(X_0)
+    - \sum_{j=0}^{n-1} M_j (K\psi - \psi)(X_j) .
 $$
 
-is equivalent to the long-horizon eigenvalue problem for $K^n$.
-
-Continuous time keeps the same logic, but there is no distinguished
-one-period step.
+Two roles, one operator: $K-I$ is the *local* rate of change of
+$M_n \psi(X_n)$, and through $K^n$ it also controls long-run growth.
 
-For a small interval $h$,
-
-$$
-    \mathbb M_h\psi(x) - \psi(x)
-$$
+### From discrete to continuous time
 
-is the short-horizon predictable change in the weighted payoff starting from
-state $x$.
+Continuous time keeps the same logic.
 
-The continuous-time analogue of $K-I$ is therefore the derivative of the
-semigroup at zero:
+The natural replacement for $K-I$ is the derivative of the semigroup at zero:
 
 $$
     \mathbb M_h \psi(x)
     \approx
-    \psi(x) + h \mathbb A\psi(x).
+    \psi(x) + h \mathbb A \psi(x)
+    \quad \text{for small } h > 0.
 $$
 
-When this derivative can be represented by a function of the current state, we
-call that function $\mathbb A\psi$.
+The operator $\mathbb A$ is *local* in the sense that $\mathbb A\psi(x)$
+depends only on the instantaneous behaviour of the process started at $x$,
+not on a path integral over $[0,t]$.
 
-It is local because it records the instantaneous rate of change at $x$, rather
-than the value of an entire finite-horizon path integral.
+If $\mathbb A\phi = \rho \phi$, then
 
-If $\mathbb A\phi=\rho\phi$, then
-$\mathbb M_t\phi=\exp(\rho t)\phi$, the continuous-time counterpart of
-$K^n\phi=\lambda^n\phi$.
-
-This is why the generator matters for long-run valuation: it converts the
-asymptotic study of $\mathbb M_t$ into an eigenvalue problem for a local
-operator.
-
-The derivative notation above is heuristic.
+$$
+    \mathbb M_t \phi = \exp(\rho t)\phi ,
+$$
 
-For the Markov processes used by {cite:t}`HansenScheinkman2009`, it is more
-convenient to define the generator through the associated martingale
-decomposition, mirroring the discrete-time formula.
+the continuous-time analogue of $K^n \phi = \lambda^n \phi$.
 
-We look for a function $\chi$ such that $M_t \chi(X_t)$ is the instantaneous
-predictable rate of change of $M_t \psi(X_t)$.
+So the long-run behaviour of $\mathbb M_t$ is encoded in an eigenvalue problem
+for the local operator $\mathbb A$.
 
-In informal differential form,
+### Extended generator
 
-$$
-    E\bigl[ d\bigl(M_t \psi(X_t)\bigr) \,\bigm|\, \mathcal F_t \bigr]
-    \;=\;
-    M_t \chi(X_t)\, dt .
-$$
+For the Markov processes we use, the derivative form above is heuristic --- it
+may not be well-defined for every $\psi$ of interest.
 
-The formal definition says that, after integrating this predictable rate along
-the path, the remaining part is martingale noise.
+We instead define $\mathbb A$ through the *martingale decomposition*, which
+mirrors the discrete-time identity in which $K-I$ is the predictable rate of
+change of $M_n\psi(X_n)$.
 
 ```{prf:definition} Extended Generator
 :label: lrr-def-extended-generator
@@ -457,26 +449,27 @@ In this case, the extended generator assigns $\chi$ to $\psi$, and we write
 $\mathbb A \psi = \chi$.
 ```
 
-The three terms have the same roles as in the discrete-time decomposition:
+The three terms play the same roles as in discrete time:
 
 * $M_t \psi(X_t) - \psi(X_0)$ is the total change in the weighted payoff over
-  $[0, t]$.
+  $[0, t]$,
 * $\int_0^t M_s \chi(X_s)\, ds$ accumulates the expected rate of change along
-  the path.
-* $N_t$ is the residual — mean-zero noise — and the martingale condition is
-  what forces $\chi$ to be the right rate.
+  the path,
+* $N_t$ is the residual.
 
-So $\mathbb A \psi(x) = \chi(x)$ is the instantaneous expected rate of change
-of $M_t \psi(X_t)$ when the current state is $x$.
+Requiring $N_t$ to be a local martingale pins down $\chi = \mathbb A\psi$ as
+the instantaneous expected rate of change of $M_t \psi(X_t)$ at the current
+state.
 
-Two sanity checks connect this definition to familiar objects.
+* When $M \equiv 1$, the definition reduces to Dynkin's formula for the
+  standard Markov generator
+  $\mathcal L \psi(x)
+  = \lim_{t \downarrow 0} t^{-1}\bigl[E\psi(X_t) - \psi(x)\bigr]$.
 
-When $M \equiv 1$, the condition reduces to Dynkin's formula for the standard
-Markov generator
-$\mathcal L \psi(x) = \lim_{t \downarrow 0} t^{-1}\bigl[E\psi(X_t) - \psi(x)\bigr]$.
+* When $X$ is a jump diffusion, Itô's formula applied to $M_t\psi(X_t)$
+  produces the closed-form expression for $\mathbb A\psi$ below.
 
-When $X$ is a jump diffusion, applying Itô's formula to $M_t \psi(X_t)$ produces
-a closed-form expression for $\mathbb A \psi$ given below.
+### A closed form for jump diffusions
 
 Suppose the Markov state satisfies
 
@@ -524,7 +517,9 @@ When $M=S$ is a stochastic discount factor, the extra terms multiplying
 $\phi(x)$ encode local prices of Brownian and jump risk.
 ```
 
-## Principal Eigenfunctions
+We will apply this formula directly in the affine-diffusion example below.
+
+## Principal eigenfunctions
 
 With the local operator $\mathbb A$ in hand, the long-run question becomes:
 which positive payoffs grow at a constant proportional rate under the
@@ -547,17 +542,63 @@ $$ (eq:generator-eigen)
 A **principal eigenfunction** is an eigenfunction that is strictly positive on the state space.
 ```
 
-If $\phi > 0$ solves {eq}`eq:generator-eigen`, then
+To see which object is natural to build from an eigenpair $(\rho, \phi)$,
+recall the discrete-time picture from the generator section.
+
+There, if $K\phi = \lambda\phi$, then the process
+
+$$
+    \lambda^{-n} M_n \frac{\phi(X_n)}{\phi(X_0)}
+$$
+
+is a martingale: $K\phi = \lambda\phi$ exactly cancels the one-step drift of
+$M_n\phi(X_n)$ after we divide by $\lambda^n$.
+
+In continuous time, $\lambda^n$ is replaced by $\exp(\rho t)$, and the
+analogous candidate martingale is
 
 $$
     \hat M_t
     =
     \exp(-\rho t) M_t
-    \frac{\phi(X_t)}{\phi(X_0)}
+    \frac{\phi(X_t)}{\phi(X_0)} .
 $$ (eq:mhat)
 
+The eigenfunction equation $\mathbb A\phi = \rho\phi$ is what we need to make
+this candidate work, just as $K\phi = \lambda\phi$ did in discrete time.
+
+To verify, note that by the definition of the extended generator with $\psi = \phi$, the process
+
+$$
+    M_t \phi(X_t) - \phi(X_0) - \int_0^t M_s\, \mathbb A\phi(X_s)\, ds
+$$
+
 is a local martingale.
 
+Substituting $\mathbb A\phi = \rho \phi$ reduces this to
+
+$$
+    M_t \phi(X_t) - \phi(X_0) - \rho \int_0^t M_s \phi(X_s)\, ds,
+$$
+
+so the predictable drift of $M_t\phi(X_t)$ is $\rho M_t \phi(X_t)\, dt$.
+
+For $Z_t := M_t \phi(X_t)$, integration by parts gives
+
+$$
+    d\bigl(\exp(-\rho t) Z_t\bigr)
+    = -\rho \exp(-\rho t) Z_t\, dt + \exp(-\rho t)\, dZ_t ,
+$$
+
+and the drift term in $dZ_t$ is exactly $\rho Z_t\, dt$, so the two drift
+contributions cancel.
+
+Hence $\hat M_t$ has zero drift and is a local martingale.
+
+Rearranging {eq}`eq:mhat` for $M_t$ gives the factorization
+{eq}`eq:hs-factorization` from the overview, with $\hat M$ playing the role
+of the promised martingale component.
+
 ```{prf:definition} Martingale Component and Twisted Measure
 :label: lrr-def-martingale-component
 
@@ -574,15 +615,25 @@ $$
 $$
 ```
 
-The martingale component also gives the factorization {eq}`eq:hs-factorization`.
-
-It also gives the semigroup eigenvalue equation
+As noted above, rearranging {eq}`eq:mhat` gives the factorization
+{eq}`eq:hs-factorization`; when $\hat M$ is a true martingale, taking
+expectations conditional on $X_0 = x$ gives the semigroup eigenvalue equation
 
 $$
     \mathbb M_t \phi = \exp(\rho t)\phi,
     \qquad t \geq 0.
 $$ (eq:semigroup-eigen)
 
+### Stability of the twisted process
+
+The eigenpair $(\rho, \phi)$ controls *long-run* behaviour of $\mathbb M_t$
+only if the twisted process settles into a stationary regime.
+
+We need three notions: a stationary distribution, recurrence (the process
+revisits every set of interest), and irreducibility of a discretely sampled
+skeleton (so the recurrence is not an artefact of the continuous-time
+sampling).
+
 ```{prf:definition} Harris Recurrence
 :label: lrr-def-harris-recurrence
 
@@ -608,19 +659,19 @@ $\hat\varsigma$ measure is reached with positive probability from every
 initial state.
 ```
 
-Under these stability restrictions, Proposition 7.1 of
-{cite:t}`HansenScheinkman2009` gives the long-run approximation
+When the twisted process is stochastically stable with an irreducible
+skeleton, the long-run approximation
 
 $$
     \lim_{t \to \infty}
     \exp(-\rho t)\mathbb M_t \psi
     =
     \phi
-    \int \frac{\psi}{\phi} d\hat\varsigma ,
+    \int \frac{\psi}{\phi} d\hat\varsigma
 $$ (eq:long-run-limit)
 
-where $\hat\varsigma$ is the stationary distribution of the twisted Markov
-process.
+holds, where $\hat\varsigma$ is the stationary distribution of the twisted
+Markov process.
 
 This is the formal sense in which $\rho$ is the long-run growth rate and
 $\phi$ is the long-run state dependence.
@@ -628,14 +679,12 @@ $\phi$ is the long-run state dependence.
 ```{note}
 Positive eigenfunctions need not be unique in general state spaces.
 
-The eigenfunction used for long-run approximation must generate a martingale
-and a stochastically stable twisted process.
-
-Proposition 7.2 of {cite:t}`HansenScheinkman2009` shows that these stability
-requirements select the relevant eigenfunction up to scale.
+The stability requirements above select the relevant eigenfunction up to
+scale --- they pick out the eigenpair whose twisted process is ergodic, and so
+the one that governs the long-run limit.
 ```
 
-## A Finite-State Markov Chain
+## A finite-state Markov chain
 
 We first study a finite-state chain, where the analysis is exactly
 Perron-Frobenius theory.
@@ -760,7 +809,7 @@ def stationary_distribution(Q):
     return pi / pi.sum()
 ```
 
-### Two States
+### Two states
 
 Consider a boom-recession economy.
 
@@ -831,11 +880,15 @@ for t in [1, 5, 20, 80]:
 print("\nlimit =", limit)
 ```
 
-### Adding Jumps
+### Adding jumps
+
+State transitions in this model are discontinuous, so the multiplicative
+functional should be allowed to jump at the transition times.
 
-Now let the multiplicative functional jump when the Markov state changes.
+A natural case is a functional that jumps up when the economy switches into
+a boom and down when it switches into a recession.
 
-The matrix `kappa_jump` below says that the functional jumps up on a
+The matrix `kappa_jump` below encodes this: the functional jumps up on a
 recession-to-boom transition and down on a boom-to-recession transition.
 
 ```{code-cell} ipython3
@@ -851,6 +904,10 @@ print("\nphi with jumps:")
 print(phi_jump)
 ```
 
+To see how the long-run rate $\rho$ responds to jump risk, we hold the
+boom-to-recession multiplier fixed and trace out $\rho$ as the
+recession-to-boom multiplier varies.
+
 ```{code-cell} ipython3
 kappa_grid = np.linspace(-0.5, 0.5, 100)
 rho_grid = np.empty_like(kappa_grid)
@@ -871,9 +928,14 @@ ax.set_title("Jumps and the Long-Run Growth Rate")
 plt.show()
 ```
 
-## The Affine Diffusion Example
+Larger upward jumps on the recession-to-boom transition raise $\rho$,
+because they make the functional grow more on transitions out of the
+high-decay state.
+
+## The affine diffusion example
 
-We now turn to the continuous-state example in {cite:t}`HansenScheinkman2009`.
+We now apply the operator approach to a continuous-state model that is
+tractable enough to solve in closed form.
 
 The state has two independent components.
 
@@ -897,7 +959,7 @@ dX_t^o
 \end{aligned}
 $$ (eq:affine-state)
 
-The paper normalizes $\sigma_o > 0$ and $\sigma_f < 0$.
+We normalize $\sigma_o > 0$ and $\sigma_f < 0$.
 
 The sign of $\sigma_f$ is a convention that makes a positive $B^f$ shock
 reduce volatility.
@@ -918,11 +980,11 @@ A_t
 \end{aligned}
 $$ (eq:affine-additive)
 
-Guess an exponential-affine eigenfunction
-
-$$
-    \phi(x^f,x^o) = \exp(c_f x^f + c_o x^o).
-$$
+Because the state dynamics and the drift of $A$ are both affine in $(x^f,
+x^o)$, an exponential-affine eigenfunction closes the eigenvalue problem:
+applying the generator to $\phi(x^f,x^o) = \exp(c_f x^f + c_o x^o)$ produces
+$\phi$ multiplied by a function that is affine in $(x^f, x^o)$, so
+$\mathbb A\phi = \rho\phi$ reduces to algebraic conditions on $(c_f, c_o, \rho)$.
 
 ```{prf:definition} Exponential-Affine Eigenfunction
 :label: lrr-def-exponential-affine-eigenfunction
@@ -931,7 +993,14 @@ An **exponential-affine eigenfunction** is a positive eigenfunction whose
 logarithm is affine in the state variables.
 ```
 
-Substitution into $\mathbb A\phi=\rho\phi$ gives
+Substituting
+
+$$
+    \phi(x^f,x^o) = \exp(c_f x^f + c_o x^o)
+$$
+
+into the generator formula {eq}`eq:extended-generator` and matching
+coefficients of $x^f$, $x^o$, and the constant term gives
 
 $$
 0
@@ -996,7 +1065,7 @@ which must be positive.
 ```{code-cell} ipython3
 def solve_affine_eigenfunction(params):
     """
-    Solve the affine eigenvalue problem from Hansen and Scheinkman.
+    Solve the exponential-affine eigenvalue problem.
     """
     xi_f = params["xi_f"]
     xbar_f = params["xbar_f"]
@@ -1082,12 +1151,11 @@ $$
     \gamma_o^s = -a\vartheta_o .
 $$ (eq:breeden-sdf-params)
 
-Recursive preferences of {cite:t}`Kreps_Porteus1978` and
+The recursive preferences of {cite:t}`Kreps_Porteus1978` and
 {cite:t}`Epstein_Zin1989`, used in long-run risk models such as
-{cite:t}`Bansal_Yaron_2004`, add forward-looking terms to the SDF.
-
-The operator calculations below are the same once the parameters
-$(\bar\beta,\beta_f,\beta_o,\gamma_f,\gamma_o)$ are specified.
+{cite:t}`Bansal_Yaron_2004`, change these parameters by adding
+forward-looking terms --- but the operator calculations below are identical
+once $(\bar\beta,\beta_f,\beta_o,\gamma_f,\gamma_o)$ are specified.
 
 ```{code-cell} ipython3
 params_state = {
@@ -1146,9 +1214,12 @@ for cf in cf_candidates:
     print(f"cf = {cf:8.4f}, twisted mean reversion = {mr:8.4f}")
 ```
 
-### The Martingale Component
+### The martingale component
+
+Having solved for the eigenpair $(\rho,\phi)$, we can now assemble the
+multiplicative factorization {eq}`eq:hs-factorization` explicitly.
 
-For the affine example, the martingale component has log
+The martingale component $\hat M$ defined in {eq}`eq:mhat` has log
 
 $$
 \begin{aligned}
@@ -1292,11 +1363,17 @@ plt.tight_layout()
 plt.show()
 ```
 
-## Long-Run Risk Prices
+## Long-run risk prices
 
-Local continuous-time pricing is expressed through instantaneous risk prices.
+The eigenpair $(\rho, \phi)$ from the cash-flow valuation problem also lets
+us define a *long-run* analogue of the instantaneous risk prices used in
+local continuous-time asset pricing.
 
-Suppose the SDF has Brownian loading $\gamma^s$.
+The two prices need not agree: a shock that moves a persistent state variable
+has a small immediate effect on the cash flow but a large cumulative effect
+on future growth and discounting.
+
+Our aim is to compare the two.
 
 ```{prf:definition} Local Brownian Risk Price
 :label: lrr-def-local-brownian-risk-price
@@ -1305,47 +1382,43 @@ The **local price** of exposure to a Brownian shock with SDF loading
 $\gamma_i^s$ is $-\gamma_i^s$.
 ```
 
-For a valuation functional with Brownian exposure $\gamma^v$, Corollary 3.1 of
-{cite:t}`HansenScheinkman2009` gives the Brownian part of the local required
-expected return as
+```{prf:definition} Long-Run Risk Price
+:label: lrr-def-long-run-risk-price
 
-$$
-    -\gamma^v \cdot \gamma^s .
-$$
+In the cash-flow valuation problem, the **long-run risk price** for exposure
+$\gamma_i^g$ is the marginal change
+$\partial R_\infty / \partial \gamma_i^g$ in the asymptotic required return
+$R_\infty=-\rho+\delta$.
+```
+
+The local Brownian price is read off the SDF directly: for a valuation
+functional with Brownian exposure $\gamma^v$, the Brownian part of the local
+required expected return is $-\gamma^v \cdot \gamma^s$, so a unit of
+$\gamma^v_i$ exposure is priced at $-\gamma^s_i$.
 
-Thus the local price of exposure to a Brownian shock is $-\gamma^s$.
+The long-run price requires solving the principal eigenvalue problem for the
+$GS$ semigroup, since it depends on how a shock propagates through the
+persistent state.
 
-Long-run prices differ because a shock can move persistent state variables
-that influence future cash-flow growth or future discounting.
+### Comparison in the affine model
 
 In the affine model, the local price of exposure to $B^o$ is
 
 $$
-    -\gamma_o^s .
+    -\gamma_o^s ,
 $$
 
-The long-run price of exposure to $B^o$ in the cash-flow valuation problem is
+while the long-run price is
 
 $$
     -\gamma_o^s
     - \frac{\beta_o^s}{\xi_o}\sigma_o .
 $$ (eq:long-run-price-o)
 
-```{prf:definition} Long-Run Risk Price
-:label: lrr-def-long-run-risk-price
-
-In the cash-flow valuation problem, the **long-run risk price** for exposure
-$\gamma_i^g$ is the marginal change
-$\partial R_\infty / \partial \gamma_i^g$ in the asymptotic required return
-$R_\infty=-\rho+\delta$.
-```
-
-The second term is the persistence adjustment.
-
-A shock to $B^o$ moves the persistent growth predictor $X^o$.
-
-Because $X^o$ mean reverts at rate $\xi_o$, the cumulative effect of the shock
-is larger when $\xi_o$ is smaller.
+The second term is the **persistence adjustment**: a shock to $B^o$ moves
+the persistent growth predictor $X^o$, and because $X^o$ mean reverts at rate
+$\xi_o$, the cumulative effect of the shock is larger when $\xi_o$ is
+smaller.
 
 ```{code-cell} ipython3
 gamma_s_o = params_sdf["gamma_o"]
@@ -1379,7 +1452,7 @@ ax.legend()
 plt.show()
 ```
 
-### Changing Cash-Flow Risk
+### Changing cash-flow risk
 
 Let a cash-flow growth functional be
 
@@ -1475,11 +1548,20 @@ print(f"finite-difference slope = {finite_difference:.6f}")
 print(f"formula                 = {long_run_price_o:.6f}")
 ```
 
-## Perron-Frobenius Dominance
+## Perron-Frobenius dominance
+
+The long-run limit {eq}`eq:long-run-limit` works because the principal
+eigenvalue $\rho$ strictly dominates the real part of every other eigenvalue
+of $\mathbb A$.
 
-The finite-state examples make the limiting argument transparent.
+In a finite-state chain this is just the Perron-Frobenius theorem: the
+generator matrix $A$ in {eq}`eq:finite-a`, whose off-diagonal entries are
+nonnegative, has a real eigenvalue $\rho$ with the largest real part, and
+contributions from the other eigenvalues decay relative to $\exp(\rho t)$ at a
+rate given by the gap between $\rho$ and the next-largest real part.
 
-Let us repeat the calculation for a three-state chain.
+We illustrate this on a three-state chain and read off the spectral gap
+directly.
 
 ```{code-cell} ipython3
 state_names = ["expansion", "normal", "contraction"]
@@ -1539,6 +1621,11 @@ plt.tight_layout()
 plt.show()
 ```
 
+For each choice of $\psi$ and each initial state, the rescaled value
+$\exp(-\rho t)\mathbb M_t \psi$ converges to the dashed horizontal line --- the
+long-run limit $\phi \int (\psi/\phi)\, d\hat\varsigma$ --- at a rate
+controlled by the spectral gap.
+
 ## Summary
 
 The Hansen-Scheinkman approach studies long-run risk by studying
@@ -1800,3 +1887,172 @@ above.
 
 ```{solution-end}
 ```
+
+```{exercise}
+:label: lrr_ex4
+
+Derive the closed-form expression for the extended generator in
+{eq}`eq:extended-generator`.
+
+Take the jump-diffusion setting in which the Markov state satisfies
+
+$$
+    dX_t^c = \xi(X_t)dt + \Gamma(X_t)dB_t
+$$
+
+between jumps, with $\Sigma = \Gamma\Gamma^\top$ and jump compensator
+$\eta(dy \mid x)$.
+
+Let $M = \exp(A)$ for the additive functional $A$ in
+{eq}`eq:additive-functional` and let $\phi$ be a smooth, strictly positive
+function.
+
+Recall from {prf:ref}`lrr-def-extended-generator` that
+$\mathbb A\phi = \chi$ if
+
+$$
+    N_t = M_t\phi(X_t) - \phi(X_0) - \int_0^t M_s \chi(X_s)\, ds
+$$
+
+is a local martingale, so the task is to identify the predictable drift of
+$M_t\phi(X_t)$ and read off $\chi$.
+
+(a) Apply Itô's formula to $Y_t = \exp(A_t)\phi(X_t)$ between jumps and
+show that the continuous part of $dY_t$ has drift
+
+$$
+    M_t
+    \left[
+        (\xi + \Gamma\gamma)^\top
+            \frac{\partial \phi}{\partial x}
+        + \frac{1}{2}
+          \operatorname{trace}\!\left(
+              \Sigma \frac{\partial^2 \phi}{\partial x \partial x^\top}
+          \right)
+        + \left(\beta + \frac{\gamma^\top \gamma}{2}\right)\phi
+    \right] dt .
+$$
+
+(b) Show that at a jump time $t$ with $X_{t-}=x$ and $X_t=y$,
+
+$$
+    \Delta Y_t = M_{t-}\big[\exp[\kappa(y,x)]\phi(y) - \phi(x)\big] ,
+$$
+
+and conclude that the predictable jump compensator contributes drift
+
+$$
+    M_{t-}
+    \int
+        \big\{
+            \exp[\kappa(y,x)]\phi(y) - \phi(x)
+        \big\}
+        \eta(dy \mid x)\, dt .
+$$
+
+(c) Decompose
+
+$$
+    \exp[\kappa(y,x)]\phi(y) - \phi(x)
+    =
+    \exp[\kappa(y,x)]
+        \big[\phi(y) - \phi(x)\big]
+    + \big[\exp[\kappa(y,x)] - 1\big]\phi(x),
+$$
+
+combine the result with part (a), and read off $\mathbb A \phi$ to recover
+{eq}`eq:extended-generator`.
+```
+
+```{solution-start} lrr_ex4
+:class: dropdown
+```
+
+*(a)* Set $g(a, x) = e^a \phi(x)$, so that $Y_t = g(A_t, X_t)$.
+
+Between jumps, the continuous parts of $A$ and $X$ are
+
+$$
+    dA_t^c = \beta(X_t)\, dt + \gamma(X_t)^\top dB_t,
+    \qquad
+    dX_t^c = \xi(X_t)\, dt + \Gamma(X_t)\, dB_t ,
+$$
+
+with quadratic covariations
+
+$$
+    d\langle A^c, A^c\rangle_t = \gamma^\top \gamma\, dt,
+    \qquad
+    d\langle X^c, X^c\rangle_t = \Sigma\, dt,
+    \qquad
+    d\langle A^c, X^c\rangle_t = \Gamma\gamma\, dt .
+$$
+
+The partial derivatives of $g$ satisfy
+
+$$
+    \partial_a g = g,
+    \quad
+    \partial_{aa} g = g,
+    \quad
+    \partial_x g = e^a \frac{\partial \phi}{\partial x},
+    \quad
+    \partial_{ax} g = e^a \frac{\partial \phi}{\partial x},
+    \quad
+    \partial_{xx} g = e^a \frac{\partial^2 \phi}{\partial x \partial x^\top} .
+$$
+
+Itô's formula yields a continuous martingale part plus the drift
+
+$$
+\begin{aligned}
+    M_t \Big[
+        \beta \phi
+        + \tfrac{1}{2}\gamma^\top \gamma\, \phi
+        + \xi^\top \tfrac{\partial \phi}{\partial x}
+        + \tfrac{1}{2}\operatorname{trace}\!\big(\Sigma\, \tfrac{\partial^2 \phi}{\partial x \partial x^\top}\big)
+        + (\Gamma\gamma)^\top \tfrac{\partial \phi}{\partial x}
+    \Big]\, dt .
+\end{aligned}
+$$
+
+Grouping the gradient terms gives the expression in the question.
+
+*(b)* At a jump time $t$,
+$\Delta A_t = \kappa(X_t, X_{t-}) = \kappa(y, x)$, so
+$M_t = M_{t-}\exp[\kappa(y,x)]$ and
+
+$$
+    \Delta Y_t
+    = M_t \phi(X_t) - M_{t-}\phi(X_{t-})
+    = M_{t-}\big[\exp[\kappa(y,x)]\phi(y) - \phi(x)\big] .
+$$
+
+Compensating these jumps against the predictable intensity $\eta(dy \mid x)$
+gives the stated predictable drift.
+
+*(c)* Adding the jump drift from (b) to the continuous drift from (a), the
+predictable drift of $Y_t = M_t \phi(X_t)$ at state $x$ is $M_t\, \chi(x)\, dt$
+with
+
+$$
+\begin{aligned}
+    \chi(x)
+    &=
+    (\xi + \Gamma\gamma)^\top \frac{\partial \phi}{\partial x}
+    + \frac{1}{2}\operatorname{trace}\!\left(
+        \Sigma \frac{\partial^2 \phi}{\partial x \partial x^\top}
+      \right)
+    + \left(\beta + \frac{\gamma^\top \gamma}{2}\right)\phi(x)
+    \\
+    &\quad
+    + \int \exp[\kappa(y,x)]\big[\phi(y) - \phi(x)\big]\, \eta(dy \mid x)
+    + \phi(x)\int \big[\exp[\kappa(y,x)] - 1\big]\, \eta(dy \mid x) .
+\end{aligned}
+$$
+
+Collecting the terms multiplying $\phi(x)$ recovers
+{eq}`eq:extended-generator`, so $\chi = \mathbb A \phi$.
+
+```{solution-end}
+```

From 8ebf8f0dde7138f20a2d6ba07e22695d9b314586 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Thu, 28 May 2026 17:23:59 +1000
Subject: [PATCH 07/25] updates

---
 lectures/_static/quant-econ.bib    |   16 +
 lectures/long_run_risk_operator.md | 1022 ++++++++++++++++++----------
 lectures/ls_learning.bib           |   60 --
 lectures/rational_learning_re.bib  |  172 -----
 4 files changed, 697 insertions(+), 573 deletions(-)
 delete mode 100644 lectures/ls_learning.bib
 delete mode 100644 lectures/rational_learning_re.bib

diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 19bbd8f79..cadf56cd4 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -3976,6 +3976,13 @@ @article{GrossmanStiglitz1980
   pages   = {393--408}
 }
 
+@unpublished{GrossmanSonnenschein1982,
+  author  = {Grossman, Sanford J. and Sonnenschein, Hugo},
+  title   = {Notes on Expectations Equilibria in Bayesian Settings},
+  note    = {Working Paper},
+  year    = {1982}
+}
+
 @article{BlumeEasley1982,
   author  = {Blume, Lawrence E. and Easley, David},
   title   = {Learning to be Rational},
@@ -4016,6 +4023,15 @@ @article{Townsend1983b
   pages   = {546--588}
 }
 
+@techreport{ArrowGreen1973,
+  author      = {Arrow, Kenneth J. and Green, Jerry R.},
+  title       = {Notes on Expectations Equilibria in Bayesian Settings},
+  type        = {Working Paper},
+  year        = {1973},
+  number      = {33},
+  institution = {Institute for Mathematical Studies in the Social Sciences, Stanford University}
+}
+
 @article{Kobayashi1977,
   author  = {Kobayashi, Tetsuya},
   title   = {A Note on Fulfilled Expectations Equilibria},
diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index 1b88162f7..9152dd0ee 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -37,8 +37,8 @@ investors for instantaneous exposure to Brownian and jump shocks.
 Driving the time interval to zero gives a clean limiting object, but it
 describes only the *short end* of the term structure of risk prices.
 
-Hansen and Scheinkman instead study the *long end*: what happens as the time
-between valuation and payoff grows large.
+This lecture instead studies the *long end*: what happens as the time between
+valuation and payoff grows large.
 
 The two ends are complementary --- together they pin down the slope of the term
 structure of risk prices, and economic restrictions are often more reliable
@@ -51,7 +51,8 @@ The central object is a positive multiplicative functional $\{M_t\}_{t \geq 0}$,
 such as a stochastic discount factor, a cumulated return, a stochastic growth
 functional, or a product of discounting and growth.
 
-Under suitable conditions, $M$ admits the factorization
+When the right principal eigenfunction is selected, $M$ admits the
+factorization
 
 $$
     M_t
@@ -67,6 +68,13 @@ where
 * $\hat M$ is a martingale used to change probability measure, and
 * $\phi(X_0)/\phi(X_t)$ is a transient state-dependent component.
 
+The qualifier "right" matters because general state-space Markov models can
+have more than one positive eigenfunction.
+
+The economically useful eigenfunction is the one for which $\hat M$ is a true
+martingale and the Markov process remains stable under the probability measure
+twisted by $\hat M$.
+
 ```{prf:definition} Multiplicative Factorization
 :label: lrr-def-multiplicative-factorization
 
@@ -81,22 +89,23 @@ For long horizons, the scalar $\rho$ controls the exponential growth or decay
 rate of the relevant valuation semigroup, while $\phi$ controls the limiting
 dependence on the current Markov state.
 
-{cite:t}`AlvarezJermann2005` used a related permanent-transitory decomposition
+{cite:t}`AlvarezJermann2005` use a related permanent-transitory decomposition
 for stochastic discount factors.
 
-{cite:t}`HansenScheinkman2009` link this decomposition to principal
-eigenfunctions and use it to characterize long-run risk-return trade-offs.
+The key insight is that the decomposition can be constructed from principal
+eigenfunctions and used to characterize long-run risk-return trade-offs.
 
 This lecture covers
 
 * multiplicative functionals and valuation semigroups,
+* the pricing restriction that links stochastic discount factors to returns,
 * the extended generator associated with a multiplicative functional,
 * principal eigenfunctions and the Hansen-Scheinkman factorization,
 * a finite-state example where the analysis reduces to Perron-Frobenius theory,
 * an affine diffusion example, and
 * long-run risk prices for persistent growth shocks.
 
-We start with imports.
+We start with the following imports.
 
 ```{code-cell} ipython3
 import numpy as np
@@ -111,6 +120,28 @@ $\mathcal D_0$.
 
 Let $\mathcal F_t$ be the filtration generated by the history of $X$.
 
+We work with a strong Markov process whose sample paths are right-continuous
+with left limits.
+
+For the jump-diffusion formulas below, $X$ is a semimartingale with a
+continuous component
+
+$$
+    dX_t^c = \xi(X_{t-})dt + \Gamma(X_{t-})dB_t .
+$$
+
+The jump component has compensator $\eta(dy \mid X_{t-})dt$.
+
+We assume finite jump activity on finite time intervals to keep the notation
+simple.
+
+We also assume enough rank in $\Gamma$ that the Brownian shocks relevant for
+pricing can be recovered from the state history.
+
+These assumptions are not cosmetic: they let us write down the extended
+generator explicitly and apply changes of probability measure using
+martingales.
+
 ```{prf:definition} Functional
 :label: lrr-def-functional
 
@@ -160,11 +191,29 @@ $$ (eq:multiplicative)
 where $\theta_t$ shifts the underlying Markov path forward by $t$ units.
 ```
 
-For example, if $S_t$ is a stochastic discount factor, then
-$S_{t+u}/S_t$ is the date-$t$ discount factor for payoffs at date $t+u$.
+The pricing origin of {eq}`eq:multiplicative` is the law of one price.
+
+If $S_t$ is a stochastic discount factor, the date-$0$ value of a date-$t$
+payoff $\Pi_t$ is $E[S_t\Pi_t \mid \mathcal F_0]$.
+
+If the same payoff is purchased at an intermediate date $\tau$, its date-$\tau$
+price is
 
-The Markov version of this intertemporal consistency condition is exactly
-{eq}`eq:multiplicative`.
+$$
+    E\left[\frac{S_t}{S_\tau}\Pi_t \mid \mathcal F_\tau\right].
+$$
+
+For prices to be Markov in the current state, the ratio $S_t/S_\tau$ must
+depend only on the Markov path after $\tau$.
+Thus, in Markov form,
+
+$$
+    \frac{S_{\tau+u}}{S_\tau} = S_u(\theta_\tau) .
+$$
+
+This identity is precisely multiplicativity.
+
+The same structure is then used for stochastic growth and cumulated returns.
 
 When $M_t > 0$, we can write $M_t = \exp(A_t)$.
 
@@ -202,6 +251,26 @@ In this notation, $\beta$ is allowed to be positive or negative.
 For instance, a pure discount factor with short rate $r(X_t)$ has
 $\beta(x) = -r(x)$.
 
+We impose the integrability conditions needed for these objects to be well
+defined:
+
+$$
+    \int_0^t |\beta(X_s)|ds < \infty,
+    \qquad
+    \int_0^t \|\gamma(X_s)\|^2ds < \infty .
+$$
+
+These conditions must hold for every finite $t$; we also impose $\kappa(x,x)=0$ and
+
+$$
+    \int \exp[\kappa(y,x)]\eta(dy \mid x) < \infty.
+$$
+
+This parameterization is broad enough for the examples in this lecture, but it
+is not exhaustive.
+
+Occupation times and local times are also additive functionals.
+
 ## Semigroups
 
 A multiplicative functional $M$ together with the Markov process $X$
@@ -257,8 +326,15 @@ Economically, it is the Markov law of iterated values: the date-$0$ price of
 a date-$(t+u)$ payoff equals the date-$0$ price of holding the date-$t$
 price of that payoff.
 
-Multiplicativity of $M$ (at the path level) and the semigroup property of
-$\{\mathbb M_t\}$ (at the operator level) are the same condition.
+Path-level multiplicativity is the structural restriction that gives the
+operator semigroup property.
+
+Conversely, in a Markov pricing model where operators are represented by
+stochastic discount factor ratios, the semigroup property is the operator
+shadow of this same intertemporal consistency restriction.
+
+The path-level statement contains more information than the operator identity
+alone.
 
 ### Functionals we will use
 
@@ -278,26 +354,30 @@ flows that both grow and require discounting.
 :label: lrr-def-stochastic-discount-factor
 
 A **stochastic discount factor** $S$ is a positive multiplicative functional
-for which $E[S_t Z_t \mid X_0=x]$ gives the date-$0$ value of a payoff $Z_t$
-delivered at date $t$.
+for which $E[S_t Z_t \mid X_0=x]$ is the date-$0$ value of an
+$\mathcal F_t$-measurable payoff $Z_t$.
 ```
 
 ```{prf:definition} Valuation Functional
 :label: lrr-def-valuation-functional
 
 Given a stochastic discount factor $S$, a **valuation functional** $V$ is a
-multiplicative functional such that $\{V_tS_t : t \geq 0\}$ is a martingale.
+positive multiplicative functional such that $\{V_t S_t : t \geq 0\}$ is a
+martingale.
 ```
 
 ```{prf:definition} Stochastic Growth Functional
 :label: lrr-def-stochastic-growth-functional
 
-A **stochastic growth functional** $G$ is a positive multiplicative functional
-that scales a cash flow between dates.
+A **stochastic growth functional** $G$ is a positive multiplicative
+functional, interpreted as a multiplicative growth factor applied to a
+date-$0$ cash flow.
 ```
 
-To price a cash flow $D_t = D_0 G_t \psi(X_t)$, we discount with $S$ and grow
-with $G$, so its date-$0$ value is
+Consider a cash flow $D_t = D_0\, G_t\, \psi(X_t)$, where $D_0 > 0$ is the
+date-$0$ level and $\psi$ is a Borel state-payoff function.
+
+Discounting with $S$ and growing with $G$, its date-$0$ value is
 
 $$
     D_0 \mathbb Q_t \psi(X_0),
@@ -319,6 +399,114 @@ The long-horizon behaviour of $\mathbb Q_t$ is the central object of the
 lecture: it tells us how current prices value cash-flow growth risk that
 materializes far in the future.
 
+The split $D_t=D_0G_t\psi(X_t)$ is not unique.
+For any positive function $\varphi$,
+
+$$
+    D_t
+    =
+    D_0
+    \left[
+        G_t\frac{\varphi(X_t)}{\varphi(X_0)}
+    \right]
+    \left[
+        \frac{\psi(X_t)\varphi(X_0)}{\varphi(X_t)}
+    \right].
+$$
+
+Thus a transient state-dependent component can be moved between $G$ and
+$\psi$.
+
+We therefore normalize growth components so that their permanent part is
+represented by a martingale:
+
+$$
+    G_t = \exp(\delta t)\hat G_t .
+$$
+
+Here $\hat G$ is a martingale and $\delta$ is a constant conditional growth
+rate.
+
+The eigenfunction construction below explains how such martingale components
+can be extracted and which one is relevant for long-run valuation.
+
+### Local pricing restriction
+
+Before studying long horizons, it is useful to record the short-horizon
+risk-return relation.
+
+Let the stochastic discount factor $S$ be parameterized by
+$(\beta^s,\gamma^s,\kappa^s)$ and a valuation functional $V$ by
+$(\beta^v,\gamma^v,\kappa^v)$.
+
+The definition of a valuation functional requires $VS$ to be a martingale.
+
+For a positive multiplicative functional parameterized by
+$(\beta,\gamma,\kappa)$, the local martingale restriction is
+
+$$
+    \beta
+    + \frac{\gamma^\top\gamma}{2}
+    + \int \left(\exp[\kappa(y,\cdot)]-1\right)\eta(dy \mid \cdot)
+    = 0.
+$$ (eq:local-martingale-restriction)
+
+Applying this to $VS$ gives the local pricing restriction
+
+$$
+    \beta^v+\beta^s
+    =
+    -\frac{\|\gamma^v+\gamma^s\|^2}{2}
+    -
+    \int
+        \left(
+            \exp[\kappa^v(y,\cdot)+\kappa^s(y,\cdot)]-1
+        \right)
+        \eta(dy \mid \cdot).
+$$ (eq:local-pricing-restriction)
+
+The expected net rate of return on $V$ is
+
+$$
+    \epsilon^v
+    =
+    \beta^v
+    + \frac{\|\gamma^v\|^2}{2}
+    + \int
+        \left(\exp[\kappa^v(y,\cdot)]-1\right)
+        \eta(dy \mid \cdot).
+$$
+
+Combining this expression with {eq}`eq:local-pricing-restriction` gives
+
+$$
+\begin{aligned}
+    \epsilon^v
+    &=
+    -\beta^s
+    - \gamma^v \cdot \gamma^s
+    - \frac{\|\gamma^s\|^2}{2}
+    \\
+    &\quad
+    -
+    \int
+        \left(
+            \exp[\kappa^v(y,\cdot)+\kappa^s(y,\cdot)]
+            -
+            \exp[\kappa^v(y,\cdot)]
+        \right)
+        \eta(dy \mid \cdot).
+\end{aligned}
+$$ (eq:local-risk-return)
+
+Thus the Brownian local risk-price vector is $-\gamma^s(x)$, expressed in
+the same exposure units as $\gamma^v(x)$.
+
+Jump risk is priced through the function $\kappa^s$.
+
+This local relation is one end of the term structure of risk prices; the
+eigenvalue calculations below describe the other end.
+
 ## The generator
 
 So far we have a family of operators $\{\mathbb M_t\}_{t \geq 0}$, one for each
@@ -539,7 +727,8 @@ $$ (eq:generator-eigen)
 ```{prf:definition} Principal Eigenfunction
 :label: lrr-def-principal-eigenfunction
 
-A **principal eigenfunction** is an eigenfunction that is strictly positive on the state space.
+A **principal eigenfunction** is an eigenfunction $\phi$ that is strictly
+positive on the state space, i.e. $\phi(x) > 0$ for all $x \in \mathcal D_0$.
 ```
 
 To see why this expression is the natural object built from the eigenpair,
@@ -615,9 +804,17 @@ $$
 $$
 ```
 
-Rearranging {eq}`eq:mhat` gives the multiplicative factorization
-{eq}`eq:hs-factorization`, and taking expectations gives the semigroup
-eigenvalue equation
+The candidate $\hat M$ is always a nonnegative local martingale, hence a
+supermartingale.
+Therefore
+
+$$
+    \mathbb M_t \phi \leq \exp(\rho t)\phi.
+$$
+
+When $\hat M$ is a true martingale, taking expectations on both sides of
+{eq}`eq:mhat` and using $E\hat M_t = 1$ gives the semigroup eigenvalue
+equation
 
 $$
     \mathbb M_t \phi = \exp(\rho t)\phi,
@@ -629,38 +826,77 @@ $$ (eq:semigroup-eigen)
 The eigenpair $(\rho, \phi)$ controls *long-run* behaviour of $\mathbb M_t$
 only if the twisted process settles into a stationary regime.
 
-We need three notions: a stationary distribution, recurrence (the process
-revisits every set of interest), and irreducibility of a discretely sampled
-skeleton (so the recurrence is not an artefact of the continuous-time
-sampling).
+We need three conditions on the twisted process, applied in turn:
 
-```{prf:definition} Harris Recurrence
-:label: lrr-def-harris-recurrence
+* a **stationary distribution** $\hat\varsigma$ that the twisted dynamics
+  leave invariant --- the candidate long-run distribution;
+* **irreducibility of a discretely sampled skeleton** of $X$ relative to
+  $\hat\varsigma$ --- every region of positive $\hat\varsigma$-mass can be
+  reached from any starting point;
+* **Harris recurrence** of $X$ under the twisted measure --- every such region
+  is visited infinitely often, which guarantees that $\hat\varsigma$ is
+  unique.
 
-A Markov process with stationary distribution $\hat\varsigma$ is **Harris
-recurrent** if every Borel set with positive $\hat\varsigma$ measure is visited
-for an infinite amount of time with probability one from every initial state.
-```
+Let $\hat E$ and $\widehat{\Pr}$ denote expectation and probability under the
+twisted measure, and let $\hat{\mathbb A}$ be the generator of $X$ under that
+measure.
 
-```{prf:definition} Stochastically Stable Twisted Process
-:label: lrr-def-stochastic-stability
+```{prf:definition} Stationary Distribution of the Twisted Process
+:label: lrr-def-stationary-distribution
 
-The $\hat M$-twisted Markov process is **stochastically stable** if it has a
-stationary distribution $\hat\varsigma$ and is Harris recurrent under the
-probability measure induced by $\hat M$.
+A probability measure $\hat\varsigma$ on $\mathcal D_0$ is a **stationary
+distribution** for the twisted process if
+
+$$
+    \int \hat{\mathbb A}\psi\, d\hat\varsigma = 0
+$$
+
+for every $\psi$ in the $L^\infty$ domain of $\hat{\mathbb A}$.
 ```
 
 ```{prf:definition} Irreducible Skeleton
 :label: lrr-def-irreducible-skeleton
 
-A discretely sampled skeleton $\{X_{\Delta j}: j=0,1,\ldots\}$ is
-**irreducible** relative to $\hat\varsigma$ if every Borel set with positive
-$\hat\varsigma$ measure is reached with positive probability from every
-initial state.
+The process $X$ has an **irreducible** discretely sampled skeleton relative
+to $\hat\varsigma$ if there exists $\Delta > 0$ such that the skeleton
+$\{X_{\Delta j} : j = 0, 1, \ldots\}$ satisfies, for every Borel set
+$\Lambda \subseteq \mathcal D_0$ with $\hat\varsigma(\Lambda) > 0$,
+
+$$
+    \hat E\!\left[
+        \sum_{j=0}^\infty \mathbf 1_{\{X_{\Delta j} \in \Lambda\}}
+        \,\bigg|\, X_0 = x
+    \right] > 0
+    \qquad \text{for all } x \in \mathcal D_0 .
+$$
 ```
 
-When the twisted process is stochastically stable with an irreducible
-skeleton, the long-run approximation
+```{prf:definition} Harris Recurrence
+:label: lrr-def-harris-recurrence
+
+The process $X$ is **Harris recurrent** under the twisted measure if, for
+every Borel set $\Lambda \subseteq \mathcal D_0$ with $\hat\varsigma(\Lambda) > 0$,
+
+$$
+    \widehat{\Pr}\!\left\{
+        \int_0^\infty \mathbf 1_{\{X_t \in \Lambda\}}\, dt = \infty
+        \,\bigg|\, X_0 = x
+    \right\} = 1
+    \qquad \text{for all } x \in \mathcal D_0 .
+$$
+```
+
+```{prf:definition} Stochastically Stable Twisted Process
+:label: lrr-def-stochastic-stability
+
+The $\hat M$-twisted Markov process is **stochastically stable** if it has
+a stationary distribution $\hat\varsigma$, the skeleton
+$\{X_{\Delta j}\}$ is irreducible relative to $\hat\varsigma$, and $X$ is
+Harris recurrent under the twisted measure.
+```
+
+Under the martingale condition for $\hat M$, strict positivity of $M$, and
+the stability conditions above, the long-run approximation is
 
 $$
     \lim_{t \to \infty}
@@ -670,8 +906,17 @@ $$
     \int \frac{\psi}{\phi} d\hat\varsigma
 $$ (eq:long-run-limit)
 
-holds, where $\hat\varsigma$ is the stationary distribution of the twisted
-Markov process.
+Here $\hat\varsigma$ is the stationary distribution of the twisted Markov
+process.
+
+The mode of convergence depends on the payoff class.
+
+For any fixed sampling interval $\Delta>0$, convergence along
+$t=\Delta j$ holds for almost every initial state when
+$\int |\psi|/\phi\, d\hat\varsigma < \infty$.
+
+For all continuous times $t$, the pointwise statement holds when $\psi/\phi$
+is bounded.
 
 This is the formal sense in which $\rho$ is the long-run growth rate and
 $\phi$ is the long-run state dependence.
@@ -706,6 +951,9 @@ Let the multiplicative functional have
 * a jump multiplier $\exp[\kappa(x_j,x_i)]$ when the state jumps from $i$ to
   $j$.
 
+In the code below, `κ[j, i]` means $\kappa(x_j,x_i)$, the log multiplier for
+the transition from state $i$ to state $j$.
+
 The generator matrix $A$ for the multiplicative semigroup is
 
 $$
@@ -729,84 +977,59 @@ principal eigenvalue is the real eigenvalue of $A$ with largest real part.
 The associated right eigenvector is strictly positive.
 
 ```{code-cell} ipython3
-def build_generator(U, r, kappa):
-    """
-    Build the generator matrix for a finite-state multiplicative semigroup.
-
-    Parameters
-    ----------
-    U : array_like, shape (N, N)
-        Intensity matrix of the Markov chain.
-    r : array_like, shape (N,)
-        State-dependent decay rates.
-    kappa : array_like, shape (N, N)
-        kappa[j, i] is the log jump multiplier for a transition i -> j.
-
-    Returns
-    -------
-    A : ndarray, shape (N, N)
-        Generator of the multiplicative semigroup.
+def build_generator(U, r, κ):
+    """Generator matrix for a finite-state multiplicative semigroup.
+
+    κ[j, i] is the log jump multiplier for a transition i -> j.
     """
     U = np.asarray(U, dtype=float)
     r = np.asarray(r, dtype=float)
-    kappa = np.asarray(kappa, dtype=float)
-
-    N = U.shape[0]
-    A = np.empty_like(U)
-
-    for i in range(N):
-        for j in range(N):
-            if i == j:
-                A[i, i] = U[i, i] - r[i]
-            else:
-                A[i, j] = U[i, j] * np.exp(kappa[j, i])
+    κ = np.asarray(κ, dtype=float)
 
+    A = U * np.exp(κ.T)
+    np.fill_diagonal(A, np.diag(U) - r)
     return A
 
 
 def principal_eigenpair(A):
-    """
-    Compute the Perron eigenvalue and positive right eigenvector.
-    """
+    """Perron eigenpair with a positive right eigenvector."""
     vals, vecs = eig(A)
     idx = np.argmax(vals.real)
 
-    rho = vals[idx].real
-    phi = vecs[:, idx].real
+    ρ = vals[idx].real
+    φ = vecs[:, idx].real
 
-    if phi.sum() < 0:
-        phi = -phi
+    if φ.sum() < 0:
+        φ = -φ
 
-    # Remove tiny numerical sign errors.
-    if np.any(phi <= 0):
-        phi = np.abs(phi)
+    if np.any(φ <= 0):
+        if np.min(φ) > -1e-10:
+            φ = np.maximum(φ, 0)
+        else:
+            raise ValueError("Dominant eigenvector is not strictly positive.")
 
-    phi = phi / phi.mean()
-    return rho, phi
+    φ = φ / φ.mean()
+    return ρ, φ
 
 
-def twisted_generator(A, rho, phi):
-    """
-    Generator of the Markov process under the twisted measure.
-    """
-    D = np.diag(phi)
-    D_inv = np.diag(1 / phi)
-    return D_inv @ A @ D - rho * np.eye(len(phi))
+def twisted_generator(A, ρ, φ):
+    """Generator under the twisted measure."""
+    D = np.diag(φ)
+    D_inv = np.diag(1 / φ)
+    return D_inv @ A @ D - ρ * np.eye(len(φ))
 
 
 def stationary_distribution(Q):
-    """
-    Stationary distribution pi for a finite-state intensity matrix Q.
-    """
+    """Stationary distribution of a finite-state generator."""
     vals, vecs = eig(Q.T)
     idx = np.argmin(np.abs(vals))
-    pi = vecs[:, idx].real
+    π = vecs[:, idx].real
 
-    if pi.sum() < 0:
-        pi = -pi
+    if π.sum() < 0:
+        π = -π
 
-    pi = np.maximum(pi, 0)
-    return pi / pi.sum()
+    π = np.maximum(π, 0)
+    return π / π.sum()
 ```
 
 ### Two states
@@ -817,23 +1040,23 @@ The boom state switches to recession at rate $\lambda_1$, while recession
 switches to boom at rate $\lambda_2$.
 
 ```{code-cell} ipython3
-lambda_1 = 0.30
-lambda_2 = 0.50
+λ_1 = 0.30
+λ_2 = 0.50
 
-U = np.array([[-lambda_1,  lambda_1],
-              [ lambda_2, -lambda_2]])
+U = np.array([[-λ_1,  λ_1],
+              [ λ_2, -λ_2]])
 
 r = np.array([0.05, 0.02])
-kappa = np.zeros((2, 2))
+κ = np.zeros((2, 2))
 
-A = build_generator(U, r, kappa)
-rho, phi = principal_eigenpair(A)
+A = build_generator(U, r, κ)
+ρ, φ = principal_eigenpair(A)
 
 print("A =")
 print(np.round(A, 4))
-print(f"\nrho = {rho:.6f}")
-print(f"phi = {phi}")
-print(f"long-run zero-coupon yield = {-rho:.4f}")
+print(f"\nρ = {ρ:.6f}")
+print(f"φ = {φ}")
+print(f"long-run zero-coupon yield = {-ρ:.4f}")
 ```
 
 We can verify the eigenvalue equation
@@ -841,8 +1064,8 @@ $\mathbb M_t \phi = \exp(\rho t)\phi$.
 
 ```{code-cell} ipython3
 for t in [1.0, 5.0, 25.0]:
-    lhs = expm(t * A) @ phi
-    rhs = np.exp(rho * t) * phi
+    lhs = expm(t * A) @ φ
+    rhs = np.exp(ρ * t) * φ
     err = np.max(np.abs(lhs - rhs))
     print(f"t = {t:4.1f}, error = {err:.2e}")
 ```
@@ -851,15 +1074,15 @@ Next we compute the twisted generator and the stationary distribution
 $\hat\varsigma$ under the twisted probability measure.
 
 ```{code-cell} ipython3
-A_hat = twisted_generator(A, rho, phi)
-varsigma_hat = stationary_distribution(A_hat)
+A_hat = twisted_generator(A, ρ, φ)
+ς_hat = stationary_distribution(A_hat)
 
 print("twisted generator row sums:")
 print(np.round(A_hat.sum(axis=1), 12))
 
 print("\ntwisted stationary distribution:")
-print(f"  boom      {varsigma_hat[0]:.4f}")
-print(f"  recession {varsigma_hat[1]:.4f}")
+print(f"  boom      {ς_hat[0]:.4f}")
+print(f"  recession {ς_hat[1]:.4f}")
 ```
 
 For any payoff function $\psi$, the limit in {eq}`eq:long-run-limit` is
@@ -870,11 +1093,11 @@ $$
 $$
 
 ```{code-cell} ipython3
-psi = np.array([1.0, 2.0])
-limit = phi * np.sum((psi / phi) * varsigma_hat)
+ψ = np.array([1.0, 2.0])
+limit = φ * np.sum((ψ / φ) * ς_hat)
 
 for t in [1, 5, 20, 80]:
-    approx = np.exp(-rho * t) * expm(t * A) @ psi
+    approx = np.exp(-ρ * t) * expm(t * A) @ ψ
     print(f"t = {t:2d}, normalized value = {approx}")
 
 print("\nlimit =", limit)
@@ -888,20 +1111,20 @@ functional should be allowed to jump at the transition times.
 A natural case is a stochastic discount factor that pays out more when the
 economy switches into a boom and less when it switches into a recession.
 
-The matrix `kappa_jump` below encodes this: the functional jumps up on a
+The matrix `κ_jump` below encodes this: the functional jumps up on a
 recession-to-boom transition and down on a boom-to-recession transition.
 
 ```{code-cell} ipython3
-kappa_jump = np.array([[0.0,  0.30],
-                       [-0.20, 0.0]])
+κ_jump = np.array([[0.0,  0.30],
+                   [-0.20, 0.0]])
 
-A_jump = build_generator(U, r, kappa_jump)
-rho_jump, phi_jump = principal_eigenpair(A_jump)
+A_jump = build_generator(U, r, κ_jump)
+ρ_jump, φ_jump = principal_eigenpair(A_jump)
 
-print(f"rho without jumps = {rho:.6f}")
-print(f"rho with jumps    = {rho_jump:.6f}")
-print("\nphi with jumps:")
-print(phi_jump)
+print(f"ρ without jumps = {ρ:.6f}")
+print(f"ρ with jumps    = {ρ_jump:.6f}")
+print("\nφ with jumps:")
+print(φ_jump)
 ```
 
 To see how the long-run rate $\rho$ responds to jump risk, we hold the
@@ -909,18 +1132,18 @@ boom-to-recession multiplier fixed and trace out $\rho$ as the
 recession-to-boom multiplier varies.
 
 ```{code-cell} ipython3
-kappa_grid = np.linspace(-0.5, 0.5, 100)
-rho_grid = np.empty_like(kappa_grid)
+κ_grid = np.linspace(-0.5, 0.5, 100)
+ρ_grid = np.empty_like(κ_grid)
 
-for n, k in enumerate(kappa_grid):
-    kappa_temp = np.array([[0.0, k],
+for n, k in enumerate(κ_grid):
+    κ_temp = np.array([[0.0, k],
                            [-0.2, 0.0]])
-    A_temp = build_generator(U, r, kappa_temp)
-    rho_grid[n], _ = principal_eigenpair(A_temp)
+    A_temp = build_generator(U, r, κ_temp)
+    ρ_grid[n], _ = principal_eigenpair(A_temp)
 
 fig, ax = plt.subplots()
-ax.plot(kappa_grid, rho_grid, lw=2)
-ax.axhline(rho, color="black", ls="--", lw=1)
+ax.plot(κ_grid, ρ_grid, lw=2)
+ax.axhline(ρ, color="black", ls="--", lw=1)
 ax.axvline(0, color="black", ls=":", lw=1)
 ax.set_xlabel("jump log multiplier for recession to boom")
 ax.set_ylabel("principal eigenvalue")
@@ -989,8 +1212,15 @@ algebraic conditions on $(c_f, c_o, \rho)$.
 ```{prf:definition} Exponential-Affine Eigenfunction
 :label: lrr-def-exponential-affine-eigenfunction
 
-An **exponential-affine eigenfunction** is a positive eigenfunction whose
-logarithm is affine in the state variables.
+An eigenfunction $\phi$ of $\mathbb A$ on a state space $\mathcal D_0
+\subseteq \mathbb R^n$ is **exponential-affine** if
+
+$$
+    \phi(x) = \exp(c_0 + c^\top x)
+    \qquad \text{for all } x \in \mathcal D_0 ,
+$$
+
+for some constant $c_0 \in \mathbb R$ and vector $c \in \mathbb R^n$.
 ```
 
 Substituting
@@ -1064,53 +1294,50 @@ which must be positive.
 
 ```{code-cell} ipython3
 def solve_affine_eigenfunction(params):
-    """
-    Solve the exponential-affine eigenvalue problem.
-    """
-    xi_f = params["xi_f"]
+    """Solve the exponential-affine eigenvalue problem."""
+    ξ_f = params["ξ_f"]
     xbar_f = params["xbar_f"]
-    sigma_f = params["sigma_f"]
-    xi_o = params["xi_o"]
+    σ_f = params["σ_f"]
+    ξ_o = params["ξ_o"]
     xbar_o = params["xbar_o"]
-    sigma_o = params["sigma_o"]
-    beta_bar = params["beta_bar"]
-    beta_f = params["beta_f"]
-    beta_o = params["beta_o"]
-    gamma_f = params["gamma_f"]
-    gamma_o = params["gamma_o"]
+    σ_o = params["σ_o"]
+    β_bar = params["β_bar"]
+    β_f = params["β_f"]
+    β_o = params["β_o"]
+    γ_f = params["γ_f"]
+    γ_o = params["γ_o"]
 
-    co = beta_o / xi_o
+    co = β_o / ξ_o
 
-    disc = ((xi_f - gamma_f * sigma_f) ** 2
-            - sigma_f ** 2 * (2 * beta_f + gamma_f ** 2))
+    disc = ((ξ_f - γ_f * σ_f) ** 2
+            - σ_f ** 2 * (2 * β_f + γ_f ** 2))
 
     if disc < 0:
         raise ValueError("No real affine eigenfunction for these parameters.")
 
     root = np.sqrt(disc)
-    cf_plus = ((xi_f - gamma_f * sigma_f) + root) / sigma_f ** 2
-    cf_minus = ((xi_f - gamma_f * sigma_f) - root) / sigma_f ** 2
+    cf_roots = (
+        ((ξ_f - γ_f * σ_f) - root) / σ_f ** 2,
+        ((ξ_f - γ_f * σ_f) + root) / σ_f ** 2
+    )
 
     def mean_reversion(cf):
-        return xi_f - sigma_f * (gamma_f + cf * sigma_f)
-
-    candidates = [(cf_minus, mean_reversion(cf_minus)),
-                  (cf_plus, mean_reversion(cf_plus))]
+        return ξ_f - σ_f * (γ_f + cf * σ_f)
 
-    stationary = [(cf, mr) for cf, mr in candidates if mr > 0]
-
-    if not stationary:
+    for cf in cf_roots:
+        mr = mean_reversion(cf)
+        if mr > 0:
+            break
+    else:
         raise ValueError("Neither root gives a stationary twisted process.")
 
-    cf, mr = stationary[0]
-
-    rho = (beta_bar
-           + gamma_o ** 2 / 2
-           + cf * xi_f * xbar_f
-           + co * (xi_o * xbar_o + gamma_o * sigma_o)
-           + co ** 2 * sigma_o ** 2 / 2)
+    ρ = (β_bar
+         + γ_o ** 2 / 2
+         + cf * ξ_f * xbar_f
+         + co * (ξ_o * xbar_o + γ_o * σ_o)
+         + co ** 2 * σ_o ** 2 / 2)
 
-    return cf, co, rho, mr
+    return cf, co, ρ, mr
 ```
 
 ### A Breeden SDF
@@ -1159,58 +1386,58 @@ once $(\bar\beta,\beta_f,\beta_o,\gamma_f,\gamma_o)$ are specified.
 
 ```{code-cell} ipython3
 params_state = {
-    "xi_f": 0.70,
+    "ξ_f": 0.70,
     "xbar_f": 0.04,
-    "sigma_f": -0.20,
-    "xi_o": 0.50,
+    "σ_f": -0.20,
+    "ξ_o": 0.50,
     "xbar_o": 0.02,
-    "sigma_o": 0.01,
+    "σ_o": 0.01,
 }
 
 a = 4.0
 b = 0.03
-theta_f = 0.06
-theta_o = 0.02
+ϑ_f = 0.06
+ϑ_o = 0.02
 
 params_sdf = {
     **params_state,
-    "beta_bar": -b,
-    "beta_f": 0.0,
-    "beta_o": -a,
-    "gamma_f": -a * theta_f,
-    "gamma_o": -a * theta_o,
+    "β_bar": -b,
+    "β_f": 0.0,
+    "β_o": -a,
+    "γ_f": -a * ϑ_f,
+    "γ_o": -a * ϑ_o,
 }
 
-cf_s, co_s, rho_s, mr_s = solve_affine_eigenfunction(params_sdf)
+cf_s, co_s, ρ_s, mr_s = solve_affine_eigenfunction(params_sdf)
 
-print("principal eigenfunction phi(xf, xo) = exp(cf xf + co xo)")
+print("principal eigenfunction φ(xf, xo) = exp(cf xf + co xo)")
 print(f"cf = {cf_s:.6f}")
 print(f"co = {co_s:.6f}")
-print(f"rho = {rho_s:.6f}")
+print(f"ρ = {ρ_s:.6f}")
 print(f"twisted mean-reversion coefficient for Xf = {mr_s:.6f}")
-print(f"long-run zero-coupon yield = {-rho_s:.4f}")
+print(f"long-run zero-coupon yield = {-ρ_s:.4f}")
 ```
 
 The rejected root for $c_f$ would make the twisted volatility process
 explosive rather than stationary.
 
 ```{code-cell} ipython3
-xi_f = params_sdf["xi_f"]
-sigma_f = params_sdf["sigma_f"]
-gamma_f = params_sdf["gamma_f"]
-beta_f = params_sdf["beta_f"]
+ξ_f = params_sdf["ξ_f"]
+σ_f = params_sdf["σ_f"]
+γ_f = params_sdf["γ_f"]
+β_f = params_sdf["β_f"]
 
-disc = ((xi_f - gamma_f * sigma_f) ** 2
-        - sigma_f ** 2 * (2 * beta_f + gamma_f ** 2))
+disc = ((ξ_f - γ_f * σ_f) ** 2
+        - σ_f ** 2 * (2 * β_f + γ_f ** 2))
 root = np.sqrt(disc)
 
 cf_candidates = np.array([
-    ((xi_f - gamma_f * sigma_f) - root) / sigma_f ** 2,
-    ((xi_f - gamma_f * sigma_f) + root) / sigma_f ** 2
+    ((ξ_f - γ_f * σ_f) - root) / σ_f ** 2,
+    ((ξ_f - γ_f * σ_f) + root) / σ_f ** 2
 ])
 
 for cf in cf_candidates:
-    mr = xi_f - sigma_f * (gamma_f + cf * sigma_f)
+    mr = ξ_f - σ_f * (γ_f + cf * σ_f)
     print(f"cf = {cf:8.4f}, twisted mean reversion = {mr:8.4f}")
 ```
 
@@ -1260,77 +1487,72 @@ The code below simulates the state and constructs the three factors in
 {eq}`eq:hs-factorization`.
 
 ```{code-cell} ipython3
-def simulate_states(params, T=40.0, dt=0.01, seed=1234):
-    """
-    Euler simulation of the affine state process.
-    """
+def brownian_increments(n, dt, seed=1234):
+    """Independent Brownian increments for the simulation."""
     rng = np.random.default_rng(seed)
+    draws = rng.normal(scale=np.sqrt(dt), size=(n, 2))
+    return draws[:, 0], draws[:, 1]
+
 
+def simulate_states(params, T=40.0, dt=0.01, seed=1234):
+    """Euler simulation of the affine state process."""
     n = int(T / dt)
     t = np.linspace(0, T, n + 1)
     Xf = np.empty(n + 1)
     Xo = np.empty(n + 1)
+    dBf, dBo = brownian_increments(n, dt, seed)
 
     Xf[0] = params["xbar_f"]
     Xo[0] = params["xbar_o"]
 
-    for k in range(n):
+    for k, (dBf_k, dBo_k) in enumerate(zip(dBf, dBo)):
         xf = max(Xf[k], 1e-10)
         xo = Xo[k]
 
-        dBf = rng.normal() * np.sqrt(dt)
-        dBo = rng.normal() * np.sqrt(dt)
-
         Xf[k + 1] = (xf
-                     + params["xi_f"] * (params["xbar_f"] - xf) * dt
-                     + np.sqrt(xf) * params["sigma_f"] * dBf)
+                     + params["ξ_f"] * (params["xbar_f"] - xf) * dt
+                     + np.sqrt(xf) * params["σ_f"] * dBf_k)
         Xf[k + 1] = max(Xf[k + 1], 1e-10)
 
         Xo[k + 1] = (xo
-                     + params["xi_o"] * (params["xbar_o"] - xo) * dt
-                     + params["sigma_o"] * dBo)
+                     + params["ξ_o"] * (params["xbar_o"] - xo) * dt
+                     + params["σ_o"] * dBo_k)
 
-    return t, Xf, Xo
+    return t, Xf, Xo, dBf, dBo
 
 
-def additive_log_M(params, t, Xf, Xo, seed=1234):
-    """
-    Recompute the Brownian increments used in simulate_states and construct A_t.
-    """
-    rng = np.random.default_rng(seed)
+def additive_log_M(params, t, Xf, Xo, dBf, dBo):
+    """Additive log functional along a simulated path."""
     dt = t[1] - t[0]
     A = np.zeros_like(t)
 
-    for k in range(len(t) - 1):
+    for k, (dBf_k, dBo_k) in enumerate(zip(dBf, dBo)):
         xf = max(Xf[k], 1e-10)
         xo = Xo[k]
 
-        dBf = rng.normal() * np.sqrt(dt)
-        dBo = rng.normal() * np.sqrt(dt)
-
-        drift = (params["beta_bar"]
-                 + params["beta_f"] * xf
-                 + params["beta_o"] * xo)
+        drift = (params["β_bar"]
+                 + params["β_f"] * xf
+                 + params["β_o"] * xo)
 
-        shock = (np.sqrt(xf) * params["gamma_f"] * dBf
-                 + params["gamma_o"] * dBo)
+        shock = (np.sqrt(xf) * params["γ_f"] * dBf_k
+                 + params["γ_o"] * dBo_k)
 
         A[k + 1] = A[k] + drift * dt + shock
 
     return A
 
 
-t, Xf, Xo = simulate_states(params_sdf)
-A_log = additive_log_M(params_sdf, t, Xf, Xo)
+t, Xf, Xo, dBf, dBo = simulate_states(params_sdf)
+A_log = additive_log_M(params_sdf, t, Xf, Xo, dBf, dBo)
 
-phi0 = np.exp(cf_s * Xf[0] + co_s * Xo[0])
-phit = np.exp(cf_s * Xf + co_s * Xo)
+φ_0 = np.exp(cf_s * Xf[0] + co_s * Xo[0])
+φ_t = np.exp(cf_s * Xf + co_s * Xo)
 
 M = np.exp(A_log)
-M_hat = np.exp(-rho_s * t) * M * phit / phi0
-transient = phi0 / phit
+M_hat = np.exp(-ρ_s * t) * M * φ_t / φ_0
+transient = φ_0 / φ_t
 
-identity_error = np.max(np.abs(M - np.exp(rho_s * t) * M_hat * transient))
+identity_error = np.max(np.abs(M - np.exp(ρ_s * t) * M_hat * transient))
 print(f"maximum factorization error = {identity_error:.2e}")
 ```
 
@@ -1346,13 +1568,13 @@ axes[0, 1].set_title("$X_t^o$")
 axes[0, 1].set_xlabel("$t$")
 
 axes[1, 0].plot(t, M, label="$M_t$")
-axes[1, 0].plot(t, np.exp(rho_s * t) * M_hat * transient,
+axes[1, 0].plot(t, np.exp(ρ_s * t) * M_hat * transient,
                 "--", label="factorization")
 axes[1, 0].set_title("Multiplicative Factorization")
 axes[1, 0].set_xlabel("$t$")
 axes[1, 0].legend()
 
-axes[1, 1].plot(t, np.exp(rho_s * t), label="$\\exp(\\rho t)$")
+axes[1, 1].plot(t, np.exp(ρ_s * t), label="$\\exp(\\rho t)$")
 axes[1, 1].plot(t, M_hat, label="$\\hat M_t$", alpha=0.8)
 axes[1, 1].plot(t, transient, label="$\\phi(X_0)/\\phi(X_t)$", alpha=0.8)
 axes[1, 1].set_title("Three Components")
@@ -1378,17 +1600,24 @@ Our aim is to compare the two.
 ```{prf:definition} Local Brownian Risk Price
 :label: lrr-def-local-brownian-risk-price
 
-The **local price** of exposure to a Brownian shock with SDF loading
-$\gamma_i^s$ is $-\gamma_i^s$.
+The **local Brownian risk price** is the state-dependent vector
+$-\gamma^s(x)$, which prices exposure measured in the same Brownian units as
+the valuation functional loading $\gamma^v(x)$.
 ```
 
 ```{prf:definition} Long-Run Risk Price
 :label: lrr-def-long-run-risk-price
 
-In the cash-flow valuation problem, the **long-run risk price** for exposure
-$\gamma_i^g$ is the marginal change
-$\partial R_\infty / \partial \gamma_i^g$ in the asymptotic required return
-$R_\infty=-\rho+\delta$.
+For cash-flow growth risk, the **long-run risk price** is the marginal change
+in the asymptotic required return $R_\infty$ with respect to the cash-flow
+growth exposure, where
+
+$$
+    R_\infty = -\rho+\delta .
+$$
+
+Equivalently, it is the negative of the marginal change in the principal
+eigenvalue of the $GS$ semigroup, because $\delta$ is held fixed.
 ```
 
 The local Brownian price is read off the SDF directly: for a valuation
@@ -1396,9 +1625,47 @@ functional with Brownian exposure $\gamma^v$, the Brownian part of the local
 required expected return is $-\gamma^v \cdot \gamma^s$, so a unit of
 $\gamma^v_i$ exposure is priced at $-\gamma^s_i$.
 
-The long-run price requires solving the principal eigenvalue problem for the
-$GS$ semigroup, since it depends on how a shock propagates through the
-persistent state.
+The long-run price requires solving the principal eigenvalue problem, since
+it depends on how a shock propagates through the persistent state.
+
+There are two related long-run frontiers.
+
+For a valuation frontier, set $M=V$: choose the return exposure
+$(\gamma^v,\kappa^v)$, use the local pricing restriction to determine
+$\beta^v$, and compute the principal eigenvalue of the $V$ semigroup.
+
+For a cash-flow frontier, set $M=GS$: choose the growth exposure in $G$ and
+compute the principal eigenvalue of the valuation semigroup for growing cash
+flows.
+
+These frontiers coincide in simple log-normal examples for some shocks, but
+they can differ with stochastic volatility, nonlinear dynamics, or jump risk.
+
+### Stochastic discount factor decomposition
+
+A useful benchmark is the case $M=S$.
+The factorization becomes
+
+$$
+    S_t
+    =
+    \exp(\rho t)\hat M_t
+    \frac{\phi(X_0)}{\phi(X_t)} .
+$$
+
+This is the permanent-transitory decomposition emphasized by
+{cite:t}`AlvarezJermann2005`, now linked to a principal eigenfunction.
+For a long zero-coupon bond,
+
+$$
+    \exp(-\rho t)E[S_t \mid X_0=x]
+    \to
+    \phi(x)
+    \int \frac{1}{\phi}\, d\hat\varsigma .
+$$
+
+Thus prices of very long maturity discount bonds depend on the current state
+primarily through the eigenfunction $\phi$.
 
 ### Comparison in the affine model
 
@@ -1420,14 +1687,20 @@ the persistent growth predictor $X^o$, and because $X^o$ mean reverts at rate
 $\xi_o$, the cumulative effect of the shock is larger when $\xi_o$ is
 smaller.
 
+The local price of $B^f$ exposure is state dependent because the exposure is
+scaled by $\sqrt{X^f_t}$.
+
+The long-run price of $B^f$ exposure is nonlinear in general because the
+coefficient $c_f$ of the principal eigenfunction solves a quadratic equation.
+
 ```{code-cell} ipython3
-gamma_s_o = params_sdf["gamma_o"]
-beta_s_o = params_sdf["beta_o"]
-xi_o = params_sdf["xi_o"]
-sigma_o = params_sdf["sigma_o"]
+γ_s_o = params_sdf["γ_o"]
+β_s_o = params_sdf["β_o"]
+ξ_o = params_sdf["ξ_o"]
+σ_o = params_sdf["σ_o"]
 
-local_price_o = -gamma_s_o
-long_run_price_o = -gamma_s_o - (beta_s_o / xi_o) * sigma_o
+local_price_o = -γ_s_o
+long_run_price_o = -γ_s_o - (β_s_o / ξ_o) * σ_o
 
 print(f"local price of B^o exposure    = {local_price_o:.4f}")
 print(f"long-run price of B^o exposure = {long_run_price_o:.4f}")
@@ -1437,13 +1710,13 @@ The next cell illustrates how persistence changes the wedge between local and
 long-run prices.
 
 ```{code-cell} ipython3
-xi_o_grid = np.array([0.10, 0.20, 0.50, 1.00, 2.00, 5.00])
-local_grid = np.full_like(xi_o_grid, local_price_o)
-long_grid = -gamma_s_o - (beta_s_o / xi_o_grid) * sigma_o
+ξ_o_grid = np.array([0.10, 0.20, 0.50, 1.00, 2.00, 5.00])
+local_grid = np.full_like(ξ_o_grid, local_price_o)
+long_grid = -γ_s_o - (β_s_o / ξ_o_grid) * σ_o
 
 fig, ax = plt.subplots()
-ax.plot(xi_o_grid, local_grid, "--", lw=2, label="local")
-ax.plot(xi_o_grid, long_grid, "o-", lw=2, label="long-run")
+ax.plot(ξ_o_grid, local_grid, "--", lw=2, label="local")
+ax.plot(ξ_o_grid, long_grid, "o-", lw=2, label="long-run")
 ax.set_xscale("log")
 ax.set_xlabel("mean-reversion speed $\\xi_o$")
 ax.set_ylabel("risk price")
@@ -1474,6 +1747,15 @@ A_t^g
 $$ (eq:growth-functional)
 
 The last line makes $\exp(A_t^g-\delta t)$ a martingale.
+To keep the growth-twisted square-root volatility process stationary, the
+cash-flow exposure to $B^f$ must also satisfy the Feller-type restriction
+
+$$
+    2(\xi_f+\sigma_f\gamma_f^g)\bar x_f \geq \sigma_f^2 .
+$$
+
+This is one example of a general point: changing growth risk can destroy the
+stability conditions needed for a long-run approximation.
 
 To price the cash flow $D_t=D_0G_t\psi(X_t)$, use the semigroup generated by
 $M=GS$.
@@ -1501,33 +1783,33 @@ $$
 is the asymptotic required return net of the cash-flow growth rate.
 
 ```{code-cell} ipython3
-def required_return_for_growth_exposure(gamma_g_o, gamma_g_f=0.0, delta=0.02):
-    """
-    Long-run required return -rho + delta for a cash-flow growth exposure.
-    """
+def required_return_for_growth_exposure(γ_g_o, γ_g_f=0.0, δ=0.02):
+    """Long-run required return for a cash-flow growth exposure."""
     p = dict(params_sdf)
-    p["beta_bar"] = params_sdf["beta_bar"] + delta - 0.5 * gamma_g_o ** 2
-    p["beta_f"] = params_sdf["beta_f"] - 0.5 * gamma_g_f ** 2
-    p["beta_o"] = params_sdf["beta_o"]
-    p["gamma_f"] = params_sdf["gamma_f"] + gamma_g_f
-    p["gamma_o"] = params_sdf["gamma_o"] + gamma_g_o
+    p.update({
+        "β_bar": params_sdf["β_bar"] + δ - 0.5 * γ_g_o ** 2,
+        "β_f": params_sdf["β_f"] - 0.5 * γ_g_f ** 2,
+        "β_o": params_sdf["β_o"],
+        "γ_f": params_sdf["γ_f"] + γ_g_f,
+        "γ_o": params_sdf["γ_o"] + γ_g_o,
+    })
 
-    _, _, rho, _ = solve_affine_eigenfunction(p)
-    return -rho + delta
+    _, _, ρ, _ = solve_affine_eigenfunction(p)
+    return -ρ + δ
 
 
-gamma_g_o_grid = np.linspace(-0.5, 0.5, 101)
+γ_g_o_grid = np.linspace(-0.5, 0.5, 101)
 required_returns = np.array([
-    required_return_for_growth_exposure(g) for g in gamma_g_o_grid
+    required_return_for_growth_exposure(g) for g in γ_g_o_grid
 ])
 
-local_line = (-params_sdf["beta_bar"]
-              + local_price_o * gamma_g_o_grid)
+local_line = (-params_sdf["β_bar"]
+              + local_price_o * γ_g_o_grid)
 
 fig, ax = plt.subplots()
-ax.plot(gamma_g_o_grid, required_returns, lw=2,
+ax.plot(γ_g_o_grid, required_returns, lw=2,
         label="long-run required return")
-ax.plot(gamma_g_o_grid, local_line, "--", lw=2,
+ax.plot(γ_g_o_grid, local_line, "--", lw=2,
         label="local slope")
 ax.set_xlabel("cash-flow exposure $\\gamma_o^g$")
 ax.set_ylabel("rate of return")
@@ -1550,15 +1832,20 @@ print(f"formula                 = {long_run_price_o:.6f}")
 
 ## Perron-Frobenius dominance
 
-The long-run limit {eq}`eq:long-run-limit` works because the principal
-eigenvalue $\rho$ strictly dominates the real part of every other eigenvalue
-of $\mathbb A$.
+In a finite-state chain, the long-run limit {eq}`eq:long-run-limit` is the
+Perron-Frobenius theorem in action.
 
-In a finite-state chain this is just the Perron-Frobenius theorem: the
-positive generator $A$ in {eq}`eq:finite-a` has a unique largest real
-eigenvalue, and contributions from the remaining eigenvalues decay at an
-exponential rate equal to the gap between $\rho$ and the next-largest real
-part.
+The positive semigroup generated by $A$ in {eq}`eq:finite-a` has a unique
+dominant real eigenvalue, and contributions from the remaining eigenvalues
+decay at an exponential rate equal to the gap between $\rho$ and the
+next-largest real part.
+
+For general state spaces, the argument is not simply a finite-dimensional
+spectral-gap argument.
+
+The martingale component $\hat M$ changes probability measure, and stability
+of the twisted process selects the eigenfunction that actually governs the
+long-run approximation.
 
 We illustrate this on a three-state chain and read off the spectral gap
 directly.
@@ -1571,16 +1858,16 @@ U3 = np.array([[-0.40,  0.30,  0.10],
                [ 0.10,  0.20, -0.30]])
 
 r3 = np.array([0.06, 0.04, 0.01])
-kappa3 = np.zeros((3, 3))
+κ3 = np.zeros((3, 3))
 
-A3 = build_generator(U3, r3, kappa3)
-rho3, phi3 = principal_eigenpair(A3)
-A3_hat = twisted_generator(A3, rho3, phi3)
-varsigma3 = stationary_distribution(A3_hat)
+A3 = build_generator(U3, r3, κ3)
+ρ3, φ3 = principal_eigenpair(A3)
+A3_hat = twisted_generator(A3, ρ3, φ3)
+ς3 = stationary_distribution(A3_hat)
 
-print(f"rho = {rho3:.6f}")
-print(f"phi = {phi3}")
-print(f"varsigma_hat = {varsigma3}")
+print(f"ρ = {ρ3:.6f}")
+print(f"φ = {φ3}")
+print(f"ς_hat = {ς3}")
 
 eigs3 = np.sort(eig(A3, right=False).real)[::-1]
 print("eigenvalues by real part:")
@@ -1588,7 +1875,7 @@ print(np.round(eigs3, 6))
 ```
 
 ```{code-cell} ipython3
-psi_list = {
+ψ_list = {
     "$\\psi=(1,0,0)$": np.array([1.0, 0.0, 0.0]),
     "$\\psi=(0,1,0)$": np.array([0.0, 1.0, 0.0]),
     "$\\psi=(1,2,3)$": np.array([1.0, 2.0, 3.0]),
@@ -1599,16 +1886,15 @@ colors = ["C0", "C1", "C2"]
 
 fig, axes = plt.subplots(1, 3, figsize=(14, 4))
 
-for ax, (label, psi) in zip(axes, psi_list.items()):
-    limit = phi3 * np.sum((psi / phi3) * varsigma3)
+for ax, (label, ψ) in zip(axes, ψ_list.items()):
+    limit = φ3 * np.sum((ψ / φ3) * ς3)
+    values = np.array([
+        np.exp(-ρ3 * t_val) * expm(t_val * A3) @ ψ
+        for t_val in t_grid
+    ])
 
     for i, color in enumerate(colors):
-        path = []
-        for t_val in t_grid:
-            value = np.exp(-rho3 * t_val) * expm(t_val * A3) @ psi
-            path.append(value[i])
-
-        ax.plot(t_grid, path, color=color, lw=1.5,
+        ax.plot(t_grid, values[:, i], color=color, lw=1.5,
                 label=state_names[i])
         ax.axhline(limit[i], color=color, ls="--", lw=1)
 
@@ -1626,6 +1912,46 @@ $\exp(-\rho t)\mathbb M_t \psi$ converges to the dashed horizontal line --- the
 long-run limit $\phi \int (\psi/\phi)\, d\hat\varsigma$ --- at a rate
 controlled by the spectral gap.
 
+## Assumptions behind the scenes
+
+The examples above make the eigenfunction calculation look mechanical.
+
+Several things can go wrong in general state spaces.
+
+First, a positive eigenfunction only gives a nonnegative local martingale
+$\hat M$.
+
+It must be a true martingale before it can define a probability measure.
+
+A useful sufficient condition is a two-sided Girsanov construction in which
+the Brownian drift and jump compensator implied by $\hat M$ define a
+well-behaved distorted Markov process and the reverse density is locally
+integrable.
+
+Second, the twisted Markov process must be stable.
+
+Stationarity alone is not enough for the long-run limit, so we also use
+irreducibility of a sampled skeleton and Harris recurrence.
+
+These conditions eliminate spurious positive eigenfunctions.
+
+In the affine example, this is why we reject the root that makes the
+square-root process explosive under the twisted measure.
+
+Third, existence of a principal eigenfunction is not automatic in a general
+state space.
+
+Useful sufficient conditions use drift or Lyapunov bounds such as
+
+$$
+    \frac{\mathbb A V}{V} \leq a
+$$
+
+for a function $V \geq 1$, plus irreducibility of a resolvent operator.
+
+Finite-state Perron-Frobenius theory and the affine closed-form solution are
+special cases where these issues are easy to verify directly.
+
 ## Summary
 
 The Hansen-Scheinkman approach studies long-run risk by studying
@@ -1635,13 +1961,20 @@ The main steps are:
 
 1. Model discounting, growth, or cumulated returns by a positive
    multiplicative functional $M$.
+
 2. Build the semigroup
    $\mathbb M_t\psi(x)=E[M_t\psi(X_t)\mid X_0=x]$.
-3. Solve the principal eigenvalue problem
+
+3. Solve the local pricing restriction when $M$ is a valuation or
+   cash-flow valuation object.
+
+4. Solve the principal eigenvalue problem
    $\mathbb A\phi=\rho\phi$.
-4. Use the factorization
+
+5. Use the factorization
    $M_t=\exp(\rho t)\hat M_t\phi(X_0)/\phi(X_t)$.
-5. Under the twisted probability measure induced by $\hat M$, use stability
+
+6. Under the twisted probability measure induced by $\hat M$, use stability
    to obtain long-run approximations of the form {eq}`eq:long-run-limit`.
 
 In finite-state problems, this is Perron-Frobenius theory.
@@ -1673,21 +2006,23 @@ $$
 Let the multiplicative functional have decay rate $r_1>0$ in state 1, decay
 rate $r_2=0$ in state 2, and no jumps.
 
-(a) Write down the generator matrix $A$.
+a. Write down the generator matrix $A$.
 
-(b) Find the principal eigenvalue $\rho$ in terms of $\lambda$, $\mu$, and
+b. Find the principal eigenvalue $\rho$ in terms of $\lambda$, $\mu$, and
 $r_1$.
 
-(c) Verify numerically with $\lambda=0.4$, $\mu=0.6$, and $r_1=0.05$.
+c. Verify numerically with $\lambda=0.4$, $\mu=0.6$, and $r_1=0.05$.
 
-(d) Show that $-r_1 < \rho < 0$.
+d. Show that $-r_1 < \rho < 0$.
 ```
 
 ```{solution-start} lrr_ex1
 :class: dropdown
 ```
 
-*(a)* The generator is
+Here is one solution:
+
+*a.* The generator is
 
 $$
 A =
@@ -1697,7 +2032,7 @@ A =
 \end{pmatrix}.
 $$
 
-*(b)* The characteristic equation is
+*b.* The characteristic equation is
 
 $$
     \rho^2 + (\lambda+\mu+r_1)\rho + \mu r_1 = 0.
@@ -1714,28 +2049,28 @@ $$
 }{2}.
 $$
 
-*(c)* Numerical verification:
+*c.* Numerical verification:
 
 ```{code-cell} ipython3
-lam, mu, r1 = 0.4, 0.6, 0.05
+λ, μ, r1 = 0.4, 0.6, 0.05
 
-disc = (lam + mu + r1) ** 2 - 4 * mu * r1
-rho_formula = (-(lam + mu + r1) + np.sqrt(disc)) / 2
+disc = (λ + μ + r1) ** 2 - 4 * μ * r1
+ρ_formula = (-(λ + μ + r1) + np.sqrt(disc)) / 2
 
-U_ex = np.array([[-lam, lam],
-                 [mu, -mu]])
+U_ex = np.array([[-λ, λ],
+                 [μ, -μ]])
 r_ex = np.array([r1, 0.0])
-kappa_ex = np.zeros((2, 2))
+κ_ex = np.zeros((2, 2))
 
-A_ex = build_generator(U_ex, r_ex, kappa_ex)
-rho_numeric, phi_numeric = principal_eigenpair(A_ex)
+A_ex = build_generator(U_ex, r_ex, κ_ex)
+ρ_numeric, φ_numeric = principal_eigenpair(A_ex)
 
-print(f"formula  rho = {rho_formula:.8f}")
-print(f"numeric  rho = {rho_numeric:.8f}")
-print(f"difference   = {abs(rho_formula-rho_numeric):.2e}")
+print(f"formula  ρ = {ρ_formula:.8f}")
+print(f"numeric  ρ = {ρ_numeric:.8f}")
+print(f"difference   = {abs(ρ_formula-ρ_numeric):.2e}")
 ```
 
-*(d)* Let
+*d.* Let
 
 $$
 q(x)=x^2+(\lambda+\mu+r_1)x+\mu r_1.
@@ -1789,18 +2124,20 @@ Explain why the two prices converge as $\xi_o \to \infty$.
 :class: dropdown
 ```
 
+Here is one solution:
+
 ```{code-cell} ipython3
-xi_vals = np.array([0.1, 0.2, 0.5, 1.0, 2.0, 5.0])
-local_vals = np.full_like(xi_vals, -params_sdf["gamma_o"])
-long_vals = (-params_sdf["gamma_o"]
-             - (params_sdf["beta_o"] / xi_vals) * params_sdf["sigma_o"])
+ξ_vals = np.array([0.1, 0.2, 0.5, 1.0, 2.0, 5.0])
+local_vals = np.full_like(ξ_vals, -params_sdf["γ_o"])
+long_vals = (-params_sdf["γ_o"]
+             - (params_sdf["β_o"] / ξ_vals) * params_sdf["σ_o"])
 
-for xi, lp, lrp in zip(xi_vals, local_vals, long_vals):
-    print(f"xi_o = {xi:3.1f}: local = {lp:.4f}, long-run = {lrp:.4f}")
+for ξ, lp, lrp in zip(ξ_vals, local_vals, long_vals):
+    print(f"ξ_o = {ξ:3.1f}: local = {lp:.4f}, long-run = {lrp:.4f}")
 
 fig, ax = plt.subplots()
-ax.plot(xi_vals, local_vals, "--", lw=2, label="local")
-ax.plot(xi_vals, long_vals, "o-", lw=2, label="long-run")
+ax.plot(ξ_vals, local_vals, "--", lw=2, label="local")
+ax.plot(ξ_vals, long_vals, "o-", lw=2, label="long-run")
 ax.set_xscale("log")
 ax.set_xlabel("$\\xi_o$")
 ax.set_ylabel("risk price")
@@ -1823,13 +2160,13 @@ long-run price converges to the local price.
 
 Using the three-state example, let $\psi=(3,1,2)$.
 
-(a) Compute the theoretical limit
+a. Compute the theoretical limit
 
 $$
     \phi \sum_i \frac{\psi_i}{\phi_i}\hat\varsigma_i .
 $$
 
-(b) Plot
+b. Plot
 
 $$
     \max_i
@@ -1842,7 +2179,7 @@ $$
 
 on a logarithmic scale.
 
-(c) Compare the convergence rate to the spectral gap between the largest and
+c. Compare the convergence rate to the spectral gap between the largest and
 second-largest real parts of the eigenvalues of $A$.
 ```
 
@@ -1850,20 +2187,21 @@ second-largest real parts of the eigenvalues of $A$.
 :class: dropdown
 ```
 
+Here is one solution:
+
 ```{code-cell} ipython3
-psi = np.array([3.0, 1.0, 2.0])
-limit = phi3 * np.sum((psi / phi3) * varsigma3)
+ψ = np.array([3.0, 1.0, 2.0])
+limit = φ3 * np.sum((ψ / φ3) * ς3)
 
 print("limit:")
 for name, value in zip(state_names, limit):
     print(f"  {name:11s} {value:.6f}")
 
 t_vals = np.linspace(0.1, 40, 300)
-errors = np.empty_like(t_vals)
-
-for n, t_val in enumerate(t_vals):
-    approx = np.exp(-rho3 * t_val) * expm(t_val * A3) @ psi
-    errors[n] = np.max(np.abs(approx - limit))
+errors = np.array([
+    np.max(np.abs(np.exp(-ρ3 * t_val) * expm(t_val * A3) @ ψ - limit))
+    for t_val in t_vals
+])
 
 eigenvalues = eig(A3, right=False)
 real_parts = np.sort(eigenvalues.real)[::-1]
@@ -1917,7 +2255,7 @@ $$
 is a local martingale, so the task is to identify the predictable drift of
 $M_t\phi(X_t)$ and read off $\chi$.
 
-(a) Apply Itô's formula to $Y_t = \exp(A_t)\phi(X_t)$ between jumps and
+a. Apply Itô's formula to $Y_t = \exp(A_t)\phi(X_t)$ between jumps and
 show that the continuous part of $dY_t$ has drift
 
 $$
@@ -1933,7 +2271,7 @@ $$
     \right] dt .
 $$
 
-(b) Show that at a jump time $t$ with $X_{t-}=x$ and $X_t=y$,
+b. Show that at a jump time $t$ with $X_{t-}=x$ and $X_t=y$,
 
 $$
     \Delta Y_t = M_{t-}\big[\exp[\kappa(y,x)]\phi(y) - \phi(x)\big] ,
@@ -1950,7 +2288,7 @@ $$
         \eta(dy \mid x)\, dt .
 $$
 
-(c) Decompose
+c. Decompose
 
 $$
     \exp[\kappa(y,x)]\phi(y) - \phi(x)
@@ -1960,7 +2298,7 @@ $$
     + \big[\exp[\kappa(y,x)] - 1\big]\phi(x),
 $$
 
-combine the result with part (a), and read off $\mathbb A \phi$ to recover
+combine the result with part a, and read off $\mathbb A \phi$ to recover
 {eq}`eq:extended-generator`.
 ```
 
@@ -1968,7 +2306,9 @@ combine the result with part (a), and read off $\mathbb A \phi$ to recover
 :class: dropdown
 ```
 
-*(a)* Set $g(a, x) = e^a \phi(x)$, so that $Y_t = g(A_t, X_t)$.
+Here is one solution:
+
+*a.* Set $g(a, x) = e^a \phi(x)$, so that $Y_t = g(A_t, X_t)$.
 
 Between jumps, the continuous parts of $A$ and $X$ are
 
@@ -2018,7 +2358,7 @@ $$
 
 Grouping the gradient terms gives the expression in the question.
 
-*(b)* At a jump time $t$,
+*b.* At a jump time $t$,
 $\Delta A_t = \kappa(X_t, X_{t-}) = \kappa(y, x)$, so
 $M_t = M_{t-}\exp[\kappa(y,x)]$ and
 
@@ -2031,7 +2371,7 @@ $$
 Compensating these jumps against the predictable intensity $\eta(dy \mid x)$
 gives the stated predictable drift.
 
-*(c)* Adding the jump drift from (b) to the continuous drift from (a), the
+*c.* Adding the jump drift from part b to the continuous drift from part a, the
 predictable drift of $Y_t = M_t \phi(X_t)$ at state $x$ is $M_t\, \chi(x)\, dt$
 with
 
diff --git a/lectures/ls_learning.bib b/lectures/ls_learning.bib
deleted file mode 100644
index 671215e99..000000000
--- a/lectures/ls_learning.bib
+++ /dev/null
@@ -1,60 +0,0 @@
-% BibTeX references for ls_learning.md
-% These entries have been added to _static/quant-econ.bib.
-% This file is kept as a local record.
-%
-% References already in quant-econ.bib before this lecture was written:
-%   BrayKreps1987, Bray1982, BraySavin1984, Frydman1982,
-%   Lucas_Prescott_1971, Cagan, Sargent1979, MarcetSargent1989 (JPE version)
-%
-% New entries added to quant-econ.bib for this lecture:
-%   MarcetSargent1989jet  (JET paper -- the main paper of the lecture)
-%   Ljung1977
-%   Evans1985
-%   FourgeaudGourieroux1986
-
-@article{MarcetSargent1989jet,
-  author    = {Marcet, Albert and Sargent, Thomas J.},
-  title     = {Convergence of Least Squares Learning Mechanisms in
-               Self-Referential Linear Stochastic Models},
-  journal   = {Journal of Economic Theory},
-  year      = {1989},
-  volume    = {48},
-  number    = {2},
-  pages     = {337--368},
-  publisher = {Elsevier},
-  doi       = {10.1016/0022-0531(89)90032-X}
-}
-
-@article{Ljung1977,
-  author  = {Ljung, Lennart},
-  title   = {Analysis of Recursive Stochastic Algorithms},
-  journal = {IEEE Transactions on Automatic Control},
-  year    = {1977},
-  volume  = {22},
-  number  = {4},
-  pages   = {551--575},
-  doi     = {10.1109/TAC.1977.1101561}
-}
-
-@article{Evans1985,
-  author  = {Evans, George W.},
-  title   = {Expectational Stability and the Multiple Equilibria Problem
-             in Linear Rational Expectations Models},
-  journal = {Quarterly Journal of Economics},
-  year    = {1985},
-  volume  = {100},
-  number  = {4},
-  pages   = {1217--1233},
-  doi     = {10.2307/1885377}
-}
-
-@article{FourgeaudGourieroux1986,
-  author  = {Fourgeaud, Claude and Gourieroux, Christian and Pradel, Jacqueline},
-  title   = {Learning Procedures and Convergence to Rationality},
-  journal = {Econometrica},
-  year    = {1986},
-  volume  = {54},
-  number  = {4},
-  pages   = {845--868},
-  doi     = {10.2307/1912836}
-}
diff --git a/lectures/rational_learning_re.bib b/lectures/rational_learning_re.bib
deleted file mode 100644
index 0eff25810..000000000
--- a/lectures/rational_learning_re.bib
+++ /dev/null
@@ -1,172 +0,0 @@
-% BibTeX references for rational_learning_re.md
-% References NOT already in quant-econ.bib
-
-@incollection{BrayKreps1987,
-  author    = {Bray, Margaret M. and Kreps, David M.},
-  title     = {Rational Learning and Rational Expectations},
-  booktitle = {Arrow and the Ascent of Modern Economic Theory},
-  editor    = {Feiwel, George R.},
-  publisher = {New York University Press},
-  address   = {New York},
-  year      = {1987},
-  pages     = {597--625},
-  note      = {Chapter 19}
-}
-
-@article{Bray1982,
-  author  = {Bray, Margaret M.},
-  title   = {Learning, Estimation, and the Stability of Rational Expectations},
-  journal = {Journal of Economic Theory},
-  year    = {1982},
-  volume  = {26},
-  number  = {2},
-  pages   = {318--339},
-  doi     = {10.1016/0022-0531(82)90028-X}
-}
-
-@article{BraySavin1984,
-  author  = {Bray, Margaret M. and Savin, N. E.},
-  title   = {Rational Expectations Equilibria, Learning and Model Specification},
-  journal = {Econometrica},
-  year    = {1986},
-  volume  = {54},
-  number  = {5},
-  pages   = {1129--1160},
-  doi     = {10.2307/1912325}
-}
-
-@article{Radner1979,
-  author  = {Radner, Roy},
-  title   = {Rational Expectations Equilibrium: Generic Existence and the Information Revealed by Prices},
-  journal = {Econometrica},
-  year    = {1979},
-  volume  = {47},
-  number  = {3},
-  pages   = {655--678},
-  doi     = {10.2307/1910414}
-}
-
-@article{Jordan1982,
-  author  = {Jordan, James S.},
-  title   = {The Generic Existence of Rational Expectations Equilibrium in the Higher Dimensional Case},
-  journal = {Journal of Economic Theory},
-  year    = {1982},
-  volume  = {26},
-  number  = {2},
-  pages   = {224--243},
-  doi     = {10.1016/0022-0531(82)90021-7}
-}
-
-@article{Jordan1982b,
-  author  = {Jordan, James S.},
-  title   = {Admissible Market Data Structures: A Complete Characterization},
-  journal = {Journal of Economic Theory},
-  year    = {1982},
-  volume  = {28},
-  number  = {1},
-  pages   = {19--31},
-  doi     = {10.1016/0022-0531(82)90089-8}
-}
-
-@article{Admati1985,
-  author  = {Admati, Anat R.},
-  title   = {A Noisy Rational Expectations Equilibrium for Multi-Asset Securities Markets},
-  journal = {Econometrica},
-  year    = {1985},
-  volume  = {53},
-  number  = {3},
-  pages   = {629--658},
-  doi     = {10.2307/1911659}
-}
-
-@article{GrossmanStiglitz1980,
-  author  = {Grossman, Sanford J. and Stiglitz, Joseph E.},
-  title   = {On the Impossibility of Informationally Efficient Markets},
-  journal = {American Economic Review},
-  year    = {1980},
-  volume  = {70},
-  number  = {3},
-  pages   = {393--408}
-}
-
-@article{GrossmanSonnenschein1982,
-  author  = {Grossman, Sanford J. and Sonnenschein, Hugo},
-  title   = {Notes on Expectations Equilibria in Bayesian Settings},
-  journal = {Working Paper},
-  year    = {1982}
-}
-
-@article{BlumeEasley1982,
-  author  = {Blume, Lawrence E. and Easley, David},
-  title   = {Learning to be Rational},
-  journal = {Journal of Economic Theory},
-  year    = {1982},
-  volume  = {26},
-  number  = {2},
-  pages   = {340--351},
-  doi     = {10.1016/0022-0531(82)90022-9}
-}
-
-@article{Frydman1982,
-  author  = {Frydman, Roman},
-  title   = {Towards an Understanding of Market Processes: Individual Expectations, Learning, and Convergence to Rational Expectations Equilibrium},
-  journal = {American Economic Review},
-  year    = {1982},
-  volume  = {72},
-  number  = {4},
-  pages   = {652--668}
-}
-
-@article{Lewis1981,
-  author  = {Lewis, Karen K.},
-  title   = {An Introduction to the Theory of Rational Expectations under Asymmetric Information},
-  journal = {Review of Economic Studies},
-  year    = {1981},
-  volume  = {48},
-  number  = {4},
-  pages   = {541--560},
-  doi     = {10.2307/2297169}
-}
-
-@article{Townsend1983,
-  author  = {Townsend, Robert M.},
-  title   = {Forecasting the Forecasts of Others},
-  journal = {Journal of Political Economy},
-  year    = {1983},
-  volume  = {91},
-  number  = {4},
-  pages   = {546--588},
-  doi     = {10.1086/261170}
-}
-
-@article{MarcetSargent1989,
-  author    = {Marcet, Albert and Sargent, Thomas J.},
-  title     = {Convergence of Least Squares Learning Mechanisms in Self-Referential Linear Stochastic Models},
-  journal   = {Journal of Economic Theory},
-  year      = {1989},
-  volume    = {48},
-  number    = {2},
-  pages     = {337--368},
-  publisher = {Elsevier},
-  doi       = {10.1016/0022-0531(89)90032-X}
-}
-
-@article{ArrowGreen1973,
-  author  = {Arrow, Kenneth J. and Green, Jerry R.},
-  title   = {Notes on Expectations Equilibria in Bayesian Settings},
-  journal = {Working Paper in Economics},
-  year    = {1973},
-  number  = {33},
-  institution = {Institute for Mathematical Studies in the Social Sciences, Stanford University}
-}
-
-@article{Kobayashi1977,
-  author  = {Kobayashi, Tetsuya},
-  title   = {A Note on Fulfilled Expectations Equilibria},
-  journal = {Journal of Economic Theory},
-  year    = {1977},
-  volume  = {14},
-  number  = {1},
-  pages   = {32--43},
-  doi     = {10.1016/0022-0531(77)90098-0}
-}

From 6dc438005425ecc6f6e293ace42d195e288a4067 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Thu, 28 May 2026 22:56:41 +1000
Subject: [PATCH 08/25] updates

---
 lectures/long_run_risk_operator.md | 1541 ++++++++++++++++++++--------
 lectures/ls_learning.md            |  745 ++++++++------
 lectures/rational_learning_re.md   |  720 +++++++------
 3 files changed, 1926 insertions(+), 1080 deletions(-)

diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index 9152dd0ee..ef4e48c2c 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -28,82 +28,101 @@ kernelspec:
 
 ## Overview
 
-This lecture studies the operator approach to long-run risk developed by
-{cite:t}`HansenScheinkman2009`.
+How should we value a cash flow that pays off thirty years from now?
 
-Local continuous-time asset pricing tells us how expected returns compensate
-investors for instantaneous exposure to Brownian and jump shocks.
+Standard short-horizon asset pricing tells us how investors are compensated
+for tiny, instantaneous exposures to shocks --- the *short end* of the term
+structure of risk prices.
 
-Driving the time interval to zero gives a clean limiting object, but it
-describes only the *short end* of the term structure of risk prices.
+But many of the most interesting asset-pricing questions --- the equity
+premium puzzle, the slope of the yield curve, the prices of long-dated
+options --- depend on what happens at the *long end* instead.
 
-This lecture instead studies the *long end*: what happens as the time between
-valuation and payoff grows large.
+This lecture studies the long end using the operator approach of
+{cite:t}`HansenScheinkman2009`.
 
-The two ends are complementary --- together they pin down the slope of the term
-structure of risk prices, and economic restrictions are often more reliable
-over long horizons than over instantaneous ones.
+The key observation is that a stochastic discount factor or a return is
+multiplicative across time, so its expectation defines a *semigroup* of
+valuation operators indexed by horizon $t$.
 
-The mathematical vehicle is a family of valuation operators indexed by horizon
-$t$, which form a *semigroup*.
+Long-horizon behaviour of the semigroup is controlled by a single eigenvalue
+problem on the state space.
 
-The central object is a positive multiplicative functional $\{M_t\}_{t \geq 0}$,
-such as a stochastic discount factor, a cumulated return, a stochastic growth
-functional, or a product of discounting and growth.
+When we solve that eigenvalue problem and pick the right eigenfunction, the
+multiplicative functional $M_t$ factors into three economically meaningful
+pieces: a deterministic exponential trend, a martingale that changes
+probability measure, and a transient state-dependent component.
 
-When the right principal eigenfunction is selected, $M$ admits the
-factorization
+We will derive this factorization carefully, but here is the headline result
+to keep in mind:
 
 $$
     M_t
     =
-    \exp(\rho t) \hat M_t
-    \frac{\phi(X_0)}{\phi(X_t)} ,
+    \exp(\rho t)\, \hat M_t\,
+    \frac{\phi(X_0)}{\phi(X_t)},
 $$ (eq:hs-factorization)
 
 where
 
-* $\rho$ is a principal eigenvalue,
-* $\phi$ is a strictly positive principal eigenfunction,
-* $\hat M$ is a martingale used to change probability measure, and
-* $\phi(X_0)/\phi(X_t)$ is a transient state-dependent component.
+* $\rho$ is a scalar (the **principal eigenvalue**) giving the asymptotic
+  growth or decay rate,
+* $\phi$ is a strictly positive **principal eigenfunction** on the state
+  space, capturing the long-run dependence on the current state,
+* $\hat M$ is a positive martingale used to change probability measure (the
+  **martingale component**), and
+* $\phi(X_0)/\phi(X_t)$ is a transient component that washes out as the
+  twisted process settles into its stationary distribution.
+
+In finite-state problems this is exactly the Perron-Frobenius decomposition of
+a positive matrix; in general state spaces it is a continuous-state analogue.
 
-The qualifier "right" matters because general state-space Markov models can
-have more than one positive eigenfunction.
+We will refer to {eq}`eq:hs-factorization` as the **multiplicative
+factorization** associated with $(\rho,\phi,\hat M)$.
 
-The economically useful eigenfunction is the one for which $\hat M$ is a true
-martingale and the Markov process remains stable under the probability measure
-twisted by $\hat M$.
+{cite:t}`AlvarezJermann2005` introduced a related permanent-transitory
+decomposition for stochastic discount factors. 
 
-```{prf:definition} Multiplicative Factorization
-:label: lrr-def-multiplicative-factorization
+The operator approach links
+that decomposition to an explicit eigenvalue problem.
 
-A representation of the form {eq}`eq:hs-factorization` is called the
-**multiplicative factorization** associated with $(\rho,\phi,\hat M)$.
+```{seealso}
+This lecture is closely related to the advanced lecture
+{doc}`advanced:additive_functionals`, which studies the same kind of
+permanent-transitory decomposition for additive and multiplicative
+functionals in a discrete-time linear-Gaussian setting.
+
+Reading the two together is a good way to see the same long-run risk ideas
+in both continuous and discrete time.
 ```
 
-It generalizes the Perron-Frobenius decomposition of a positive matrix to
-continuous-time Markov valuation problems.
+We will build up to {eq}`eq:hs-factorization` and use it to compute
+long-run risk prices in concrete models.
+
+The plan is:
 
-For long horizons, the scalar $\rho$ controls the exponential growth or decay
-rate of the relevant valuation semigroup, while $\phi$ controls the limiting
-dependence on the current Markov state.
+1. Set up positive multiplicative functionals $M$ (discount factors, returns,
+   stochastic growth) and the valuation semigroups they generate.
 
-{cite:t}`AlvarezJermann2005` use a related permanent-transitory decomposition
-for stochastic discount factors.
+2. Introduce the **generator** of a semigroup --- the local operator whose
+   eigenvalue problem controls long-run behaviour.
 
-The key insight is that the decomposition can be constructed from principal
-eigenfunctions and used to characterize long-run risk-return trade-offs.
+3. Find the principal eigenfunction $\phi$ and derive the factorization.
 
-This lecture covers
+4. Work two examples in detail: a finite-state Markov chain (where everything
+   reduces to Perron-Frobenius theory) and an affine diffusion model (where
+   the eigenfunction is exponential-affine and we get closed-form formulas).
 
-* multiplicative functionals and valuation semigroups,
-* the pricing restriction that links stochastic discount factors to returns,
-* the extended generator associated with a multiplicative functional,
-* principal eigenfunctions and the Hansen-Scheinkman factorization,
-* a finite-state example where the analysis reduces to Perron-Frobenius theory,
-* an affine diffusion example, and
-* long-run risk prices for persistent growth shocks.
+5. Use the factorization to compute long-run risk prices and compare them to
+   the local risk prices that would be reported by short-horizon asset
+   pricing.
+
+A recurring theme will be that local and long-run risk prices can differ
+sharply when shocks move persistent state variables.
+
+That is the key
+mechanism that makes long-run risk models like {cite:t}`Bansal_Yaron_2004`
+generate large equity premia.
 
 We start with the following imports
 
@@ -115,32 +134,57 @@ from scipy.linalg import eig, expm
 
 ## Multiplicative functionals
 
+### Setting
+
 Let $\{X_t : t \geq 0\}$ be a continuous-time Markov process with state space
-$\mathcal D_0$.
+$\mathcal D_0$, and let $\mathcal F_t$ denote the filtration generated by its
+history.
 
-Let $\mathcal F_t$ be the filtration generated by the history of $X$.
+We will work with a strong Markov process whose sample paths are càdlàg
+(defined below).
 
-We work with a strong Markov process whose sample paths are right-continuous
-with left limits.
+For the explicit formulas later we will specialize to a semimartingale,
+which decomposes into a continuous component $X^c$ and a pure-jump
+component $X^j$:
 
-For the jump-diffusion formulas below, $X$ is a semimartingale with a
-continuous component
+$$
+    X_t = X_t^c + X_t^j .
+$$
+
+We write the continuous-component dynamics as
+
+$$
+    dX_t^c = \xi(X_{t-})\, dt + \Gamma(X_{t-})\, dB_t,
+$$
+
+and the pure-jump component as
 
 $$
-    dX_t^c = \xi(X_{t-})dt + \Gamma(X_{t-})dB_t
+    dX_t^j = \int y\, \zeta(dy, dt),
 $$
 
-The jump component has compensator $\eta(dy \mid X_{t-})dt$.
+where $\zeta$ is the random counting measure of jumps and
+$\eta(dy \mid X_{t-})\, dt$ is its compensator --- the rate at which $X$
+jumps from $X_{t-}$ to a region $dy$.
 
-We assume finite jump activity on finite time intervals to keep the notation
-simple.
+We also impose two simplifying assumptions:
 
-We also assume enough rank in $\Gamma$ that the Brownian shocks relevant for
-pricing can be recovered from the state history.
+* **Finite jumps** on finite time intervals: only finitely many jumps
+  occur on any bounded interval, which keeps integrals against the jump
+  measure well-defined and finite.
+* **Sufficient rank in $\Gamma$** so that the Brownian shocks relevant for
+  pricing can be recovered from the state history --- this is what makes the
+  Markov state $X$ "rich enough" to be a sufficient statistic for valuation.
 
-These assumptions are not cosmetic: they let us write down the extended
-generator explicitly and apply changes of probability measure using
-martingales.
+These two assumptions let us write the generator
+in closed form and use martingale-based changes of measure freely.
+
+### Functionals and càdlàg paths
+
+We need a name for "any process that records something about the history of
+$X$".
+
+This includes, for example, a stochastic discount factor or a cumulated return.
 
 ```{prf:definition} Functional
 :label: lrr-def-functional
@@ -150,33 +194,40 @@ constructed from the history of $X$, so that $M_t$ is
 $\mathcal F_t$-measurable for each $t$.
 ```
 
-We assume that functionals have versions with right-continuous sample
-paths and left limits, the **càdlàg** property.
+We will always work with the **càdlàg** version of a functional --- the
+French acronym for "right-continuous with left limits".
 
-Concretely, for almost every $\omega$, the path $t \mapsto M_t(\omega)$ satisfies
+Concretely, for almost every sample path $\omega$,
 
 $$
     \lim_{s \downarrow t} M_s(\omega) = M_t(\omega)
-    \quad \text{for all } t \geq 0,
+    \qquad \text{for all } t \geq 0,
 $$
 
-and the left limit
+and the left limit $M_{t-}(\omega) := \lim_{s \uparrow t} M_s(\omega)$ exists
+and is finite for all $t > 0$.
 
-$$
-    M_{t-}(\omega) := \lim_{s \uparrow t} M_s(\omega)
-$$
+In words: paths may jump, but each jump $\Delta M_t := M_t - M_{t-}$ resolves
+instantaneously.
+
+At the jump time $t$, the value is the post-jump value,
+not the pre-jump value.
 
-exists and is finite for all $t > 0$.
+```{note}
+Why is the càdlàg property worth insisting on?
 
-Paths may jump, but each jump is resolved at the jump time:
-$M_t = M_{t-} + \Delta M_t$ with $\Delta M_t := M_t - M_{t-}$.
+Because we will later want to (i) integrate functionals against time, (ii)
+apply optional stopping arguments, and (iii) take limits like
+$\lim_{t \to \infty} t^{-1}\log M_t$.
+
+All three operations need the joint measurability in $(\omega,t)$ that
+càdlàg paths give us automatically.
+```
 
-The word *version* means we are free to replace $M_t$ by any process
-$\tilde M_t$ with $\mathbb P(M_t = \tilde M_t) = 1$ for each $t$.
+### Multiplicativity
 
-Càdlàg paths give the joint measurability in $(\omega, t)$ that we need to
-integrate functionals against time, apply optional stopping, and pass to
-limits such as $\lim_{t \to \infty} t^{-1} \log M_t$ that appear later.
+The central restriction we impose on the functional $M$ is that it is
+*multiplicative across time*.
 
 ```{prf:definition} Multiplicative Functional
 :label: lrr-def-multiplicative-functional
@@ -184,40 +235,44 @@ limits such as $\lim_{t \to \infty} t^{-1} \log M_t$ that appear later.
 A functional $\{M_t : t \geq 0\}$ is **multiplicative** if $M_0 = 1$ and
 
 $$
-    M_{t+u} = M_u(\theta_t) M_t ,
+    M_{t+u} = M_u(\theta_t)\, M_t ,
     \qquad t, u \geq 0,
 $$ (eq:multiplicative)
 
 where $\theta_t$ shifts the underlying Markov path forward by $t$ units.
 ```
 
-The pricing origin of {eq}`eq:multiplicative` is the law of one price.
+Why is this the natural condition?
 
-If $S_t$ is a stochastic discount factor, the date-$0$ value of a date-$t$
-payoff $\Pi_t$ is $E[S_t\Pi_t \mid \mathcal F_0]$.
+Think of $M_t = S_t$, a stochastic discount factor.
 
-If the same payoff is purchased at an intermediate date $\tau$, its date-$\tau$
-price is
+The date-$0$ value of a date-$t$ payoff $\Pi_t$ is $E[S_t\Pi_t \mid \mathcal F_0]$.
+
+If we instead buy this payoff at intermediate date $\tau$, its date-$\tau$
+price must be
 
 $$
-    E\left[\frac{S_t}{S_\tau}\Pi_t \mid \mathcal F_\tau\right].
+    E\left[\frac{S_t}{S_\tau}\Pi_t \,\Big|\, \mathcal F_\tau\right].
 $$
 
-For prices to be Markov in the current state, the ratio $S_t/S_\tau$ must
-depend only on the Markov path after $\tau$.
-Thus, in Markov form,
+For the price to depend only on the current Markov state $X_\tau$ (and not on
+the entire history up to $\tau$), the ratio $S_t/S_\tau$ must be a function
+only of the Markov path *after* $\tau$ --- that is,
+$S_{\tau+u}/S_\tau = S_u(\theta_\tau)$, which is exactly
+{eq}`eq:multiplicative`.
 
-$$
-    \frac{S_{\tau+u}}{S_\tau} = S_u(\theta_\tau),
-$$
+So multiplicativity is the Markov form of the law of one price.
 
-This identity is precisely multiplicativity.
+The same structural property is then carried over to stochastic growth and
+cumulated returns.
 
-The same structure is then used for stochastic growth and cumulated returns.
+### Additive functionals
 
-When $M_t > 0$, we can write $M_t = \exp(A_t)$.
+It is often easier to work with the *log* of a strictly positive
+multiplicative functional.
 
-The logarithm $A$ then satisfies the following additive property.
+If $M_t > 0$, we can write $M_t = \exp(A_t)$, and the multiplicative property
+{eq}`eq:multiplicative` becomes the corresponding additive property of $A$.
 
 ```{prf:definition} Additive Functional
 :label: lrr-def-additive-functional
@@ -229,73 +284,69 @@ $$
 $$
 ```
 
-Exponentials of additive functionals are strictly positive multiplicative functionals.
+So exponentials of additive functionals are exactly the strictly positive
+multiplicative functionals.
 
-In a jump-diffusion setting, a useful parameterization is
+In our jump-diffusion setting, a useful parameterization is
 
 $$
-\begin{aligned}
     A_t
-    &=
-    \int_0^t \beta(X_s) ds
+    =
+    \int_0^t \beta(X_s)\, ds
     + \int_0^t \gamma(X_{s-})^\top dB_s
-    + \sum_{0 \leq s \leq t} \kappa(X_s, X_{s-}) .
-\end{aligned}
+    + \sum_{0 \leq s \leq t} \kappa(X_s, X_{s-}) ,
 $$ (eq:additive-functional)
 
-The functions $(\beta, \gamma, \kappa)$ are the drift, diffusion coefficient,
-and jump amplitudes.
+where the three functions $(\beta, \gamma, \kappa)$ play the following
+roles:
 
-In this notation, $\beta$ is allowed to be positive or negative.
+* $\beta(x)$ is a **state-dependent drift** rate (e.g., a pure discount
+  factor with short rate $r(x)$ has $\beta(x) = -r(x)$).
+* $\gamma(x)$ is a **Brownian loading**, controlling how Brownian shocks feed
+  into $A$ at state $x$.
+* $\kappa(y,x)$ is a **jump amplitude** that fires whenever $X$ jumps from
+  $x$ to $y$.
 
-For instance, a pure discount factor with short rate $r(X_t)$ has
-$\beta(x) = -r(x)$.
-
-We impose the integrability conditions needed for these objects to be well
-defined:
+For everything to be well defined, we impose the natural integrability
+conditions
 
 $$
-    \int_0^t |\beta(X_s)|ds < \infty,
+    \int_0^t |\beta(X_s)|\, ds < \infty,
     \qquad
-    \int_0^t \|\gamma(X_s)\|^2ds < \infty,
+    \int_0^t \|\gamma(X_s)\|^2\, ds < \infty,
 $$
 
-These conditions hold for finite $t$, and we also impose $\kappa(x,x)=0$ and
+together with $\kappa(x,x)=0$ (no jump if the state doesn't change) and
 
 $$
-    \int \exp[\kappa(y,x)]\eta(dy \mid x) < \infty.
+    \int \exp[\kappa(y,x)]\, \eta(dy \mid x) < \infty.
 $$
 
-This parameterization is broad enough for the examples in this lecture, but it
-is not exhaustive.
-
-Occupation times and local times are also additive functionals.
+This parameterization is rich enough for everything we do in this lecture,
+though additive functionals can also include things like occupation times.
 
 ## Semigroups
 
-A multiplicative functional $M$ together with the Markov process $X$
-defines, for each horizon $t$, the valuation operator
+A multiplicative functional $M$ together with the Markov process $X$ defines
+a **valuation operator** for each horizon $t$:
 
 $$
     \mathbb M_t \psi(x) = E\left[M_t \psi(X_t) \mid X_0 = x\right] .
 $$
 
-These operators inherit a clean composition rule from the multiplicative
-property of $M$, which makes the family $\{\mathbb M_t\}_{t \geq 0}$ a
-*semigroup*.
+You should read $\mathbb M_t \psi(x)$ as "the date-$0$ value, starting from
+state $x$, of a date-$t$ payoff $\psi(X_t)$", weighted by $M_t$.
+
+The family of operators $\{\mathbb M_t\}_{t \geq 0}$ has a key compositional
+structure --- the *semigroup property*.
 
 ```{prf:definition} One-Parameter Semigroup
 :label: lrr-def-one-parameter-semigroup
 
 A family of linear operators $\{T_t : t \geq 0\}$ is a **one-parameter
 semigroup** if $T_0=I$ and $T_{t+s}=T_tT_s$ for all $s,t \geq 0$.
-```
 
-```{prf:definition} Positive Semigroup
-:label: lrr-def-positive-semigroup
-
-A semigroup $\{T_t : t \geq 0\}$ is **positive** if $T_t\psi \geq 0$
-whenever $\psi \geq 0$ and $t \geq 0$.
+It is **positive** if $T_t\psi \geq 0$ whenever $\psi \geq 0$.
 ```
 
 ```{prf:definition} Multiplicative Semigroup
@@ -309,46 +360,37 @@ $$
     =
     E\left[M_t \psi(X_t) \mid X_0 = x\right].
 $$ (eq:m-semigroup)
-
-These operators form a semigroup:
-
-$$
-    \mathbb M_0 = I,
-    \qquad
-    \mathbb M_{t+u} = \mathbb M_t \mathbb M_u .
-$$
 ```
 
-The semigroup property follows from iterated expectations and the
-multiplicative property of $M$.
+The semigroup identity $\mathbb M_{t+u} = \mathbb M_t \mathbb M_u$ follows
+from iterated expectations and the multiplicative property of $M$.
 
-Economically, it is the Markov law of iterated values: the date-$0$ price of
-a date-$(t+u)$ payoff equals the date-$0$ price of holding the date-$t$
-price of that payoff.
+Economically, this is the *Markov law of iterated values*: to value a
+date-$(t+u)$ payoff today, we can either
 
-Path-level multiplicativity is the structural restriction that gives the
-operator semigroup property.
+* discount it directly from $t+u$ back to $0$ in one step (apply
+  $\mathbb M_{t+u}$), or
+* first discount it from $t+u$ back to $t$ (apply $\mathbb M_u$), then
+  discount the resulting date-$t$ price back to $0$ (apply $\mathbb M_t$).
 
-Conversely, in a Markov pricing model where operators are represented by
-stochastic discount factor ratios, the semigroup property is the operator
-shadow of this same intertemporal consistency restriction.
+The semigroup identity says these two procedures give the same answer.
 
-The path-level statement contains more information than the operator identity
-alone.
+This is the operator-level version of the intertemporal consistency that
+rules out arbitrage across horizons.
 
-### Functionals we will use
+Four positive multiplicative functionals will appear throughout.
 
-We work with four positive multiplicative functionals throughout the lecture.
+| Symbol | Object | Semigroup |
+|:---|:---|:---:|
+| $S$ | stochastic discount factor | $\{\mathbb S_t\}$ |
+| $V$ | cumulated return on an asset | $\{\mathbb V_t\}$ |
+| $G$ | stochastic growth in cash flows | $\{\mathbb G_t\}$ |
+| $Q=GS$ | valuation of growing cash flows | $\{\mathbb Q_t\}$ |
 
-| Object | Multiplicative functional | Semigroup |
-|---|---:|---:|
-| stochastic discount factor | $S$ | $\{\mathbb S_t\}$ |
-| cumulated return | $V$ | $\{\mathbb V_t\}$ |
-| stochastic growth | $G$ | $\{\mathbb G_t\}$ |
-| valuation with stochastic growth | $Q = GS$ | $\{\mathbb Q_t\}$ |
+The first three are primitives. 
 
-The first three are primitives; the last one combines them to value cash
-flows that both grow and require discounting.
+The fourth combines discounting and growth to
+value cash flows that grow stochastically over time.
 
 ```{prf:definition} Stochastic Discount Factor
 :label: lrr-def-stochastic-discount-factor
@@ -380,7 +422,7 @@ date-$0$ level and $\psi$ is a Borel state-payoff function.
 Discounting with $S$ and growing with $G$, its date-$0$ value is
 
 $$
-    D_0 \mathbb Q_t \psi(X_0),
+    D_0\, \mathbb Q_t \psi(X_0),
     \qquad
     \mathbb Q_t \psi(x)
     =
@@ -391,58 +433,61 @@ $$
 :label: lrr-def-cash-flow-valuation-semigroup
 
 The **cash-flow valuation semigroup** is the multiplicative semigroup
-generated by $Q=GS$, where $G$ is stochastic growth and $S$ is the stochastic
-discount factor.
+generated by $Q=GS$.
 ```
 
 The long-horizon behaviour of $\mathbb Q_t$ is the central object of the
 lecture: it tells us how current prices value cash-flow growth risk that
 materializes far in the future.
 
-The split $D_t=D_0G_t\psi(X_t)$ is not unique.
-For any positive function $\varphi$,
+```{note}
+The split $D_t=D_0 G_t \psi(X_t)$ is not unique --- for any positive function
+$\varphi$,
 
 $$
     D_t
     =
     D_0
-    \left[
-        G_t\frac{\varphi(X_t)}{\varphi(X_0)}
-    \right]
-    \left[
-        \frac{\psi(X_t)\varphi(X_0)}{\varphi(X_t)}
-    \right].
+    \left[G_t\frac{\varphi(X_t)}{\varphi(X_0)}\right]
+    \left[\frac{\psi(X_t)\varphi(X_0)}{\varphi(X_t)}\right],
 $$
 
-Thus a transient state-dependent component can be moved between $G$ and
-$\psi$.
+so a transient state-dependent factor can be shuffled between $G$ and $\psi$
+without changing $D_t$.
 
-We therefore normalize growth components so that their permanent part is
-represented by a martingale:
+We resolve this by normalizing the growth component so its permanent part is
+a martingale: $G_t = \exp(\delta t)\hat G_t$, with $\hat G$ a martingale and
+$\delta$ a constant trend.
 
-$$
-    G_t = \exp(\delta t)\hat G_t,
-$$
+The eigenfunction construction below will tell us exactly which martingale to
+pick.
+```
 
-Here $\hat G$ is a martingale and $\delta$ is a constant conditional growth
-rate.
+### Local pricing restriction
 
-The eigenfunction construction below explains how such martingale components
-can be extracted and which one is relevant for long-run valuation.
+Before tackling long horizons, it is worth knowing what valuation looks
+like at the *short* end.
 
-### Local pricing restriction
+At the short end, valuation is governed by the standard instantaneous risk-return relation.
 
-Before studying long horizons, it is useful to record the short-horizon
-risk-return relation.
+This will give us a benchmark to compare long-run risk prices against later.
 
-Let the stochastic discount factor $S$ be parameterized by
-$(\beta^s,\gamma^s,\kappa^s)$ and a valuation functional $V$ by
-$(\beta^v,\gamma^v,\kappa^v)$.
+For a textbook discrete-time treatment of the same SDF-based asset-pricing
+ideas, see {doc}`advanced:asset_pricing_lph`; for an estimation perspective
+on Euler-equation-based asset pricing, see {doc}`hansen_singleton_1982`.
 
-The definition of a valuation functional requires $VS$ to be a martingale.
+The key starting point is that a valuation functional $V$ must satisfy the
+no-arbitrage requirement that $VS$ is a martingale (Definition
+{prf:ref}`lrr-def-valuation-functional`).
 
-For a positive multiplicative functional parameterized by
-$(\beta,\gamma,\kappa)$, the local martingale restriction is
+We write the stochastic discount factor $S$ and valuation functional $V$
+as exponentials of additive functionals with coefficients
+$(\beta^s,\gamma^s,\kappa^s)$ and $(\beta^v,\gamma^v,\kappa^v)$
+respectively, in the notation of {eq}`eq:additive-functional`.
+
+For a generic positive multiplicative functional with parameters
+$(\beta,\gamma,\kappa)$, applying Itô's formula and zeroing out the drift
+gives the **local martingale restriction**
 
 $$
     \beta
@@ -451,7 +496,12 @@ $$
     = 0.
 $$ (eq:local-martingale-restriction)
 
-Applying this to $VS$ gives the local pricing restriction
+(The drift coefficient $\beta$, the Itô correction $\gamma^\top \gamma/2$,
+and the jump compensator sum to zero.)
+
+Applying this to $VS$ --- whose parameters add: $(\beta^v + \beta^s,
+\gamma^v + \gamma^s, \kappa^v + \kappa^s)$ --- gives the **local pricing
+restriction**
 
 $$
     \beta^v+\beta^s
@@ -465,7 +515,11 @@ $$
         \eta(dy \mid \cdot).
 $$ (eq:local-pricing-restriction)
 
-The expected net rate of return on $V$ is
+This determines the drift $\beta^v$ of any candidate valuation functional in
+terms of its Brownian and jump exposures.
+
+To turn this into an *expected return*, note that the expected net rate of
+return on $V$ is
 
 $$
     \epsilon^v
@@ -474,10 +528,10 @@ $$
     + \frac{\|\gamma^v\|^2}{2}
     + \int
         \left(\exp[\kappa^v(y,\cdot)]-1\right)
-        \eta(dy \mid \cdot).
+        \eta(dy \mid \cdot) .
 $$
 
-Combining this expression with {eq}`eq:local-pricing-restriction` gives
+Combining with {eq}`eq:local-pricing-restriction` gives
 
 $$
 \begin{aligned}
@@ -592,9 +646,9 @@ $$
     \quad \text{for small } h > 0.
 $$
 
-The operator $\mathbb A$ is *local* in the sense that $\mathbb A\psi(x)$
-depends only on what happens in an infinitesimal neighbourhood of $x$, not on
-a path integral over $[0,t]$.
+The operator $\mathbb A$ is local in time and conditional on the current state,
+with the jump term integrating over possible post-jump states rather than over
+a realized path on $[0,t]$.
 
 If $\mathbb A\phi = \rho \phi$, then
 
@@ -659,17 +713,20 @@ state.
 
 ### A closed form for jump diffusions
 
-Suppose the Markov state satisfies
+For the jump-diffusion setting introduced above, we can compute $\mathbb A$
+explicitly by applying Itô's formula to $M_t\phi(X_t)$.
+
+Suppose the continuous part of $X$ satisfies
 
 $$
     dX_t^c = \xi(X_t)dt + \Gamma(X_t)dB_t
 $$
 
-between jumps, let $\Sigma = \Gamma \Gamma^\top$, and let
+with diffusion matrix $\Sigma = \Gamma \Gamma^\top$, and let
 $\eta(dy \mid x)$ denote the jump compensator.
 
 If $M=\exp(A)$ is parameterized by $(\beta,\gamma,\kappa)$ as in
-{eq}`eq:additive-functional`, then, for smooth $\phi$,
+{eq}`eq:additive-functional`, then for smooth $\phi$,
 
 $$
 \begin{aligned}
@@ -700,18 +757,37 @@ $$
 \end{aligned}
 $$ (eq:extended-generator)
 
+The four terms have transparent interpretations:
+
+1. The first term is the standard Markov drift, modified by $\Gamma\gamma$
+   --- a *covariance correction* between the Brownian shocks driving $X$ and
+   those driving $M$.
+2. The second is the standard diffusion (Itô) term.
+3. The third integrates $\phi$ against the jump-compensated transition rates,
+   reweighted by the jump multiplier $\exp[\kappa(y,x)]$.
+4. The fourth is a multiplicative *yield-like* term --- it multiplies $\phi(x)$
+   itself and combines the drift of $M$, the Brownian Itô correction, and the
+   compensated jumps.
+
 ```{note}
-When $M=S$ is a stochastic discount factor, the extra terms multiplying
-$\phi(x)$ encode local prices of Brownian and jump risk.
+When $M=S$ is a stochastic discount factor, the term multiplying $\phi(x)$
+in the fourth line encodes local prices of Brownian and jump risk --- the
+short end of the term structure we will revisit later.
+
+Derivation of {eq}`eq:extended-generator` is the content of Exercise
+{ref}`lrr_ex4`.
 ```
 
 We will apply this formula directly in the affine-diffusion example below.
 
 ## Principal eigenfunctions
 
-With the local operator $\mathbb A$ in hand, the long-run question becomes:
-which positive payoffs grow at a constant proportional rate under the
-valuation semigroup?
+We now arrive at the central technical question of the lecture:
+
+> Which positive payoffs grow at a constant proportional rate under the
+> valuation semigroup?
+
+The answer, when it exists, is a positive eigenfunction of the generator.
 
 ```{prf:definition} Eigenfunction of the Extended Generator
 :label: lrr-def-generator-eigenfunction
@@ -722,39 +798,42 @@ eigenvalue $\rho$ if
 $$
     \mathbb A \phi = \rho \phi .
 $$ (eq:generator-eigen)
-```
-
-```{prf:definition} Principal Eigenfunction
-:label: lrr-def-principal-eigenfunction
 
 A **principal eigenfunction** is an eigenfunction $\phi$ that is strictly
-positive on the state space, i.e. $\phi(x) > 0$ for all $x \in \mathcal D_0$.
+positive on the state space: $\phi(x) > 0$ for all $x \in \mathcal D_0$.
 ```
 
-To see why this expression is the natural object built from the eigenpair,
-recall the discrete-time picture from the generator section.
+The strict-positivity requirement matters because $\phi$ will appear in
+denominators throughout: it has to be safe to divide by it.
+
+### From eigenfunction to factorization
+
+Why does an eigenfunction of $\mathbb A$ give us the multiplicative
+factorization {eq}`eq:hs-factorization`?
+
+The cleanest way to see it is again through the discrete-time analogy.
 
-There, if $K\phi = \lambda\phi$, then the process
+If $K\phi = \lambda\phi$ in discrete time, then
 
 $$
-    \lambda^{-n} M_n \frac{\phi(X_n)}{\phi(X_0)}
+    \lambda^{-n}\, M_n\, \frac{\phi(X_n)}{\phi(X_0)}
 $$
 
-is a martingale: $K\phi = \lambda\phi$ exactly cancels the one-step drift of
-$M_n\phi(X_n)$ after we divide by $\lambda^n$.
+is a martingale --- the eigenvalue equation exactly cancels the one-step
+drift of $M_n\phi(X_n)$ once we divide by $\lambda^n$.
 
-In continuous time, $\lambda^n$ is replaced by $\exp(\rho t)$, and the
-analogous candidate martingale is
+In continuous time, $\lambda^n$ is replaced by $\exp(\rho t)$, and our
+candidate martingale becomes
 
 $$
     \hat M_t
     =
-    \exp(-\rho t) M_t
+    \exp(-\rho t)\, M_t\,
     \frac{\phi(X_t)}{\phi(X_0)} .
 $$ (eq:mhat)
 
-The eigenfunction equation $\mathbb A\phi = \rho\phi$ is what we need to make
-this candidate work, just as $K\phi = \lambda\phi$ did in discrete time.
+The eigenfunction equation $\mathbb A\phi = \rho\phi$ is exactly what we
+need to make this candidate work.
 
 To verify, apply the definition of the extended generator to $M_t\phi(X_t)$:
 
@@ -823,23 +902,23 @@ $$ (eq:semigroup-eigen)
 
 ### Stability of the twisted process
 
-The eigenpair $(\rho, \phi)$ controls *long-run* behaviour of $\mathbb M_t$
-only if the twisted process settles into a stationary regime.
+We now have a factorization {eq}`eq:hs-factorization` for *any* principal
+eigenfunction.
 
-We need three conditions on the twisted process, applied in turn:
+But for $(\rho,\phi)$ to actually describe **long-run** behaviour of
+$\mathbb M_t$ --- not just produce a valid algebraic identity --- the twisted
+process must settle into a stationary regime as $t \to \infty$.
 
-* a **stationary distribution** $\hat\varsigma$ that the twisted dynamics
-  leave invariant — the candidate long-run distribution;
-* **irreducibility of a discretely sampled skeleton** of $X$ under
-  $\hat\varsigma$ — every region of positive $\hat\varsigma$-mass can be
-  reached from any starting point;
-* **Harris recurrence** of $X$ under the twisted measure — every such region
-  is visited infinitely often, which guarantees that $\hat\varsigma$ is
-  unique.
+If it doesn't, the transient factor $\phi(X_0)/\phi(X_t)$ will not wash out
+and we cannot read off the asymptotics from $\rho$ alone.
+
+We need three conditions, each ruling out a specific failure mode.
 
 Let $\hat E$ and $\widehat{\Pr}$ denote expectation and probability under the
-twisted measure, and let $\hat{\mathbb A}$ be the generator of $X$ under that
-measure.
+twisted measure, and let $\hat{\mathbb A}$ be the generator of $X$ under
+that measure.
+
+**Condition 1: a stationary distribution exists.**
 
 ```{prf:definition} Stationary Distribution of the Twisted Process
 :label: lrr-def-stationary-distribution
@@ -854,6 +933,12 @@ $$
 for every $\psi$ in the $L^\infty$ domain of $\hat{\mathbb A}$.
 ```
 
+*Why we need it:* $\hat\varsigma$ is the candidate long-run distribution. If
+it doesn't exist, the twisted process has no steady state for $X_t$ to settle
+into, and the long-run limit cannot be expressed as a state-space integral.
+
+**Condition 2: every important region is reachable.**
+
 ```{prf:definition} Irreducible Skeleton
 :label: lrr-def-irreducible-skeleton
 
@@ -871,6 +956,13 @@ $$
 $$
 ```
 
+*Why we need it:* Without it, the long-run distribution could depend on the
+starting state --- different basins of attraction would give different
+limits. The discrete sampling (with spacing $\Delta$) avoids period-2-style
+pathologies that can arise in continuous time.
+
+**Condition 3: every important region is visited infinitely often.**
+
 ```{prf:definition} Harris Recurrence
 :label: lrr-def-harris-recurrence
 
@@ -886,53 +978,91 @@ $$
 $$
 ```
 
+*Why we need it:* Reachability (Condition 2) is not enough --- a region
+might be reachable yet visited only finitely often, so time averages can
+fail to converge to $\hat\varsigma$-averages. Harris recurrence is the
+continuous-state replacement for "recurrent state" in a finite chain.
+
+Bundling these together:
+
 ```{prf:definition} Stochastically Stable Twisted Process
 :label: lrr-def-stochastic-stability
 
 The $\hat M$-twisted Markov process is **stochastically stable** if it has
-a stationary distribution $\hat\varsigma$, the skeleton
-$\{X_{\Delta j}\}$ is irreducible relative to $\hat\varsigma$, and $X$ is
-Harris recurrent under the twisted measure.
+a stationary distribution $\hat\varsigma$, the skeleton $\{X_{\Delta j}\}$
+is irreducible relative to $\hat\varsigma$, and $X$ is Harris recurrent
+under the twisted measure.
 ```
 
+### The long-run approximation
+
 Under the martingale condition for $\hat M$, strict positivity of $M$, and
-the stability conditions above, the long-run approximation is
+stochastic stability, the long-run limit takes a clean form:
 
 $$
     \lim_{t \to \infty}
-    \exp(-\rho t)\mathbb M_t \psi
+    \exp(-\rho t)\, \mathbb M_t \psi
     =
     \phi
-    \int \frac{\psi}{\phi} d\hat\varsigma
+    \int \frac{\psi}{\phi}\, d\hat\varsigma .
 $$ (eq:long-run-limit)
 
-Here $\hat\varsigma$ is the stationary distribution of the twisted Markov
-process.
-
-The mode of convergence depends on the payoff class.
+Read this as follows:
 
-For any fixed sampling interval $\Delta>0$, convergence along
-$t=\Delta j$ holds for almost every initial state when
-$\int |\psi|/\phi\, d\hat\varsigma < \infty$.
+* The factor $\exp(\rho t)$ captures the exponential growth or decay of the
+  semigroup. After we strip it off, what remains has a finite limit.
+* The state dependence in that limit is *entirely* in $\phi(x)$ --- this is
+  the sense in which $\phi$ is the long-run shape of the state dependence.
+* The scalar $\int (\psi/\phi)\, d\hat\varsigma$ is the **long-run intensity**
+  of the payoff $\psi$, weighted by $1/\phi$ and averaged against the
+  twisted stationary distribution.
 
-For all continuous times $t$, the pointwise statement holds when $\psi/\phi$
-is bounded.
+The mode of convergence depends on how nice $\psi$ is:
 
-This is the formal sense in which $\rho$ is the long-run growth rate and
-$\phi$ is the long-run state dependence.
+* **Almost-everywhere along a sampling grid.** For any fixed $\Delta>0$,
+  convergence along $t=\Delta j$ holds for almost every initial state when
+  $\int |\psi|/\phi\, d\hat\varsigma < \infty$.
+* **Pointwise for all continuous $t$.** Stronger but needs $\psi/\phi$
+  bounded.
 
 ```{note}
-Positive eigenfunctions need not be unique in general state spaces.
+Strict positivity of $\phi$ does not by itself guarantee uniqueness in general
+state spaces: there can be more than one positive eigenfunction yielding a
+true martingale $\hat M$.
+
+What stochastic stability buys is *selection*: among all candidate
+eigenpairs, the principal eigenfunction selected by stochastic stability is
+the one whose eigenvalue is smallest, and any other positive eigenfunction
+with that eigenvalue is proportional to $\phi$, $\hat\varsigma$-a.s.
 
-The stability requirements above select the relevant eigenfunction up to
-scale --- they pick out the eigenpair whose twisted process is ergodic, and so
-the one that governs the long-run limit.
+This is the analogue of "the Perron-Frobenius eigenvector is unique up to
+scaling" in finite dimensions.
 ```
 
 ## A finite-state Markov chain
 
-We first study a finite-state chain, where the analysis is exactly
-Perron-Frobenius theory.
+To see the whole framework in action, we start with the simplest possible
+case: a finite-state Markov chain.
+
+For background on finite Markov chains in discrete time, see
+{doc}`finite_markov`; for the asset-pricing applications of finite-state
+chains that motivate the construction here, see {doc}`markov_asset`.
+
+Here, every abstract object collapses to a familiar one:
+
+| Abstract object | Finite-state version |
+|:---|:---|
+| Markov process $X$ | continuous-time chain with intensity matrix $U$ |
+| Generator $\mathbb A$ | a matrix $A$ |
+| Semigroup $\mathbb M_t$ | matrix exponential $\exp(tA)$ |
+| Principal eigenfunction $\phi$ | Perron right eigenvector |
+| Principal eigenvalue $\rho$ | dominant real eigenvalue of $A$ |
+| Stationary distribution $\hat\varsigma$ | left null vector of the twisted generator $\hat A$ |
+
+So the long-run analysis is exactly Perron-Frobenius theory --- nothing more,
+nothing less.
+
+### Setup
 
 Let $X$ take values in $\{x_1,\ldots,x_N\}$ and let $U$ be its intensity
 matrix.
@@ -943,18 +1073,22 @@ matrix.
 An **intensity matrix** $U$ for a finite-state continuous-time Markov chain
 satisfies $u_{ij} \geq 0$ for $i \neq j$ and $\sum_j u_{ij}=0$ for each
 state $i$.
+
+Off-diagonal entries $u_{ij}$ are the jump rates from state $i$ to state
+$j$; the diagonal entry $u_{ii} = -\sum_{j \neq i}u_{ij}$ is minus the exit
+rate from state $i$.
 ```
 
 Let the multiplicative functional have
 
-* a discount or decay rate $r_i$ in state $i$, and
-* a jump multiplier $\exp[\kappa(x_j,x_i)]$ when the state jumps from $i$ to
-  $j$.
+* a **discount or decay rate** $r_i$ in state $i$ (the analogue of the drift
+  $\beta(X_s)$ in the additive parameterization), and
+* a **jump multiplier** $\exp[\kappa(x_j,x_i)]$ that fires whenever the
+  state jumps from $i$ to $j$.
 
-In the code below, `κ[j, i]` means $\kappa(x_j,x_i)$, the log multiplier for
-the transition from state $i$ to state $j$.
+In the code below, `κ[j, i]` means $\kappa(x_j,x_i)$, the log multiplier for the transition from state $i$ to state $j$.
 
-The generator matrix $A$ for the multiplicative semigroup is
+Then the generator matrix $A$ for the multiplicative semigroup is
 
 $$
     a_{ij}
@@ -965,16 +1099,28 @@ $$
     \end{cases}
 $$ (eq:finite-a)
 
-The semigroup is
+The off-diagonal entries are the chain's jump rates *weighted* by the
+jump multipliers; the diagonal entries combine the exit rate with the
+in-state decay rate.
+
+The semigroup is then just $\mathbb M_t = \exp(tA)$.
+
+For an irreducible chain with strictly positive jump multipliers, the
+principal eigenvalue $\rho$ is the unique real eigenvalue of $A$ with
+largest real part, and the associated right eigenvector is strictly
+positive --- this is the Perron-Frobenius theorem.
+
+The twisted generator under the principal eigenpair $(\rho,\phi)$ is
 
 $$
-    \mathbb M_t = \exp(tA).
+    \hat A = D_\phi^{-1} A D_\phi - \rho I,
 $$
 
-For an irreducible chain with strictly positive jump multipliers, the
-principal eigenvalue is the real eigenvalue of $A$ with largest real part.
+where $D_\phi = \operatorname{diag}(\phi)$. The row sums of $\hat A$ vanish,
+so $\hat A$ is itself a valid intensity matrix; the stationary distribution
+$\hat\varsigma$ solves $\hat\varsigma^\top \hat A = 0$.
 
-The associated right eigenvector is strictly positive.
+The helper functions below implement these three calculations.
 
 ```{code-cell} ipython3
 def build_generator(U, r, κ):
@@ -1036,8 +1182,14 @@ def stationary_distribution(Q):
 
 Consider a boom-recession economy.
 
-The boom state switches to recession at rate $\lambda_1$, while recession
-switches to boom at rate $\lambda_2$.
+State 1 is a *boom* (higher short rate $r_1=0.05$, switching to recession at
+rate $\lambda_1 = 0.30$).
+
+State 2 is a *recession* (lower short rate $r_2=0.02$, switching to boom at
+rate $\lambda_2 = 0.50$).
+
+For now we set the log jump multipliers $\kappa$ to zero (so every jump multiplier $\exp[\kappa]$ equals one), and the SDF only changes
+continuously through the in-state decay rates.
 
 ```{code-cell} ipython3
 λ_1 = 0.30
@@ -1059,8 +1211,12 @@ print(f"φ = {φ}")
 print(f"long-run zero-coupon yield = {-ρ:.4f}")
 ```
 
-We can verify the eigenvalue equation
-$\mathbb M_t \phi = \exp(\rho t)\phi$.
+Note that $-\rho$ is the asymptotic yield on a zero-coupon bond: from
+{eq}`eq:hs-factorization`, the date-$0$ price of a long zero-coupon bond
+decays like $\exp(\rho t)$, so its yield is $-\rho$.
+
+Let's verify the semigroup eigenvalue equation
+$\mathbb M_t \phi = \exp(\rho t)\phi$ numerically.
 
 ```{code-cell} ipython3
 for t in [1.0, 5.0, 25.0]:
@@ -1070,8 +1226,14 @@ for t in [1.0, 5.0, 25.0]:
     print(f"t = {t:4.1f}, error = {err:.2e}")
 ```
 
-Next we compute the twisted generator and the stationary distribution
-$\hat\varsigma$ under the twisted probability measure.
+The error should be negligible at every horizon $t$ --- the equation is an exact identity, so it holds to machine precision
+(small errors are floating-point noise from the eigendecomposition).
+
+Next we compute the twisted generator $\hat A$ and the stationary
+distribution $\hat\varsigma$ of the chain under the twisted measure.
+
+This is the candidate long-run distribution that appears in the long-run
+limit {eq}`eq:long-run-limit`.
 
 ```{code-cell} ipython3
 A_hat = twisted_generator(A, ρ, φ)
@@ -1085,13 +1247,23 @@ print(f"  boom      {ς_hat[0]:.4f}")
 print(f"  recession {ς_hat[1]:.4f}")
 ```
 
-For any payoff function $\psi$, the limit in {eq}`eq:long-run-limit` is
+This twisted stationary distribution --- not the original chain's
+stationary distribution --- is what determines long-horizon valuations.
+
+It differs from the original distribution because the eigenfunction $\phi$
+reweights states by how persistently they affect the multiplicative
+functional.
+
+For any payoff function $\psi$, the long-run limit
+{eq}`eq:long-run-limit` is the vector
 
 $$
     \phi
     \sum_i \frac{\psi_i}{\phi_i}\hat\varsigma_i .
 $$
 
+Let's check that the rescaled semigroup converges to this limit as $t$ grows.
+
 ```{code-cell} ipython3
 ψ = np.array([1.0, 2.0])
 limit = φ * np.sum((ψ / φ) * ς_hat)
@@ -1103,16 +1275,20 @@ for t in [1, 5, 20, 80]:
 print("\nlimit =", limit)
 ```
 
+The rescaled values converge to the limiting vector, whose dependence on the
+starting state enters only through $\phi$ --- exactly what {eq}`eq:long-run-limit` predicts.
+
 ### Adding jumps
 
-State transitions in this model are discontinuous, so the multiplicative
-functional should be allowed to jump at the transition times.
+State transitions in this chain are discontinuous, so it is natural to allow
+the multiplicative functional to jump at the transition times --- the
+analogue of the $\kappa$ function in the jump-diffusion parameterization.
 
-A natural case is a stochastic discount factor that pays out more when the
-economy switches into a boom and less when it switches into a recession.
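+As an illustration, let the multiplicative functional jump *up* when the
+economy moves from recession into boom and *down* on the reverse transition (for a marginal-utility-based
+stochastic discount factor one would expect the opposite signs, since marginal utility falls on good news).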
 
-The matrix `κ_jump` below encodes this: the functional jumps up on a
-recession-to-boom transition and down on a boom-to-recession transition.
+The matrix `κ_jump` below encodes this.
 
 ```{code-cell} ipython3
 κ_jump = np.array([[0.0,  0.30],
@@ -1151,22 +1327,27 @@ ax.set_title("Jumps and the Long-Run Growth Rate")
 plt.show()
 ```
 
-Larger upward jumps on the recession-to-boom transition raise $\rho$,
-because they make the functional grow more on transitions out of the
-high-decay state.
+Larger upward jumps on recession-to-boom transitions raise $\rho$ because the
+functional jumps up on those transitions.
 
 ## The affine diffusion example
 
-We now apply the operator approach to a continuous-state model that is
-tractable enough to solve in closed form.
+We now move to a continuous-state model.
+
+We will use a two-factor affine specification that captures the two main
+empirical features of asset returns:
 
-The state has two independent components.
+* **stochastic volatility** --- the dispersion of shocks is itself a state
+  variable, and
+* **predictable growth** --- there is a small, persistent state variable
+  shifting expected growth rates.
 
-The first is a Feller square-root process $X^f$, used to model stochastic
-volatility.
+This is the kind of state process used in long-run risk models like
+{cite:t}`Bansal_Yaron_2004`.
 
-The second is an Ornstein-Uhlenbeck process $X^o$, used to model predictable
-growth.
+We work with two independent state components: a Feller square-root process
+$X^f$ (stochastic volatility) and an Ornstein-Uhlenbeck process $X^o$
+(predictable growth):
 
 $$
 \begin{aligned}
@@ -1182,32 +1363,52 @@ dX_t^o
 \end{aligned}
 $$ (eq:affine-state)
 
-We normalize $\sigma_o > 0$ and $\sigma_f < 0$.
+The parameters $\xi_f, \xi_o>0$ are mean-reversion speeds, $\bar x_f,
+\bar x_o$ are the unconditional means, and $\sigma_f, \sigma_o$ are
+diffusion coefficients.
 
-The sign of $\sigma_f$ is a convention that makes a positive $B^f$ shock
-reduce volatility.
+The OU process $X^o$ is the continuous-time analogue of the AR(1) process
+studied in {doc}`intro:ar1_processes`, and continuous-time linear
+asset-pricing models in the same family are developed in
+{doc}`affine_risk_prices`.
 
-Consider a multiplicative functional $M=\exp(A)$ with
+We follow a sign convention with $\sigma_o>0$ and $\sigma_f<0$: a positive
+$B^f$ shock then *reduces* volatility, in line with the empirical
+"leverage effect."
+
+Now consider a multiplicative functional $M=\exp(A)$ with affine
+parameters:
 
 $$
 \begin{aligned}
 A_t
 &=
 \bar\beta t
-+ \int_0^t \beta_f X_s^f ds
-+ \int_0^t \beta_o X_s^o ds
++ \int_0^t \beta_f X_s^f\, ds
++ \int_0^t \beta_o X_s^o\, ds
 \\
 &\quad
-+ \int_0^t \sqrt{X_s^f}\gamma_f dB_s^f
-+ \int_0^t \gamma_o dB_s^o .
++ \int_0^t \sqrt{X_s^f}\,\gamma_f\, dB_s^f
++ \int_0^t \gamma_o\, dB_s^o .
 \end{aligned}
 $$ (eq:affine-additive)
 
-Because the state dynamics and the drift of $A$ are both affine in $(x^f,
-x^o)$, an exponential-affine eigenfunction closes the eigenvalue problem:
-applying the generator to $\phi(x^f,x^o) = \exp(c_f x^f + c_o x^o)$ produces
-another exponential-affine function, so $\mathbb A\phi = \rho\phi$ reduces to
-algebraic conditions on $(c_f, c_o, \rho)$.
+So the drift of $A$ is affine in the state ($\bar\beta + \beta_f X^f +
+\beta_o X^o$), the Brownian loadings are constant in the $B^o$ direction
+and proportional to $\sqrt{X^f}$ in the $B^f$ direction.
+
+### Why exponential-affine eigenfunctions work
+
+The key observation is a closure property: when the state is affine and
+the drift of $A$ is affine, applying the generator to an
+exponential-affine function $\phi(x^f,x^o) = \exp(c_f x^f + c_o x^o)$
+returns another exponential-affine function.
+
+The eigenvalue equation $\mathbb A\phi = \rho\phi$ then collapses to a
+small number of algebraic equations in $(c_f, c_o, \rho)$.
+
+This is the *continuous-state analogue* of the matrix Perron-Frobenius
+problem: we replace eigenvectors with exponential-affine eigenfunctions.
 
 ```{prf:definition} Exponential-Affine Eigenfunction
 :label: lrr-def-exponential-affine-eigenfunction
@@ -1223,14 +1424,9 @@ $$
 for some constant $c_0 \in \mathbb R$ and vector $c \in \mathbb R^n$.
 ```
 
-Substituting
-
-$$
-    \phi(x^f,x^o) = \exp(c_f x^f + c_o x^o)
-$$
-
-into the generator formula {eq}`eq:extended-generator` and matching
-coefficients of $x^f$, $x^o$, and the constant term gives
+Substituting $\phi(x^f,x^o) = \exp(c_f x^f + c_o x^o)$ into the generator
+formula {eq}`eq:extended-generator` and matching coefficients of $x^f$,
+$x^o$, and the constant term gives
 
 $$
 0
@@ -1262,7 +1458,7 @@ c_f
 }{\sigma_f^2}.
 $$ (eq:cf-roots)
 
-The eigenvalue is
+The eigenvalue is then determined by matching the constant term:
 
 $$
 \rho
@@ -1274,23 +1470,32 @@ $$
 + c_o^2 \frac{\sigma_o^2}{2}.
 $$ (eq:affine-rho)
 
-The relevant root is the one that keeps the twisted $X^f$ process mean
-reverting.
+**Picking the right root.** Equation {eq}`eq:cf-roots` gives two candidate
+values of $c_f$, and we need to know which one is the principal
+eigenfunction.
+
+This is where the stochastic-stability condition does real work.
 
 Under the twisted measure, the drift of $X^f$ is
 
 $$
     \xi_f(\bar x_f - x^f)
-    + x^f\sigma_f(\gamma_f+c_f\sigma_f).
+    + x^f\sigma_f(\gamma_f+c_f\sigma_f),
 $$
 
-Hence the mean-reversion coefficient is
+so the mean-reversion coefficient becomes
 
 $$
-    \xi_f - \sigma_f(\gamma_f+c_f\sigma_f),
+    \xi_f - \sigma_f(\gamma_f+c_f\sigma_f) .
 $$
 
-which must be positive.
+If this is positive, the twisted square-root process stays stationary; if
+it is negative, the twisted process is explosive and the eigenfunction is
+not the long-run-relevant one.
+
+So we **pick the root that keeps the twisted process mean-reverting** ---
+exactly the way stochastic stability selects the principal eigenfunction
+in the abstract theory.
 
 ```{code-cell} ipython3
 def solve_affine_eigenfunction(params):
@@ -1342,47 +1547,183 @@ def solve_affine_eigenfunction(params):
 
 ### A Breeden SDF
 
-{cite:t}`Breeden1979` studies a consumption-based continuous-time asset
-pricing model.
+To make things concrete, we now plug in a specific stochastic discount
+factor: the Breeden CRRA consumption-based SDF.
 
-In the present state specification, suppose log consumption satisfies
+{cite:t}`Breeden1979` derived the continuous-time SDF for an investor with
+time-separable CRRA preferences over a consumption stream.
+
+We specify log consumption with the same affine state $X^o$ driving
+expected growth and $\sqrt{X^f}$ driving volatility:
 
 $$
     dc_t
     =
-    X_t^o dt
-    + \sqrt{X_t^f}\vartheta_f dB_t^f
-    + \vartheta_o dB_t^o .
+    X_t^o\, dt
+    + \sqrt{X_t^f}\,\vartheta_f\, dB_t^f
+    + \vartheta_o\, dB_t^o .
 $$
 
-With time-separable CRRA utility and subjective discount rate $b$, the
-stochastic discount factor is
+With time-separable CRRA utility (risk aversion $a$) and subjective discount
+rate $b$, the stochastic discount factor is
 
 $$
     S_t
     =
-    \exp(-bt-a(c_t-c_0)).
+    \exp\!\big(-bt - a(c_t-c_0)\big),
 $$
 
-Thus it has the affine parameters
+i.e. an exponential of $-b$ times time minus $a$ times log consumption growth.
+
+Reading off the additive functional coefficients gives
 
 $$
     \bar\beta^s = -b,
-    \qquad
+    \quad
     \beta_f^s = 0,
-    \qquad
+    \quad
     \beta_o^s = -a,
-    \qquad
+    \quad
     \gamma_f^s = -a\vartheta_f,
-    \qquad
+    \quad
     \gamma_o^s = -a\vartheta_o .
 $$ (eq:breeden-sdf-params)
 
-The recursive preferences of {cite:t}`Kreps_Porteus1978` and
-{cite:t}`Epstein_Zin1989`, used in long-run risk models such as
-{cite:t}`Bansal_Yaron_2004`, change these parameters by adding
-forward-looking terms --- but the operator calculations below are identical
-once $(\bar\beta,\beta_f,\beta_o,\gamma_f,\gamma_o)$ are specified.
+We will use these parameters in the numerical example below.
+
+### Recursive preferences (optional)
+
+A famous limitation of CRRA preferences is that they conflate risk aversion
+with the elasticity of intertemporal substitution.
+
+Recursive preferences {cite:p}`Kreps_Porteus1978, Epstein_Zin1989` separate
+the two, and add a forward-looking continuation-value term to the
+discount-factor expression.
+
+This is what powers the equity-premium results in
+{cite:t}`Bansal_Yaron_2004`.
+
+A QuantEcon lecture that studies long-run dynamics under recursive
+preferences in a different setting is
+{doc}`survival_recursive_preferences`.
+
+The rest of this section derives the SDF coefficients for the unit-elasticity
+recursive specification. You can skip it on a first read and come back later
+--- the numerical example uses the simpler Breeden parameters above.
+
+For the unit-elasticity recursive specification, conjecture a continuation
+value of the form
+
+$$
+    W_t
+    =
+    \frac{1}{1-a}
+    \exp\left[
+        (1-a)(w_f X_t^f + w_o X_t^o + c_t + \bar w)
+    \right],
+    \qquad a>1 .
+$$ (eq:kp-continuation-value)
+
+Matching the local mean of this continuation value gives
+
+$$
+\begin{aligned}
+0
+&=
+-\xi_f w_f
++ \frac{(1-a)\sigma_f^2}{2}w_f^2
++ (1-a)\vartheta_f\sigma_f w_f
++ \frac{(1-a)\vartheta_f^2}{2}
+- b w_f,
+\\
+0
+&=
+-\xi_o w_o + 1 - b w_o,
+\\
+b\bar w
+&=
+\xi_f \bar x_f w_f
++ \xi_o \bar x_o w_o
++ \frac{(1-a)\sigma_o^2}{2}w_o^2
++ (1-a)\vartheta_o\sigma_o w_o
++ \frac{(1-a)\vartheta_o^2}{2}.
+\end{aligned}
+$$ (eq:kp-continuation-coefficients)
+
+The relevant solution has $w_o=1/(\xi_o+b)$, selects the stable root of the
+quadratic for $w_f$, and then determines $\bar w$ from the constant equation.
+
+The stochastic discount factor is the product $S_t=\exp(A_t^B)\exp(A_t^W)$,
+where the Breeden-like logarithmic component is
+
+$$
+    A_t^B
+    =
+    -bt
+    - \int_0^t X_s^o ds
+    - \int_0^t \sqrt{X_s^f}\vartheta_f dB_s^f
+    - \int_0^t \vartheta_o dB_s^o .
+$$
+
+The continuation-value martingale component is
+
+$$
+\begin{aligned}
+    A_t^W
+    &=
+    (1-a)\int_0^t
+        \sqrt{X_s^f}(\vartheta_f+w_f\sigma_f)dB_s^f
+    + (1-a)\int_0^t
+        (\vartheta_o+w_o\sigma_o)dB_s^o
+\\
+    &\quad
+    - \frac{(1-a)^2}{2}
+        \int_0^t X_s^f(\vartheta_f+w_f\sigma_f)^2ds
+    - \frac{(1-a)^2}{2}
+        (\vartheta_o+w_o\sigma_o)^2t .
+\end{aligned}
+$$ (eq:kp-sdf-components)
+
+Thus the recursive-utility SDF has affine parameters
+
+$$
+\begin{aligned}
+\bar\beta^s
+&=
+-b-\frac{(1-a)^2}{2}(\vartheta_o+w_o\sigma_o)^2,
+\\
+\beta_f^s
+&=
+-\frac{(1-a)^2}{2}(\vartheta_f+w_f\sigma_f)^2,
+\\
+\beta_o^s
+&= -1,
+\\
+\gamma_f^s
+&=
+-a\vartheta_f-(a-1)w_f\sigma_f,
+\\
+\gamma_o^s
+&=
+-a\vartheta_o-(a-1)w_o\sigma_o .
+\end{aligned}
+$$ (eq:kp-sdf-params)
+
+The local Brownian risk prices are therefore
+$\sqrt{x^f}[a\vartheta_f+(a-1)w_f\sigma_f]$ for $B^f$ exposure and
+$a\vartheta_o+(a-1)w_o\sigma_o$ for $B^o$ exposure.
+
+The numerical example below uses the simpler Breeden specification, but the
+same operator calculation applies once the SDF parameters are replaced by
+{eq}`eq:kp-sdf-params`.
+
+### Numerical example
+
+Let's set up parameters and solve for the principal eigenpair.
+
+We use plausible monthly-frequency parameters: a mean-reverting volatility
+factor $X^f$ with mean $0.04$, a slower-moving predictable-growth factor
+$X^o$ with mean $0.02$, risk aversion $a=4$, time discount rate $b=0.03$.
 
 ```{code-cell} ipython3
 params_state = {
@@ -1418,8 +1759,12 @@ print(f"twisted mean-reversion coefficient for Xf = {mr_s:.6f}")
 print(f"long-run zero-coupon yield = {-ρ_s:.4f}")
 ```
 
-The rejected root for $c_f$ would make the twisted volatility process
-explosive rather than stationary.
+The long-run zero-coupon yield $-\rho_s$ represents the asymptotic decay
+rate in the SDF expectation $E[S_t]$.
+
+We can also check that the rejected root for $c_f$ would have produced a
+non-stationary twisted process --- a clear example of stochastic stability
+selecting one of two algebraically valid eigenfunctions.
 
 ```{code-cell} ipython3
 ξ_f = params_sdf["ξ_f"]
@@ -1443,38 +1788,37 @@ for cf in cf_candidates:
 
 ### The martingale component
 
-Having solved for the eigenpair $(\rho,\phi)$, we can now assemble the
-multiplicative factorization {eq}`eq:hs-factorization` explicitly.
+Now that we have the eigenpair $(\rho,\phi)$, we can write down all three
+pieces of the factorization {eq}`eq:hs-factorization` explicitly.
 
-The martingale component $\hat M$ defined in {eq}`eq:mhat` has log
+The martingale component $\hat M_t = \exp(\hat A_t)$ defined in
+{eq}`eq:mhat` has log
 
 $$
 \begin{aligned}
 \hat A_t
 &=
-\int_0^t
-    \sqrt{X_s^f}(\gamma_f+c_f\sigma_f)dB_s^f
-+ \int_0^t
-    (\gamma_o+c_o\sigma_o)dB_s^o
+\int_0^t \sqrt{X_s^f}(\gamma_f+c_f\sigma_f)\, dB_s^f
++ \int_0^t (\gamma_o+c_o\sigma_o)\, dB_s^o
 \\
 &\quad
-- \frac{1}{2}
-  \int_0^t
-    X_s^f(\gamma_f+c_f\sigma_f)^2 ds
-- \frac{1}{2}
-  \int_0^t
-    (\gamma_o+c_o\sigma_o)^2 ds .
+- \frac{1}{2}\int_0^t X_s^f(\gamma_f+c_f\sigma_f)^2\, ds
+- \frac{1}{2}\int_0^t (\gamma_o+c_o\sigma_o)^2\, ds .
 \end{aligned}
 $$
 
-The corresponding drift distortions are
+The first line collects the stochastic (Brownian) integrals; the second is the
+Itô correction that removes the drift of $\exp(\hat A_t)$, making it a local martingale.
+
+Under the twisted measure induced by $\hat M$, the drifts of the state
+variables shift to
 
 $$
 \begin{aligned}
 dX_t^f:
 \quad&
 \xi_f(\bar x_f-X_t^f)
-+ X_t^f\sigma_f(\gamma_f+c_f\sigma_f),
++ X_t^f \sigma_f(\gamma_f+c_f\sigma_f),
 \\
 dX_t^o:
 \quad&
@@ -1483,8 +1827,10 @@ dX_t^o:
 \end{aligned}
 $$
 
-The code below simulates the state and constructs the three factors in
-{eq}`eq:hs-factorization`.
+The drift distortions are exactly the Girsanov shifts induced by the
+Brownian loadings of $\hat M$.
+
+Let's now simulate the state and verify the factorization holds.
 
 ```{code-cell} ipython3
 def brownian_increments(n, dt, seed=1234):
@@ -1587,63 +1933,82 @@ plt.show()
 
 ## Long-run risk prices
 
-The eigenpair $(\rho, \phi)$ from the cash-flow valuation problem also lets
-us define a *long-run* analogue of the instantaneous risk prices used in
-local continuous-time asset pricing.
+We can now use the factorization to compute long-run analogues of the
+instantaneous risk prices that come out of standard continuous-time asset
+pricing.
+
+The economic question is sharp:
+
+> If an investor takes on a small exposure to a shock today, how much extra
+> expected return do they need --- as compensation --- when we measure that
+> compensation as a long-horizon rate rather than as an instantaneous one?
+
+The two answers --- local and long-run --- need not agree.
+
+The reason: a shock that moves a persistent state variable has a small
+*immediate* effect on the cash flow but a large *cumulative* effect on
+future expected growth and discounting.
+
+So the long-run risk price is the local price *plus a persistence
+correction.*
 
-The two prices need not agree: a shock that moves a persistent state variable
-has a small immediate effect on the cash flow but a large cumulative effect
-on future growth and discounting.
+The size of the correction depends on the speed of mean reversion.
 
-Our aim is to compare the two.
+### Defining the prices
 
 ```{prf:definition} Local Brownian Risk Price
 :label: lrr-def-local-brownian-risk-price
 
 The **local Brownian risk price** is the state-dependent vector
-$-\gamma^s(x)$, which prices exposure measured in the same Brownian units as
-the valuation functional loading $\gamma^v(x)$.
+$-\gamma^s(x)$.
+
+A small Brownian exposure $\gamma^v_i$ earns a required expected return of
+$-\gamma^v_i \gamma^s_i$ per unit time, so a unit of exposure $\gamma^v_i$
+is priced at $-\gamma^s_i$.
 ```
 
+The local price is easy: read it directly off the SDF coefficients.
+
 ```{prf:definition} Long-Run Risk Price
 :label: lrr-def-long-run-risk-price
 
-For cash-flow growth risk, the **long-run risk price** is the marginal change
-in the asymptotic required return with respect to the cash-flow growth
-exposure.
-
-$$
-    R_\infty = -\rho+\delta
-$$
+The **long-run risk price** is the marginal change in the long-run required
+return on a cash flow with respect to a small change in its risk exposure.
 
-Equivalently, it is the negative of the marginal change in the principal
-eigenvalue of the $GS$ semigroup, because $\delta$ is held fixed.
+When we work with growing cash flows, the long-run required return is
+$R_\infty = -\rho + \delta$, where $\rho$ is the principal eigenvalue of
+the $GS$ semigroup and $\delta$ is the trend growth rate, held fixed.
 ```
 
-The local Brownian price is read off the SDF directly: for a valuation
-functional with Brownian exposure $\gamma^v$, the Brownian part of the local
-required expected return is $-\gamma^v \cdot \gamma^s$, so a unit of
-$\gamma^v_i$ exposure is priced at $-\gamma^s_i$.
+Computing the long-run price requires solving the principal eigenvalue
+problem --- it captures how a shock propagates through the persistent state
+component.
 
-The long-run price requires solving the principal eigenvalue problem, since
-it depends on how a shock propagates through the persistent state.
+### Two frontiers
 
-There are two related long-run frontiers.
+We will see *two* related ways to vary risk exposure, each leading to a
+slightly different long-run risk price:
 
-For a valuation frontier, set $M=V$: choose the return exposure
-$(\gamma^v,\kappa^v)$, use the local pricing restriction to determine
-$\beta^v$, and compute the principal eigenvalue of the $V$ semigroup.
+1. **Valuation-functional frontier.** Hold the SDF $S$ fixed and vary the
+   asset's Brownian exposures $(\gamma^v_f, \gamma^v_o)$. Use the local
+   pricing restriction to determine the drift $\beta^v$, then compute
+   $\rho^v$ for the $V$-semigroup.
 
-For a cash-flow frontier, set $M=GS$: choose the growth exposure in $G$ and
-compute the principal eigenvalue of the valuation semigroup for growing cash
-flows.
+2. **Cash-flow frontier.** Hold the SDF $S$ fixed and vary the cash-flow's
+   growth exposures $(\gamma^g_f, \gamma^g_o)$. Set $M = GS$ and compute
+   the principal eigenvalue $\rho$ of the cash-flow valuation semigroup.
 
-These frontiers coincide in simple log-normal examples for some shocks, but
-they can differ with stochastic volatility, nonlinear dynamics, or jump risk.
+These two frontiers coincide in simple log-normal examples, but they can
+differ with stochastic volatility, nonlinear dynamics, or jump risk.
+
+We will work out both in the affine model below.
 
 ### Stochastic discount factor decomposition
 
-A useful benchmark is the case $M=S$.
+Before getting into risk prices, a natural benchmark is the case $M=S$:
+applying the factorization {eq}`eq:hs-factorization` directly to the SDF
+itself.
+
 The factorization becomes
 
 $$
@@ -1653,45 +2018,70 @@ $$
     \frac{\phi(X_0)}{\phi(X_t)} .
 $$
 
-This is the permanent-transitory decomposition emphasized by
-{cite:t}`AlvarezJermann2005`, now linked to a principal eigenfunction.
+This is the **permanent-transitory decomposition** of
+{cite:t}`AlvarezJermann2005`, now linked to a concrete eigenfunction
+construction.
+
+The factor $\exp(\rho t)$ is the deterministic trend in the SDF and the
+martingale $\hat M$ is its random *permanent* component; the state-dependent
+ratio is the *transitory* component, which does not affect long-run growth rates.
+
+```{seealso}
+The same spectral decomposition of the pricing operator is the central
+tool in {doc}`ross_recovery`, which uses it to "recover" subjective beliefs
+from observed prices, and in {doc}`misspecified_recovery`, which examines
+what goes wrong when the permanent martingale component is mistakenly
+assumed away.
+```
+
 For a long zero-coupon bond,
 
 $$
-    \exp(-\rho t)E[S_t \mid X_0=x]
+    \exp(-\rho t)\, E[S_t \mid X_0=x]
     \to
     \phi(x)
-    \int \frac{1}{\phi}\, d\hat\varsigma .
+    \int \frac{1}{\phi}\, d\hat\varsigma ,
 $$
 
-Thus prices of very long maturity discount bonds depend on the current state
-primarily through the eigenfunction $\phi$.
+so long-maturity discount bond prices depend on the current state primarily
+through the eigenfunction $\phi$.
 
 ### Comparison in the affine model
 
-In the affine model, the local price of exposure to $B^o$ is
+For the affine specification, we can write closed-form expressions for both
+local and long-run prices of a $B^o$ shock.
+
+The local price is just
 
 $$
-    -\gamma_o^s ,
+    \text{local price of } B^o = -\gamma_o^s.
 $$
 
-while the long-run price is
+The long-run price (which we will derive below using the valuation-functional
+frontier) is
 
 $$
+    \text{long-run price of } B^o
+    =
     -\gamma_o^s
     - \frac{\beta_o^s}{\xi_o}\sigma_o .
 $$ (eq:long-run-price-o)
 
-The second term is the **persistence adjustment**: a shock to $B^o$ moves
-the persistent growth predictor $X^o$, and because $X^o$ mean reverts at rate
-$\xi_o$, the cumulative effect of the shock is larger when $\xi_o$ is
-smaller.
+The extra term $-(\beta_o^s/\xi_o)\sigma_o$ is the **persistence correction**.
+
+It arises because:
+
+* a $B^o$ shock moves the persistent growth predictor $X^o$, and
+* $X^o$ mean reverts at rate $\xi_o$, so the cumulative effect of the shock
+  on future SDF growth scales like $1/\xi_o$.
 
-The local price of $B^f$ exposure is state dependent because the exposure is
-scaled by $\sqrt{X^f_t}$.
+As $\xi_o$ shrinks, persistence grows and the long-run price diverges from
+the local one --- which is the central economic content of long-run risk
+models.
 
-The long-run price of $B^f$ exposure is nonlinear in general because the
-coefficient $c_f$ of the principal eigenfunction solves a quadratic equation.
+The local price of $B^f$ exposure is state dependent (it scales with
+$\sqrt{X^f_t}$), and the long-run price of $B^f$ exposure is nonlinear,
+since $c_f$ is the root of a quadratic.
 
 ```{code-cell} ipython3
 γ_s_o = params_sdf["γ_o"]
@@ -1725,9 +2115,149 @@ ax.legend()
 plt.show()
 ```
 
+### Changing valuation functionals
+
+Now we work out the long-run risk price formula by varying the asset's
+exposure --- the **valuation-functional frontier** introduced above.
+
+We hold the SDF $S$ fixed and pick Brownian exposures
+$(\gamma_f^v,\gamma_o^v)$ for the asset return, parameterizing the
+valuation functional as
+
+$$
+\begin{aligned}
+A_t^v
+&=
+\bar\beta^v t
++ \int_0^t \beta_f^v X_s^f ds
++ \int_0^t \beta_o^v X_s^o ds
+\\
+&\quad
++ \int_0^t \sqrt{X_s^f}\gamma_f^v dB_s^f
++ \int_0^t \gamma_o^v dB_s^o .
+\end{aligned}
+$$
+
+The martingale restriction on $VS$ determines the drift coefficients from the
+chosen Brownian exposures $(\gamma_f^v,\gamma_o^v)$:
+
+$$
+\begin{aligned}
+\bar\beta^v
+&=
+-\bar\beta^s
+- \frac{1}{2}(\gamma_o^s+\gamma_o^v)^2,
+\\
+\beta_f^v
+&=
+-\beta_f^s
+- \frac{1}{2}(\gamma_f^s+\gamma_f^v)^2,
+\\
+\beta_o^v
+&=
+-\beta_o^s .
+\end{aligned}
+$$ (eq:valuation-local-restriction-affine)
+
+Applying the affine eigenvalue formula to $M=V$ gives
+
+$$
+\begin{aligned}
+\rho^v
+&=
+\bar\beta^v
++ \frac{(\gamma_o^v)^2}{2}
++ c_f^v \xi_f \bar x_f
++ c_o^v(\xi_o\bar x_o+\gamma_o^v\sigma_o)
++ (c_o^v)^2\frac{\sigma_o^2}{2}
+\\
+&=
+-\bar\beta^s
+- \frac{(\gamma_o^s)^2}{2}
+- \gamma_o^s\gamma_o^v
++ c_f^v \xi_f \bar x_f
++ c_o^v(\xi_o\bar x_o+\gamma_o^v\sigma_o)
++ (c_o^v)^2\frac{\sigma_o^2}{2},
+\end{aligned}
+$$ (eq:valuation-rho-affine)
+
+where $c_o^v=\beta_o^v/\xi_o=-\beta_o^s/\xi_o$ and $c_f^v$ solves the same
+quadratic equation as in {eq}`eq:cf-eq` with $(\beta_f,\gamma_f)$ replaced by
+$(\beta_f^v,\gamma_f^v)$.
+
+Holding $\gamma_f^v$ fixed, differentiating {eq}`eq:valuation-rho-affine`
+with respect to $\gamma_o^v$ gives the long-run valuation-functional price of
+$B^o$ exposure:
+
+$$
+    \frac{\partial \rho^v}{\partial \gamma_o^v}
+    =
+    -\gamma_o^s
+    + c_o^v\sigma_o
+    =
+    -\gamma_o^s
+    - \frac{\beta_o^s}{\xi_o}\sigma_o .
+$$ (eq:valuation-long-run-price-o)
+
+This matches the formula {eq}`eq:long-run-price-o` we previewed above ---
+the local price plus the persistence correction $-(\beta_o^s/\xi_o)\sigma_o$.
+
+Let's verify the formula numerically by finite-differencing the eigenvalue
+computation.
+
+```{code-cell} ipython3
+def valuation_params_from_exposure(γ_v_o, γ_v_f=0.0):
+    """Affine parameters for a valuation functional V."""
+    p = dict(params_sdf)  # copy: entries not overridden below carry over
+    p.update({
+        "β_bar": (-params_sdf["β_bar"]  # drifts: eq:valuation-local-restriction-affine
+                  - 0.5 * (params_sdf["γ_o"] + γ_v_o) ** 2),
+        "β_f": (-params_sdf["β_f"]
+                - 0.5 * (params_sdf["γ_f"] + γ_v_f) ** 2),
+        "β_o": -params_sdf["β_o"],
+        "γ_f": γ_v_f,  # Brownian exposures are the caller's inputs
+        "γ_o": γ_v_o,
+    })
+    return p
+
+
+def valuation_eigenvalue_for_exposure(γ_v_o, γ_v_f=0.0):
+    """Principal eigenvalue for the valuation functional frontier."""
+    p = valuation_params_from_exposure(γ_v_o, γ_v_f)
+    _, _, ρ, _ = solve_affine_eigenfunction(p)  # only the third output is used
+    return ρ
+
+
+γ_v_o_grid = np.linspace(-0.5, 0.5, 101)  # 101 evenly spaced B^o exposures
+ρ_v_grid = np.array([
+    valuation_eigenvalue_for_exposure(g) for g in γ_v_o_grid  # γ_v_f at 0.0
+])
+
+fig, ax = plt.subplots()
+ax.plot(γ_v_o_grid, ρ_v_grid, lw=2)
+ax.set_xlabel("valuation exposure $\\gamma_o^v$")
+ax.set_ylabel("principal eigenvalue $\\rho^v$")
+ax.set_title("Changing Valuation Functionals")
+plt.show()
+```
+
+```{code-cell} ipython3
+valuation_slope = (  # central difference of ρ^v around γ_v_o = 0
+    valuation_eigenvalue_for_exposure(0.001)
+    - valuation_eigenvalue_for_exposure(-0.001)
+) / 0.002  # step width: 0.001 - (-0.001)
+
+print(f"finite-difference slope = {valuation_slope:.6f}")
+print(f"formula                 = {long_run_price_o:.6f}")  # not assigned in this cell
+```
+
 ### Changing cash-flow risk
 
-Let a cash-flow growth functional be
+The second long-run frontier varies the risk exposure of the cash flow
+rather than that of the return.
+
+We pick a growth functional $G_t = \exp(A_t^g)$ with the affine
+parameterization
 
 $$
 \begin{aligned}
@@ -1746,16 +2276,25 @@ A_t^g
 \end{aligned}
 $$ (eq:growth-functional)
 
-The last line makes $\exp(A_t^g-\delta t)$ a martingale.
-To keep the growth-twisted square-root volatility process stationary, the
-cash-flow exposure to $B^f$ must also satisfy the Feller-type restriction
+The last line makes $\exp(A_t^g-\delta t) = \hat G_t$ a martingale, with
+$\delta$ the constant trend growth rate.
+
+For the cash-flow exposure to $B^f$ we also need the Feller-type restriction
 
 $$
-    2(\xi_f+\sigma_f\gamma_f^g)\bar x_f \geq \sigma_f^2 .
+    2(\xi_f+\sigma_f\gamma_f^g)\bar x_f \geq \sigma_f^2 ,
 $$
 
-This is one example of a general point: changing growth risk can destroy the
-stability conditions needed for a long-run approximation.
+which keeps the growth-twisted square-root volatility process from hitting
+zero --- i.e., it preserves stochastic stability under the growth-twisted
+measure.
+
+```{note}
+This Feller restriction is a concrete instance of a general point we
+flagged earlier: changing growth risk can destroy stability and invalidate
+the long-run approximation, so the choice of $(\gamma_f^g, \gamma_o^g)$
+isn't free.
+```
 
 To price the cash flow $D_t=D_0G_t\psi(X_t)$, use the semigroup generated by
 $M=GS$.
@@ -1830,25 +2369,60 @@ print(f"finite-difference slope = {finite_difference:.6f}")
 print(f"formula                 = {long_run_price_o:.6f}")
 ```
 
+### Limiting holding-period return
+
+The same machinery gives the limiting one-period holding-period return on a
+claim to far-future cash flows.
+
+This is the *gross return* on holding an asset for a single period when its
+cash flow lies far in the future.
+
+For $D_t=D_0\, G_t\, \psi(X_t)$ and $M=GS$, the principal eigenpair
+$(\rho,\phi)$ implies
+
+$$
+    \lim_{t\to\infty}
+    \frac{E[S_t D_t / S_1 \mid \mathcal F_1]}
+         {E[S_t D_t \mid \mathcal F_0]}
+    =
+    \exp(-\rho)\, G_1\, \frac{\phi(X_1)}{\phi(X_0)} .
+$$ (eq:limiting-holding-period-return)
+
+The limit has three factors:
+
+* a **cash-flow growth** component $G_1$,
+* a **discount** component $\exp(-\rho)$ governed by the principal
+  eigenvalue, and
+* a **state-dependent** component $\phi(X_1)/\phi(X_0)$ governed by the
+  eigenfunction.
+
+A striking feature: the transient payoff shape $\psi$ drops out of the
+limiting return, so for a given growth functional $G$ the limiting
+holding-period return is the same for *every* choice of $\psi$.
+
 ## Perron-Frobenius dominance
 
-In a finite-state chain, the long-run limit {eq}`eq:long-run-limit` is the
-Perron-Frobenius theorem in action.
+In the finite-state chain, the long-run limit {eq}`eq:long-run-limit` is
+exactly Perron-Frobenius theory in action.
 
 The positive semigroup generated by $A$ in {eq}`eq:finite-a` has a unique
 dominant real eigenvalue, and contributions from the remaining eigenvalues
-decay at an exponential rate equal to the gap between $\rho$ and the
-next-largest real part.
+decay at an exponential rate equal to the **spectral gap** --- the
+difference between $\rho$ and the next-largest real part.
 
-For general state spaces, the argument is not simply a finite-dimensional
-spectral-gap argument.
+The rate at which $\exp(-\rho t)\mathbb M_t\psi$ converges to its long-run
+limit is exactly this spectral gap.
 
-The martingale component $\hat M$ changes probability measure, and stability
-of the twisted process selects the eigenfunction that actually governs the
-long-run approximation.
+```{note}
+In general state spaces, the same intuition holds but the argument is
+substantially more subtle: the martingale component $\hat M$ changes
+probability measure, and *stability* of the twisted process is what
+selects the eigenfunction governing the long-run approximation. The
+finite-state case is a window onto the general theory.
+```
 
-We illustrate this on a three-state chain and read off the spectral gap
-directly.
+We illustrate the connection on a three-state chain: compute the spectral
+gap directly, then show convergence happens at that rate.
 
 ```{code-cell} ipython3
 state_names = ["expansion", "normal", "contraction"]
@@ -1916,41 +2490,108 @@ controlled by the spectral gap.
 
 The examples above make the eigenfunction calculation look mechanical.
 
-Several things can go wrong in general state spaces.
+For finite-state chains and the affine model, it really is mechanical ---
+Perron-Frobenius theory and closed-form algebra handle every requirement.
+
+But in a general state space, three things can go wrong, and each
+corresponds to one of the assumptions we have been carrying along.
+
+This section walks through what they are and why they matter.
+
+### Issue 1: $\hat M$ might fail to be a true martingale
+
+A positive eigenfunction $\phi$ gives us a candidate martingale $\hat M$
+from {eq}`eq:mhat`, but $\hat M$ is *automatically* only a nonnegative local
+martingale --- hence a supermartingale.
 
-First, a positive eigenfunction only gives a nonnegative local martingale
-$\hat M$.
+A supermartingale is not enough to define a probability measure: we need
+$E[\hat M_t \mid X_0 = x] = 1$ for every $t$, i.e. a *true* martingale.
 
-It must be a true martingale before it can define a probability measure.
+A standard way to verify this is a two-sided **Girsanov construction**:
+write the drift and jump distortion induced by $\hat M$, check that the
+distorted Markov process is well-behaved, and verify that the reverse
+density (the inverse of $\hat M$) is locally integrable.
 
-A useful sufficient condition is a two-sided Girsanov construction in which
-the Brownian drift and jump compensator implied by $\hat M$ define a
-well-behaved distorted Markov process and the reverse density is locally
-integrable.
+### Issue 2: the twisted process might fail to be stable
 
-Second, the twisted Markov process must be stable.
+Even with $\hat M$ a true martingale, the long-run limit
+{eq}`eq:long-run-limit` requires that the twisted process actually settles
+into a steady state.
 
-Stationarity alone is not enough for the long-run limit, so we also use
-irreducibility of a sampled skeleton and Harris recurrence.
+This is where stochastic stability --- our trio of stationary distribution,
+irreducibility of the skeleton, and Harris recurrence --- does real work.
 
-These conditions eliminate spurious positive eigenfunctions.
+The affine example illustrates this concretely: we *rejected* one of the
+two algebraically valid eigenfunctions because it implied an explosive
+twisted square-root process. The math admitted two roots; stochastic
+stability picked the right one.
 
-In the affine example, this is why we reject the root that makes the
-square-root process explosive under the twisted measure.
+### Issue 3: a principal eigenfunction might not exist at all
 
-Third, existence of a principal eigenfunction is not automatic in a general
-state space.
+In a general state space, even *existence* of a strictly positive
+eigenfunction is not automatic.
 
-Useful sufficient conditions use drift or Lyapunov bounds such as
+A standard sufficient condition starts with a **Lyapunov-type drift bound**:
+there is a function $V \geq 1$ on the state space and a constant $a_0$ such
+that
 
 $$
-    \frac{\mathbb A V}{V} \leq a
+    \frac{\mathbb A V}{V} \leq a_0 .
 $$
 
-for a function $V \geq 1$, plus irreducibility of a resolvent operator.
+Roughly: $V$ doesn't grow too fast under the semigroup. With this in hand,
+for any $\alpha > a_0$ define the **resolvent operator**
+
+$$
+    F_\alpha \psi(x)
+    =
+    \int_0^\infty
+    \exp(-\alpha t)\,
+    E\!\left[
+        M_t\, \frac{V(X_t)}{V(x)}\, \psi(X_t)
+        \,\Big|\, X_0=x
+    \right] dt .
+$$ (eq:existence-resolvent)
+
+$F_\alpha$ is the Laplace transform of the semigroup generated by the
+*rescaled* multiplicative functional $M_t V(X_t)/V(X_0)$.
+
+The existence proof then proceeds in three steps:
+
+1. **Irreducibility for the resolvent.** There exists a reference measure
+   $\nu$ such that $F_\alpha\mathbf 1_\Lambda(x) > 0$ for every $x$
+   whenever $\nu(\Lambda) > 0$ --- so the resolvent doesn't "miss" any
+   region of state space.
+
+2. **Nummelin minorization.** Irreducibility yields a lower bound
+   $F_\alpha \psi(x) \geq s(x)\int \psi\, d\nu$ for nonnegative $\psi$, a
+   classical tool from general-state-space Markov-chain theory; here
+   $s \geq 0$ is a function with $\int s\, d\nu > 0$, not a constant.
+
+3. **Eigenfunction extraction.** The minorization, combined with additional
+   boundedness or strengthened drift assumptions, identifies a critical
+   spectral value for $F_\alpha$ and an associated positive
+   eigenfunction. Inverting the resolvent transform produces a positive
+   eigenfunction for the original semigroup.
+
+### Summary of the assumption hierarchy
+
+We can summarize the chain of conditions as:
+
+| Want | Need |
+|:---|:---|
+| A factorization {eq}`eq:hs-factorization` | A positive eigenfunction $\phi$ |
+| $\hat M$ to define a probability measure | $\hat M$ is a true martingale |
+| The long-run limit {eq}`eq:long-run-limit` | Stochastic stability of the twisted process |
+| A unique principal eigenfunction | Stability selects among positive eigenfunctions |
+
+In the finite-state case, all four follow from one Perron-Frobenius
+calculation; in the affine model, they reduce to picking the right root of
+a quadratic. In general, each must be checked separately.
 
-Finite-state Perron-Frobenius theory and the affine closed-form solution are
-special cases where these issues are easy to verify directly.
+The full theory in {cite:t}`HansenScheinkman2009` also delivers stronger
+$L^p$ approximation results and Lyapunov criteria for stochastic stability,
+which we don't reproduce here.
 
 ## Summary
 
diff --git a/lectures/ls_learning.md b/lectures/ls_learning.md
index 90b23cbc5..a6ba0e438 100644
--- a/lectures/ls_learning.md
+++ b/lectures/ls_learning.md
@@ -36,7 +36,7 @@ closely related but distinct question of whether **least squares** learning
 converges to a rational expectations equilibrium in self-referential models.
 
 
-This lecture presents the framework of {cite}`MarcetSargent1989jet` for studying
+This lecture presents the framework of {cite:t}`MarcetSargent1989jet` for studying
 **least squares learning** in a class of **self-referential** linear stochastic models.
 
 A self-referential model is one in which the **actual** law of motion for the
@@ -50,9 +50,9 @@ But if agents start away
 from equilibrium and update their beliefs by running least squares regressions,
 will they converge to the REE?
 
-{cite}`MarcetSargent1989jet` answer this question by exploiting a powerful
+{cite:t}`MarcetSargent1989jet` answer this question by exploiting a powerful
 technique from systems-control engineering: the **differential equation
-approach** of {cite}`Ljung1977`.
+approach** of {cite:t}`Ljung1977`.
 
 The key insight is that the stochastic
 difference equation describing how beliefs evolve can be approximated, in the
@@ -62,12 +62,12 @@ Almost-sure
 convergence of least squares to the REE is then equivalent to **local stability**
 of the REE as a fixed point of that ODE.
 
-The framework unifies and extends earlier work by {cite}`Bray1982` and
-{cite}`BraySavin1984` and connects naturally to the distinction between learning
+The framework unifies and extends earlier work by {cite:t}`Bray1982` and
+{cite:t}`BraySavin1984` and connects naturally to the distinction between learning
 *within* a rational expectations equilibrium (Bayesian updating inside a
 correctly specified model) and learning *about* one (adapting an OLS estimator
 whose data-generating process shifts with beliefs) discussed in
-{cite}`BrayKreps1987`.
+{cite:t}`BrayKreps1987`.
 
 
 
@@ -87,22 +87,22 @@ simulate recursive least squares in a scalar self-referential model, and one
 to solve the associated ODE.
 
 ```{code-cell} ipython3
-def simulate_rls_scalar(T_map, sigma_u, beta0, T_periods=500, N_paths=100,
+def simulate_rls_scalar(T_map, σ_u, β0, T_periods=500, N_paths=100,
                         a_seq=None, seed=0):
     """
     Simulate recursive least squares in a scalar self-referential model.
 
-    The perceived law of motion is:  z1_t = beta_t * z2_{t-1} + u_t
-    The actual law of motion is:     z1_t = T(beta_t) * z2_{t-1} + V * u_t
+    The perceived law of motion is:  z1_t = β_t * z2_{t-1} + u_t
+    The actual law of motion is:     z1_t = T(β_t) * z2_{t-1} + V * u_t
 
     For the scalar examples here z2_t = 1 (constant), so agents learn about
     the mean of a process that depends on their own expectation.
 
     Parameters
     ----------
-    T_map    : callable, the mapping T: beta -> T(beta)
-    sigma_u  : float, std of innovations
-    beta0    : float, initial belief
+    T_map    : callable, the mapping T: β -> T(β)
+    σ_u      : float, std of innovations
+    β0       : float, initial belief
     T_periods: int, simulation length
     N_paths  : int, number of Monte Carlo paths
     a_seq    : None or array of length T_periods (forgetting factors)
@@ -110,49 +110,49 @@ def simulate_rls_scalar(T_map, sigma_u, beta0, T_periods=500, N_paths=100,
 
     Returns
     -------
-    beta_paths : ndarray, shape (N_paths, T_periods)
+    β_paths : ndarray, shape (N_paths, T_periods)
     """
     rng = np.random.default_rng(seed)
     if a_seq is None:
         a_seq = np.ones(T_periods)          # standard OLS
 
-    beta_paths = np.empty((N_paths, T_periods))
+    β_paths = np.empty((N_paths, T_periods))
 
     for i in range(N_paths):
-        beta = beta0
+        β = β0
         R = 1.0          # scalar moment estimate
         prec = 1.0 / R   # use precision for numerical stability
 
         for t in range(T_periods):
-            alpha_t = a_seq[t]
+            α_t = a_seq[t]
             # z2 = 1 (constant regressor), so z2*z2' = 1
             z2 = 1.0
-            u_t = rng.normal(0, sigma_u)
+            u_t = rng.normal(0, σ_u)
 
-            # Actual z1 given current beta
-            z1 = T_map(beta) * z2 + u_t
+            # Actual z1 given current β
+            z1 = T_map(β) * z2 + u_t
 
-            # RLS update (lagged: use previous beta to form z1, then update)
-            R = R + (alpha_t / (t + 1)) * (z2**2 - R / alpha_t)
+            # RLS update (lagged: use previous β to form z1, then update)
+            R = R + (α_t / (t + 1)) * (z2**2 - R / α_t)
             R = max(R, 1e-8)
-            beta = beta + (alpha_t / (t + 1)) / R * z2 * (z1 - beta * z2)
+            β = β + (α_t / (t + 1)) / R * z2 * (z1 - β * z2)
 
-            beta_paths[i, t] = beta
+            β_paths[i, t] = β
 
-    return beta_paths
+    return β_paths
 
 
-def solve_ode(f_ode, beta0, t_span=(0, 80), n_points=1000):
-    """Solve scalar ODE d(beta)/dt = f_ode(beta) from beta0."""
-    sol = solve_ivp(lambda t, y: [f_ode(y[0])], t_span, [beta0],
+def solve_ode(f_ode, β0, t_span=(0, 80), n_points=1000):
+    """Solve scalar ODE d(β)/dt = f_ode(β) from β0."""
+    sol = solve_ivp(lambda t, y: [f_ode(y[0])], t_span, [β0],
                     t_eval=np.linspace(*t_span, n_points), method='RK45',
                     max_step=0.1)
     return sol.t, sol.y[0]
 ```
 
-## The Self-Referential Structure
+## The self-referential structure
 
-### Perceived and Actual Laws of Motion
+### Perceived and actual laws of motion
 
 At each date $t$, agents hold a **perceived law of motion** summarised by a
 parameter matrix $\beta_t$.
@@ -171,7 +171,7 @@ Because agents optimise (or behave) on the basis of this belief, their actions
 feed back into the economy.
 
 The **actual** law of motion for the full state
-vector $z_t = (z_{1t}, z_{1t}^c)'$ is
+vector $z_t = (z_{1t}, z_{1t}^c)^\top$ is
 
 $$
 z_t = \begin{bmatrix} 0 & T(\beta_t) \\ A(\beta_t) & \end{bmatrix}
@@ -187,43 +187,68 @@ to the coefficient that **actually** governs $z_{1t}$ in equilibrium.
 A
 **rational expectations equilibrium** is a fixed point $\beta_f = T(\beta_f)$.
 
-### The Learning Scheme
+### The learning scheme
 
 Agents update $\beta_t$ each period using **recursive least squares** (RLS).
-Define $R_t$ as a running estimate of the second-moment matrix $Ez_{2t}z_{2t}'$.
 
+Define $R_t$ as a running estimate of the second-moment matrix $E z_{2t}z_{2t}^\top$.
 
 Updating equations are
 
 $$
-\beta_t' = \beta_{t-1}' + \frac{\alpha_t}{t} R_{t-1}^{-1}
-           z_{2,t-2} z_{2,t-2}' \bigl[ T(\beta_{t-1})' - \beta_{t-1}' \bigr]
-         + \frac{\alpha_t}{t} z_{2,t-2} u_{t-1}' V(\beta_{t-1})' ,
+\beta_t^\top = \beta_{t-1}^\top + \frac{\alpha_t}{t} R_{t-1}^{-1}
+           z_{2,t-2} z_{2,t-2}^\top \bigl[ T(\beta_{t-1})^\top - \beta_{t-1}^\top \bigr]
+         + \frac{\alpha_t}{t} z_{2,t-2} u_{t-1}^\top V(\beta_{t-1})^\top ,
 $$ (eq:rls_beta)
 
 $$
-R_t = R_{t-1} + \frac{\alpha_t}{t} \bigl[ z_{2,t-1} z_{2,t-1}' - R_{t-1}/\alpha_t \bigr] ,
+R_t = R_{t-1} + \frac{\alpha_t}{t} \bigl[ z_{2,t-1} z_{2,t-1}^\top - R_{t-1}/\alpha_t \bigr] ,
 $$ (eq:rls_R)
 
 where $\{\alpha_t\}$ is a positive, non-decreasing sequence with $\alpha_t \to 1$
-as $t \to \infty$.  When $\alpha_t = 1$ for all $t$, equations
+as $t \to \infty$.
+
+When $\alpha_t = 1$ for all $t$, equations
 {eq}`eq:rls_beta`–{eq}`eq:rls_R` reduce to **ordinary least squares** updated
 recursively.
 
+### Lagged and contemporaneous data
+
+The recursion above is written with **lagged information**, so the estimate
+$\beta_t$ uses observations available through date $t-1$.
+
+Section 3 of {cite:t}`MarcetSargent1989jet` also treats a **contemporaneous-data**
+version in which agents update using $z_{1t}$ and $z_{2,t-1}$ at date $t$.
+
+That timing creates simultaneous determination, because $z_t$ depends on the
+same estimate $\beta_t$ that is being updated from $z_t$.
+
+The extra requirement is that the date-$t$ system have a unique solution
+$(\beta_t, R_t, z_t)$ for each history.
+
+Under that uniqueness condition, Proposition 4 of {cite:t}`MarcetSargent1989jet`
+shows that the same full ODE {eq}`eq:full_ode` and small ODE {eq}`eq:small_ode`
+govern convergence.
+
+Thus the stability criterion below is not an artifact of the one-period lag in
+the displayed learning rule.
+
 ```{note}
-As {cite}`BraySavin1984` and {cite}`BrayKreps1987` emphasise, the RLS algorithm
+As {cite:t}`BraySavin1984` and {cite:t}`BrayKreps1987` emphasise, the RLS algorithm
 cannot be derived from Bayes' rule applied to a correctly specified model, because
 during the learning transition the data-generating process is non-stationary —
-beliefs shift the equilibrium, which shifts the data.  The algorithm is
-"irrational" in the  sense that it acts as if the environment were stationary,
+beliefs shift the equilibrium, which shifts the data.
+
+The algorithm is
+"irrational" in the sense that it acts as if the environment were stationary,
 when it is not.
 ```
 
-## The Governing ODE
+## The governing ODE
 
-### Ljung's Differential-Equation Approach
+### Ljung's differential-equation approach
 
-{cite}`MarcetSargent1989jet` apply Ljung's theorem ({cite}`Ljung1977`) to
+{cite:t}`MarcetSargent1989jet` apply the theorem of {cite:t}`Ljung1977` to
 characterise the almost-sure limiting behaviour of the stochastic system
 {eq}`eq:rls_beta`–{eq}`eq:rls_R`.
 
@@ -234,7 +259,7 @@ $$
 \frac{d\beta}{dt} = T(\beta) - \beta .
 $$ (eq:small_ode)
 
-This is the **small ODE** (equation (6) in {cite}`MarcetSargent1989jet`).
+This is the **small ODE** (equation (6) in {cite:t}`MarcetSargent1989jet`).
 
 Its
 fixed points are exactly the rational expectations equilibria.
@@ -243,25 +268,49 @@ The full ODE system associated with the joint process $(\beta_t, R_t)$ is
 
 $$
 \frac{d}{dt}\begin{bmatrix} \beta \\ R \end{bmatrix}
-= \begin{bmatrix} R^{-1} M_{z_2}(\beta)\,[T(\beta) - \beta]' \\ M_{z_2}(\beta) - R \end{bmatrix} ,
+= \begin{bmatrix} R^{-1} M_{z_2}(\beta)\,[T(\beta) - \beta]^\top \\ M_{z_2}(\beta) - R \end{bmatrix} ,
 $$ (eq:full_ode)
 
-where $M_{z_2}(\beta) = Ez_{2t}z_{2t}'$ evaluated at the stationary distribution
+where $M_{z_2}(\beta) = E z_{2t}z_{2t}^\top$ evaluated at the stationary distribution
 induced by $\beta$.
 
 The fixed point of {eq}`eq:full_ode` is $(\beta_f, R_f)$
 where $R_f = M_{z_2}(\beta_f)$.
 
-### Stability Governs Convergence
+### What the assumptions do
+
+The sufficient conditions in {cite:t}`MarcetSargent1989jet` divide naturally into
+regularity assumptions and boundedness assumptions.
+
+The regularity assumptions require a unique fixed point, smooth maps
+$T, A, B, V$, a nonsingular second-moment matrix at the fixed point,
+a well-behaved gain sequence $\alpha_t/t$, and shocks with enough moments.
+
+The harder assumptions are the boundedness conditions A.6–A.7.
+
+Assumption A.6 requires the regressors and estimates to return to bounded sets
+along a subsequence with probability one.
+
+Assumption A.7 requires either an unrestricted algorithm whose ODE paths stay in
+a compact part of the stationarity region, or a projection facility whose ODE
+paths point back toward the interior of the projection set.
+
+When $z_{2t}$ contains only exogenous ergodic variables, A.6 is usually
+automatic.
+
+When $z_{2t}$ contains endogenous variables, as in the investment example below,
+the boundedness argument is a separate and more delicate part of the proof.
+
+### Stability governs convergence
 
 Let $\mathcal{M}$ be the Jacobian matrix of $T(\beta) - \beta$ evaluated at the
 REE $\beta_f$:
 
 $$
-\mathcal{M} = \frac{d\,\text{col}(T(\beta) - \beta)}{d\,\text{col}(\beta)'}\Bigg|_{\beta=\beta_f} .
+\mathcal{M} = \frac{d\,\text{col}(T(\beta) - \beta)}{d\,\text{col}(\beta)^\top}\Bigg|_{\beta=\beta_f} .
 $$ (eq:jacobian)
 
-**Proposition 3** of {cite}`MarcetSargent1989jet` establishes that the Jacobian of
+**Proposition 3** of {cite:t}`MarcetSargent1989jet` establishes that the Jacobian of
 the full system {eq}`eq:full_ode` at $(\beta_f, R_f)$ has $n_2^2$ repeated
 eigenvalues equal to $-1$ (from the $R$ equation), plus the eigenvalues of
 $\mathcal{M}$ (from the $\beta$ equation).
@@ -269,7 +318,9 @@ $\mathcal{M}$ (from the $\beta$ equation).
 Consequently:
 
 * If all eigenvalues of $\mathcal{M}$ have **strictly negative real parts**, both
-  {eq}`eq:small_ode` and {eq}`eq:full_ode` are locally stable.  Under suitable
+  {eq}`eq:small_ode` and {eq}`eq:full_ode` are locally stable.
+
+  Under suitable
   boundedness conditions, Proposition 1 guarantees $\beta_t \to \beta_f$ **almost
   surely**.
 
@@ -277,21 +328,21 @@ Consequently:
   $P(\beta_t \to \beta_f) = 0$ — convergence is **impossible**.
 
 The stability condition $\text{Re}(\lambda_i(\mathcal{M})) < 0$ for all $i$ is
-what the E-stability literature (see {cite}`Evans1985`) calls **E-stability**: the
+what the E-stability literature (see {cite:t}`Evans1985`) calls **E-stability**: the
 REE is a stable rest point of the "expectational dynamics" $\dot\beta = T(\beta) - \beta$.
 
-### The Projection Facility
+### The projection facility
 
 E-stability is necessary but not quite sufficient for almost-sure convergence.
 
 Ljung's theorem requires the sample path $(\beta_t, R_t)$ to remain in a
 **bounded region** with probability one (assumptions A.6–A.7 of
-{cite}`MarcetSargent1989jet`).
+{cite:t}`MarcetSargent1989jet`).
 
 This boundedness is the job of the **projection
 facility**.
 
-#### What the Projection Facility Does
+#### What the projection facility does
 
 The full learning algorithm augments the plain RLS update with a constraint set
 $D_1 \supset D_2$ in $(\beta, R)$-space.
@@ -319,7 +370,7 @@ The facility can be thought of as forcing agents to **discard observations that
 are inconsistent with their priors** — a form of bounded rationality that is
 necessary for the mathematical argument but innocuous in practice.
 
-#### Why It Is Needed
+#### Why it is needed
 
 Without the projection facility, the stochastic path $(\beta_t, R_t)$ might
 temporarily wander to regions where the system {eq}`eq:actual_lom` is
@@ -329,7 +380,7 @@ Ljung's convergence theorem requires
 the algorithm to revisit a compact set infinitely often; the projection facility
 guarantees this by construction.
 
-Formally, {cite}`MarcetSargent1989jet` require that the ODE trajectories
+Formally, {cite:t}`MarcetSargent1989jet` require that the ODE trajectories
 originating in $D_1$ point **inward** at the boundary $\partial D_1$ — that is,
 the vector field $T(\beta) - \beta$ must point back into $D_1$ everywhere on its
 boundary.
@@ -338,38 +389,46 @@ When this holds (Assumption A.7.2), the projection is **invoked only
 finitely many times** with probability one, and after the last invocation the
 algorithm runs as plain RLS.
 
-Corollary 1 of {cite}`MarcetSargent1989jet`
+Corollary 1 of {cite:t}`MarcetSargent1989jet`
 formalises this: either $\beta_t \to \beta_f$ a.s., or $\beta_t$ clusters on the
 boundary $\partial D_1 \setminus D_2$ — but the latter event has probability zero
 when the ODE trajectories point inward.
 
-#### The Exogenous-Regressor Case (Corollary 2)
+#### The exogenous-regressor case (Corollary 2)
 
 When the regressors $z_{2t}$ are **exogenous** — so that $M_{z_2}(\beta) \equiv M$
 does not depend on $\beta$ — a particularly clean sufficient condition for
-convergence is available (Corollary 2 of {cite}`MarcetSargent1989jet`):
+convergence is available (Corollary 2 of {cite:t}`MarcetSargent1989jet`).
+
+In the notation of the paper, let $H(\beta)$ describe the mean-value slope of
+the small-ODE drift:
 
 $$
-\text{all eigenvalues of } H(\beta) \equiv \frac{d\,\text{col}[T(\beta) - T(\beta_f)]}{d\,\text{col}[\beta - \beta_f]'}
-\text{ have real parts} < 0 \quad \forall\, |\beta - \beta_f| \leq K .
+\operatorname{col}\{[T(\beta)-\beta]-[T(\beta_f)-\beta_f]\}
+=
+H(\beta)\operatorname{col}(\beta-\beta_f).
 $$ (eq:corollary2_cond)
 
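+Corollary 2 requires every eigenvalue of $H(\beta)$ to have a negative real part whenever $|\beta - \beta_f| \leq K$; for the
+scalar linear examples, this reduces to the familiar requirement that the slope of $T(\beta)-\beta$ be negative.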
+
 Under this condition one can take $D_1$ to be a ball of radius $K$ around
 $\beta_f$, and the boundary condition is automatically satisfied.
 
-For all four
-scalar examples in this lecture, $H(\beta) = \mathcal{M}$ is constant, so
-Corollary 2 reduces simply to E-stability.
+For the first four examples below, $T$ is linear and $M_{z_2}$ is independent of
+$\beta$, so Corollary 2 reduces to checking stability of the small ODE.
 
 ```{note}
-In the scalar examples studied here (Bray, Bray–Savin, present-value model), the
-state $z_{2t} = 1$ is a constant regressor, so $M_{z_2} = 1$ is trivially
-exogenous.  For the investment model with endogenous regressors, verifying the
+In the scalar self-referential examples studied here (Bray, Bray–Savin,
+present-value model), the state $z_{2t} = 1$ is a constant regressor, so
+$M_{z_2} = 1$ is trivially exogenous.
+
+For the investment model with endogenous regressors, verifying the
 boundary condition on $D_1$ is much harder and may require numerical solution of
 the ODE on a grid of boundary points.
 ```
 
-#### Simulating the Projection Facility
+#### Simulating the projection facility
 
 The following code demonstrates the projection facility at work.
 
@@ -382,70 +441,76 @@ often the facility is invoked and show that after a finite number of
 interventions, the path converges normally.
 
 ```{code-cell} ipython3
-def simulate_rls_with_projection(T_map, sigma_u, beta0, K_proj,
+---
+mystnb:
+  figure:
+    caption: Projection facility
+    name: fig-projection-facility
+---
+def simulate_rls_with_projection(T_map, σ_u, β0, K_proj,
                                  T_periods=500, N_paths=50, seed=0):
     """
     Simulate RLS with a scalar projection facility.
 
-    The facility keeps beta_t in [-K_proj, K_proj].  Whenever the unconstrained
-    update would push beta outside this interval, beta is retracted to 0
-    (an arbitrary point in D2 = {|beta| <= K_proj/2}).
+    The facility keeps β_t in [-K_proj, K_proj].  Whenever the unconstrained
+    update would push β outside this interval, β is retracted to 0
+    (an arbitrary point in D2 = {|β| <= K_proj/2}).
 
     Returns
     -------
-    beta_paths      : (N_paths, T_periods) array of belief paths
+    β_paths         : (N_paths, T_periods) array of belief paths
     n_projections   : (N_paths,) array counting projection invocations per path
     first_proj_free : (N_paths,) array of first period with no further projections
     """
     rng = np.random.default_rng(seed)
-    beta_paths    = np.empty((N_paths, T_periods))
+    β_paths    = np.empty((N_paths, T_periods))
     n_projections = np.zeros(N_paths, dtype=int)
     last_proj     = np.full(N_paths, -1, dtype=int)
 
     for i in range(N_paths):
-        beta = beta0
+        β = β0
         R    = 1.0
 
         for t in range(T_periods):
-            u_t = rng.normal(0, sigma_u)
-            z1  = T_map(beta) + u_t          # z2 = 1 (constant regressor)
+            u_t = rng.normal(0, σ_u)
+            z1  = T_map(β) + u_t          # z2 = 1 (constant regressor)
 
             # Unconstrained RLS update
             R_new    = R    + (1.0 / (t + 1)) * (1.0 - R)
-            beta_new = beta + (1.0 / (t + 1)) / R_new * (z1 - beta)
+            β_new = β + (1.0 / (t + 1)) / R_new * (z1 - β)
 
             # Projection facility: retract to D2 = {0} if outside D1
-            if abs(beta_new) > K_proj:
-                beta_new = 0.0           # retract to interior of D2
+            if abs(β_new) > K_proj:
+                β_new = 0.0           # retract to interior of D2
                 n_projections[i] += 1
                 last_proj[i] = t
 
-            beta = beta_new
+            β = β_new
             R    = max(R_new, 1e-8)
-            beta_paths[i, t] = beta
+            β_paths[i, t] = β
 
     # First period after which no further projections occur
     first_proj_free = last_proj + 1   # -1 + 1 = 0 if never projected
 
-    return beta_paths, n_projections, first_proj_free
+    return β_paths, n_projections, first_proj_free
 
 
 # Run the simulation
-a_bray_pf, b_bray_pf, sigma_pf = 1.0, 0.6, 1.5
-T_bray_pf  = lambda beta: a_bray_pf + b_bray_pf * beta
-beta_f_pf  = a_bray_pf / (1 - b_bray_pf)
-beta0_far  = 8.0    # well outside D1 = {|beta| < 5}
+a_bray_pf, b_bray_pf, σ_pf = 1.0, 0.6, 1.5
+T_bray_pf  = lambda β: a_bray_pf + b_bray_pf * β
+β_f_pf  = a_bray_pf / (1 - b_bray_pf)
+β0_far  = 8.0    # well outside D1 = {|β| < 5}
 K_pf       = 5.0
 T_pf_sim   = 600
 N_pf_sim   = 80
 
 paths_pf, n_proj, first_free = simulate_rls_with_projection(
-    T_bray_pf, sigma_pf, beta0_far, K_pf,
+    T_bray_pf, σ_pf, β0_far, K_pf,
     T_periods=T_pf_sim, N_paths=N_pf_sim)
 
 # Also run without projection for comparison
 paths_no_pf = simulate_rls_scalar(
-    T_bray_pf, sigma_pf, beta0_far,
+    T_bray_pf, σ_pf, β0_far,
     T_periods=T_pf_sim, N_paths=N_pf_sim, seed=0)
 
 fig = plt.figure(figsize=(15, 10))
@@ -454,42 +519,36 @@ gs  = GridSpec(2, 2, figure=fig)
 # Top left: paths with projection
 ax1 = fig.add_subplot(gs[0, 0])
 for i in range(min(30, N_pf_sim)):
-    ax1.plot(paths_pf[i], color='steelblue', alpha=0.25, lw=0.8)
+    ax1.plot(paths_pf[i], color='steelblue', alpha=0.25, lw=2)
 ax1.plot(np.mean(paths_pf, axis=0), color='navy', lw=2, label='average')
-ax1.axhline(beta_f_pf, color='red', ls='--', lw=1.5,
-            label=f'$\\beta_f={beta_f_pf:.1f}$')
-ax1.axhline( K_pf, color='gray', ls=':', lw=1.2, label=f'$D_1$ boundary ($K={K_pf}$)')
-ax1.axhline(-K_pf, color='gray', ls=':', lw=1.2)
-ax1.set_title('With Projection Facility ($\\beta_0=8$, $K=5$)')
+ax1.axhline(β_f_pf, color='red', ls='--', lw=2,
+            label=f'$\\beta_f={β_f_pf:.1f}$')
+ax1.axhline( K_pf, color='gray', ls=':', lw=2, label=f'$D_1$ boundary ($K={K_pf}$)')
+ax1.axhline(-K_pf, color='gray', ls=':', lw=2)
 ax1.set_xlabel('$t$'); ax1.set_ylabel('$\\beta_t$'); ax1.legend(fontsize=8)
 
 # Top right: paths without projection
 ax2 = fig.add_subplot(gs[0, 1])
 for i in range(min(30, N_pf_sim)):
-    ax2.plot(paths_no_pf[i], color='darkorange', alpha=0.25, lw=0.8)
+    ax2.plot(paths_no_pf[i], color='darkorange', alpha=0.25, lw=2)
 ax2.plot(np.mean(paths_no_pf, axis=0), color='saddlebrown', lw=2, label='average')
-ax2.axhline(beta_f_pf, color='red', ls='--', lw=1.5,
-            label=f'$\\beta_f={beta_f_pf:.1f}$')
-ax2.set_title('Without Projection Facility ($\\beta_0=8$)')
+ax2.axhline(β_f_pf, color='red', ls='--', lw=2,
+            label=f'$\\beta_f={β_f_pf:.1f}$')
 ax2.set_xlabel('$t$'); ax2.set_ylabel('$\\beta_t$'); ax2.legend(fontsize=8)
 
 # Bottom left: histogram of projection counts
 ax3 = fig.add_subplot(gs[1, 0])
 ax3.hist(n_proj, bins=range(0, int(n_proj.max()) + 2),
          color='steelblue', edgecolor='white', alpha=0.8)
-ax3.set_xlabel('Number of projections invoked')
-ax3.set_ylabel('Number of paths')
-ax3.set_title('Distribution of Projection Invocations\n'
-              '(finite a.s. — Corollary 1)')
+ax3.set_xlabel('number of projections invoked')
+ax3.set_ylabel('number of paths')
 
 # Bottom right: period of last projection
 ax4 = fig.add_subplot(gs[1, 1])
 ax4.hist(first_free[n_proj > 0], bins=20,
          color='darkorange', edgecolor='white', alpha=0.8)
-ax4.set_xlabel('Last period with a projection')
-ax4.set_ylabel('Number of paths')
-ax4.set_title('After the Last Projection, RLS Runs Freely\n'
-              '(projection invoked only finitely many times)')
+ax4.set_xlabel('last period with a projection')
+ax4.set_ylabel('number of paths')
 
 plt.tight_layout()
 plt.show()
@@ -502,20 +561,41 @@ print(f"Mean last-projection period:         {first_free[n_proj>0].mean():.1f}")
 
 The simulation illustrates the key theoretical point from Corollary 1: the
 projection is invoked only a **finite number of times** on almost every sample
-path.  After the last invocation the algorithm runs as unconstrained RLS and
-converges to $\beta_f$ at the usual rate.  The projection does not bias the
+path.
+
+After the last invocation the algorithm runs as unconstrained RLS and
+converges to $\beta_f$ at the usual rate.
+
+The projection does not bias the
 asymptotic estimate — it merely provides the boundedness guarantee that Ljung's
 theorem requires.
 
-## Four Illustrative Examples
+## Five illustrative examples
 
-We now work through four examples from Section 4 of {cite}`MarcetSargent1989jet`,
+We now work through five examples from Section 4 of {cite:t}`MarcetSargent1989jet`,
 computing the ODE, finding the REE, checking E-stability, and simulating the RLS
 learning path.
 
-### Example 1: Bray's Cobweb Model
+### Example 1: Ordinary linear stochastic difference equations
+
+The first example in Section 4 has no self-referential component.
+
+Let the actual law of motion be fixed, with $T(\beta)=\Gamma$ for a stable
+matrix $\Gamma$ and with $V(\beta)=I$.
+
+The REE is $\beta_f=\Gamma$.
+
+Since $T$ is constant, $H(\beta)=-I$ and the small ODE is globally stable.
+
+Corollary 2 then implies that recursive least squares converges almost surely
+to the true law of motion.
 
-{cite}`Bray1982` studied a simple cobweb economy in which the equilibrium price
+This benchmark shows that the Marcet–Sargent machinery nests ordinary strong
+consistency of least squares for stable linear stochastic difference equations.
+
+### Example 2: Bray's cobweb model
+
+{cite:t}`Bray1982` studied a simple cobweb economy in which the equilibrium price
 satisfies
 
 $$
@@ -542,14 +622,16 @@ which has the unique fixed point $\beta_f = a/(1-b)$.
 
 Its Jacobian is
 $\mathcal{M} = b - 1 < 0$ when $|b| < 1$, so the REE is E-stable and RLS
-converges almost surely.  When $b > 1$, $\mathcal{M} > 0$ and convergence fails.
+converges almost surely.
 
-### Example 2: Bray–Savin Supply-Shifter Model
+When $b > 1$, $\mathcal{M} > 0$ and convergence fails.
 
-{cite}`BraySavin1984` studied a model where
+### Example 3: Bray–Savin supply-shifter model
+
+{cite:t}`BraySavin1984` studied a model where
 
 $$
-p_t = x_t'(m + a\beta_{t-1}) + \tilde{u}_t , \quad p_t^e = x_t'\beta_{t-1} ,
+p_t = x_t^\top(m + a\beta_{t-1}) + \tilde{u}_t , \quad p_t^e = x_t^\top\beta_{t-1} ,
 $$ (eq:bs_price)
 
 with $x_t$ an exogenous supply-shifter, $a$ a scalar feedback parameter, and
@@ -563,7 +645,7 @@ $$ (eq:bs_ode)
 
 with Jacobian $\mathcal{M} = a - 1 < 0$ iff $a < 1$.
 
-### Example 3: Hyperinflation / Asset Prices (Fourgeaud–Gourieroux–Pradel)
+### Example 4: Hyperinflation / asset prices (Fourgeaud–Gourieroux–Pradel)
 
 Consider the present-value asset pricing model
 
@@ -588,7 +670,7 @@ $$ (eq:pv_ode)
 with Jacobian $\mathcal{M} = \lambda\rho - 1 < 0$ for $|\lambda\rho| < 1$, so
 convergence is guaranteed.
 
-### Example 4: Investment under Uncertainty (Self-Referential with Endogenous Regressors)
+### Example 5: Investment under uncertainty (self-referential with endogenous regressors)
 
 In Sargent's version of the Lucas–Prescott investment model, agents learn about the
 aggregate capital stock $K_t$ by regressing on $(K_{t-1}, w_{t-1})$ where $w_t$
@@ -619,30 +701,36 @@ $$ (eq:inv_ode)
 and E-stability can be verified analytically for $|\beta_1| < b^{-1/2}$ (where
 $b$ is the discount factor).
 
-## Simulating the Learning Dynamics
+## Simulating the learning dynamics
 
-We now simulate all four examples numerically, plotting both the ODE solution
-(continuous-time approximation) and the sample paths of $\beta_t$ under RLS.
+We now simulate the self-referential examples numerically, plotting both the ODE
+solution (continuous-time approximation) and the sample paths of $\beta_t$ under RLS.
 
-### Bray's Model
+### Bray's model
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Bray learning dynamics
+    name: fig-bray-learning-dynamics
+---
 # ------------------------------------------------------------------
-# Bray's cobweb model: T(beta) = a + b*beta,  REE = a/(1-b)
+# Bray's cobweb model: T(β) = a + b*β,  REE = a/(1-b)
 # ------------------------------------------------------------------
-a_bray, b_bray, sigma_bray = 1.0, 0.6, 1.0
-T_bray = lambda beta: a_bray + b_bray * beta
-beta_f_bray = a_bray / (1 - b_bray)
+a_bray, b_bray, σ_bray = 1.0, 0.6, 1.0
+T_bray = lambda β: a_bray + b_bray * β
+β_f_bray = a_bray / (1 - b_bray)
 
-beta0_bray = 0.0   # start well below the REE
+β0_bray = 0.0   # start well below the REE
 T_sim = 400
 N_sim = 80
 
-beta_paths_bray = simulate_rls_scalar(T_bray, sigma_bray, beta0_bray,
+β_paths_bray = simulate_rls_scalar(T_bray, σ_bray, β0_bray,
                                       T_periods=T_sim, N_paths=N_sim)
 
 # ODE solution for two starting values
-ode_bray = lambda beta: a_bray + b_bray * beta - beta
+ode_bray = lambda β: a_bray + b_bray * β - β
 t_ode, sol_low  = solve_ode(ode_bray, 0.0)
 _,     sol_high = solve_ode(ode_bray, 4.5)
 
@@ -650,46 +738,50 @@ fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
 ax = axes[0]
 for i in range(min(30, N_sim)):
-    ax.plot(beta_paths_bray[i], color='steelblue', alpha=0.25, lw=0.8)
-ax.plot(np.mean(beta_paths_bray, axis=0), color='navy', lw=2,
+    ax.plot(β_paths_bray[i], color='steelblue', alpha=0.25, lw=2)
+ax.plot(np.mean(β_paths_bray, axis=0), color='navy', lw=2,
         label='cross-path average')
-ax.axhline(beta_f_bray, color='red', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_bray:.2f}$')
+ax.axhline(β_f_bray, color='red', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_bray:.2f}$')
 ax.set_xlabel('$t$')
 ax.set_ylabel('$\\beta_t$')
-ax.set_title("Bray's Model: RLS Paths ($b=0.6$)")
 ax.legend()
 
 ax = axes[1]
 ax.plot(t_ode, sol_low,  color='steelblue', lw=2, label='ODE from $\\beta_0=0$')
 ax.plot(t_ode, sol_high, color='darkorange', lw=2, label='ODE from $\\beta_0=4.5$')
-ax.axhline(beta_f_bray, color='red', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_bray:.2f}$')
+ax.axhline(β_f_bray, color='red', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_bray:.2f}$')
 ax.set_xlabel('$t$')
 ax.set_ylabel('$\\beta(t)$')
-ax.set_title("Bray's Model: ODE Trajectories")
 ax.legend()
 
 plt.tight_layout()
 plt.show()
-print(f"REE: beta_f = a/(1-b) = {beta_f_bray:.4f}")
+print(f"REE: β_f = a/(1-b) = {β_f_bray:.4f}")
 print(f"Jacobian M = b - 1 = {b_bray - 1:.4f}  (< 0: E-stable)")
 ```
 
-### Bray–Savin Model
+### Bray–Savin model
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Bray-Savin learning dynamics
+    name: fig-bray-savin-learning-dynamics
+---
 # ------------------------------------------------------------------
-# Bray–Savin: T(beta) = m + a*beta,  REE = m/(1-a)
+# Bray–Savin: T(β) = m + a*β,  REE = m/(1-a)
 # ------------------------------------------------------------------
-m_bs, a_bs, sigma_bs = 0.5, 0.7, 1.0
-T_bs = lambda beta: m_bs + a_bs * beta
-beta_f_bs = m_bs / (1 - a_bs)
+m_bs, a_bs, σ_bs = 0.5, 0.7, 1.0
+T_bs = lambda β: m_bs + a_bs * β
+β_f_bs = m_bs / (1 - a_bs)
 
-beta_paths_bs = simulate_rls_scalar(T_bs, sigma_bs, 0.0,
+β_paths_bs = simulate_rls_scalar(T_bs, σ_bs, 0.0,
                                     T_periods=T_sim, N_paths=N_sim)
 
-ode_bs = lambda beta: T_bs(beta) - beta
+ode_bs = lambda β: T_bs(β) - β
 t_ode_bs, sol_bs_low  = solve_ode(ode_bs, 0.0)
 _,         sol_bs_high = solve_ode(ode_bs, 4.0)
 
@@ -697,45 +789,49 @@ fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
 ax = axes[0]
 for i in range(min(30, N_sim)):
-    ax.plot(beta_paths_bs[i], color='darkorange', alpha=0.25, lw=0.8)
-ax.plot(np.mean(beta_paths_bs, axis=0), color='saddlebrown', lw=2,
+    ax.plot(β_paths_bs[i], color='darkorange', alpha=0.25, lw=2)
+ax.plot(np.mean(β_paths_bs, axis=0), color='saddlebrown', lw=2,
         label='cross-path average')
-ax.axhline(beta_f_bs, color='red', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_bs:.2f}$')
+ax.axhline(β_f_bs, color='red', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_bs:.2f}$')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
-ax.set_title('Bray–Savin Model: RLS Paths ($a=0.7$)')
 ax.legend()
 
 ax = axes[1]
 ax.plot(t_ode_bs, sol_bs_low,  color='darkorange', lw=2, label='ODE from $\\beta_0=0$')
 ax.plot(t_ode_bs, sol_bs_high, color='steelblue',  lw=2, label='ODE from $\\beta_0=4$')
-ax.axhline(beta_f_bs, color='red', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_bs:.2f}$')
+ax.axhline(β_f_bs, color='red', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_bs:.2f}$')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta(t)$')
-ax.set_title('Bray–Savin Model: ODE Trajectories')
 ax.legend()
 
 plt.tight_layout()
 plt.show()
-print(f"REE: beta_f = m/(1-a) = {beta_f_bs:.4f}")
+print(f"REE: β_f = m/(1-a) = {β_f_bs:.4f}")
 print(f"Jacobian M = a - 1 = {a_bs - 1:.4f}  (< 0: E-stable)")
 ```
 
-### Present-Value / Hyperinflation Model
+### Present-value / hyperinflation model
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Present-value learning dynamics
+    name: fig-present-value-learning-dynamics
+---
 # ------------------------------------------------------------------
-# Present-value model: T(beta) = (lambda*beta + 1)*rho
-# REE = rho / (1 - lambda*rho)
+# Present-value model: T(β) = (λ*β + 1)*ρ
+# REE = ρ / (1 - λ*ρ)
 # ------------------------------------------------------------------
-lam, rho_pv, sigma_pv = 0.8, 0.9, 1.0
-T_pv = lambda beta: (lam * beta + 1) * rho_pv
-beta_f_pv = rho_pv / (1 - lam * rho_pv)
+λ, ρ_pv, σ_pv = 0.8, 0.9, 1.0
+T_pv = lambda β: (λ * β + 1) * ρ_pv
+β_f_pv = ρ_pv / (1 - λ * ρ_pv)
 
-beta_paths_pv = simulate_rls_scalar(T_pv, sigma_pv, 0.0,
+β_paths_pv = simulate_rls_scalar(T_pv, σ_pv, 0.0,
                                     T_periods=T_sim, N_paths=N_sim)
 
-ode_pv = lambda beta: T_pv(beta) - beta
+ode_pv = lambda β: T_pv(β) - β
 t_ode_pv, sol_pv_low  = solve_ode(ode_pv, 0.0, t_span=(0, 50))
 _,         sol_pv_high = solve_ode(ode_pv, 10.0, t_span=(0, 50))
 
@@ -743,74 +839,76 @@ fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
 ax = axes[0]
 for i in range(min(30, N_sim)):
-    ax.plot(beta_paths_pv[i], color='seagreen', alpha=0.25, lw=0.8)
-ax.plot(np.mean(beta_paths_pv, axis=0), color='darkgreen', lw=2,
+    ax.plot(β_paths_pv[i], color='seagreen', alpha=0.25, lw=2)
+ax.plot(np.mean(β_paths_pv, axis=0), color='darkgreen', lw=2,
         label='cross-path average')
-ax.axhline(beta_f_pv, color='red', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_pv:.2f}$')
+ax.axhline(β_f_pv, color='red', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_pv:.2f}$')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
-ax.set_title('Present-Value Model: RLS Paths')
 ax.legend()
 
 ax = axes[1]
 ax.plot(t_ode_pv, sol_pv_low,  color='seagreen',  lw=2, label='ODE from $\\beta_0=0$')
 ax.plot(t_ode_pv, sol_pv_high, color='steelblue', lw=2, label='ODE from $\\beta_0=10$')
-ax.axhline(beta_f_pv, color='red', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_pv:.2f}$')
+ax.axhline(β_f_pv, color='red', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_pv:.2f}$')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta(t)$')
-ax.set_title('Present-Value Model: ODE Trajectories')
 ax.legend()
 
 plt.tight_layout()
 plt.show()
-print(f"REE: beta_f = rho/(1 - lambda*rho) = {beta_f_pv:.4f}")
-print(f"Jacobian M = lambda*rho - 1 = {lam*rho_pv - 1:.4f}  (< 0: E-stable)")
+print(f"REE: β_f = ρ/(1 - lambda*ρ) = {β_f_pv:.4f}")
+print(f"Jacobian M = lambda*ρ - 1 = {λ*ρ_pv - 1:.4f}  (< 0: E-stable)")
 ```
 
-### Instability When E-Stability Fails
+### Instability when E-stability fails
 
 To see what happens when E-stability is violated, we repeat Bray's model with $b > 1$.
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Unstable Bray dynamics
+    name: fig-unstable-bray-dynamics
+---
 # ------------------------------------------------------------------
 # Unstable case: Bray's model with b > 1
 # ------------------------------------------------------------------
 b_unstable = 1.4
-T_unstable = lambda beta: a_bray + b_unstable * beta
-beta_f_unstable = a_bray / (1 - b_unstable)   # negative
+T_unstable = lambda β: a_bray + b_unstable * β
+β_f_unstable = a_bray / (1 - b_unstable)   # negative
 
-beta_paths_unstable = simulate_rls_scalar(
-    T_unstable, sigma_bray, beta0=0.0,
+β_paths_unstable = simulate_rls_scalar(
+    T_unstable, σ_bray, β0=0.0,
     T_periods=200, N_paths=50)
 
-ode_unstable = lambda beta: T_unstable(beta) - beta
+ode_unstable = lambda β: T_unstable(β) - β
 
-# Phase diagram: plot drift for beta in [-5, 5]
-beta_grid = np.linspace(-5, 5, 300)
-drift = np.array([ode_unstable(b) for b in beta_grid])
+# Phase diagram: plot drift for β in [-5, 5]
+β_grid = np.linspace(-5, 5, 300)
+drift = np.array([ode_unstable(b) for b in β_grid])
 
 fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
 ax = axes[0]
 for i in range(min(30, 50)):
-    ax.plot(beta_paths_unstable[i], color='crimson', alpha=0.3, lw=0.8)
-ax.axhline(beta_f_unstable, color='black', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_unstable:.2f}$ (unstable)')
+    ax.plot(β_paths_unstable[i], color='crimson', alpha=0.3, lw=2)
+ax.axhline(β_f_unstable, color='black', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_unstable:.2f}$ (unstable)')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
-ax.set_title('Bray Model with $b=1.4$: RLS Diverges')
 ax.legend()
 
 ax = axes[1]
-ax.plot(beta_grid, drift, color='crimson', lw=2)
-ax.axhline(0, color='black', lw=0.8)
-ax.axvline(beta_f_unstable, color='black', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_unstable:.2f}$')
-ax.fill_between(beta_grid, drift, 0,
+ax.plot(β_grid, drift, color='crimson', lw=2)
+ax.axhline(0, color='black', lw=2)
+ax.axvline(β_f_unstable, color='black', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_unstable:.2f}$')
+ax.fill_between(β_grid, drift, 0,
                 where=(drift > 0), color='crimson', alpha=0.15)
-ax.fill_between(beta_grid, drift, 0,
+ax.fill_between(β_grid, drift, 0,
                 where=(drift < 0), color='steelblue', alpha=0.15)
 ax.set_xlabel('$\\beta$'); ax.set_ylabel('$T(\\beta) - \\beta$')
-ax.set_title('Phase Diagram: Drift Points Away from REE')
 ax.legend()
 
 plt.tight_layout()
@@ -818,9 +916,11 @@ plt.show()
 print(f"Jacobian M = b - 1 = {b_unstable - 1:.2f}  (> 0: NOT E-stable)")
 ```
 
-## Phase Diagrams and E-Stability
+## Phase diagrams and E-stability
+
+The E-stability condition has a clean geometric interpretation.
 
-The E-stability condition has a clean geometric interpretation.  At the REE
+At the REE
 $\beta_f$, the small ODE {eq}`eq:small_ode` must have trajectories **pointing
 inward**.
 
@@ -830,78 +930,87 @@ The figure below plots the phase diagrams for all three scalar examples side by
 side.
 
 ```{code-cell} ipython3
-beta_vec = np.linspace(-1.0, 5.5, 400)
+---
+mystnb:
+  figure:
+    caption: Scalar phase diagrams
+    name: fig-scalar-phase-diagrams
+---
+β_vec = np.linspace(-1.0, 5.5, 400)
 
 models = [
     ("Bray ($b=0.6$)",       lambda b: a_bray + 0.6*b - b,   a_bray/(1-0.6),   'steelblue'),
     ("Bray–Savin ($a=0.7$)", lambda b: m_bs + 0.7*b - b,     m_bs/(1-0.7),     'darkorange'),
-    ("Present-value",        lambda b: T_pv(b) - b,           beta_f_pv,        'seagreen'),
+    ("Present-value",        lambda b: T_pv(b) - b,           β_f_pv,        'seagreen'),
 ]
 
 fig, axes = plt.subplots(1, 3, figsize=(15, 5))
 
 for ax, (name, ode_fn, bf, color) in zip(axes, models):
-    drift = np.array([ode_fn(b) for b in beta_vec])
-    ax.plot(beta_vec, drift, color=color, lw=2)
-    ax.axhline(0, color='black', lw=0.8)
-    ax.axvline(bf, color='red', ls='--', lw=1.5, label=f'$\\beta_f={bf:.2f}$')
-    ax.fill_between(beta_vec, drift, 0, where=(drift > 0),
+    drift = np.array([ode_fn(b) for b in β_vec])
+    ax.plot(β_vec, drift, color=color, lw=2, label=name)
+    ax.axhline(0, color='black', lw=2)
+    ax.axvline(bf, color='red', ls='--', lw=2, label=f'$\\beta_f={bf:.2f}$')
+    ax.fill_between(β_vec, drift, 0, where=(drift > 0),
                     color=color, alpha=0.12)
-    ax.fill_between(beta_vec, drift, 0, where=(drift < 0),
+    ax.fill_between(β_vec, drift, 0, where=(drift < 0),
                     color=color, alpha=0.12)
     # Draw arrows showing direction of drift
-    for bv in np.linspace(beta_vec[20], beta_vec[-20], 7):
+    for bv in np.linspace(β_vec[20], β_vec[-20], 7):
         d = ode_fn(bv)
         ax.annotate('', xy=(bv + 0.3*np.sign(d), 0),
                     xytext=(bv, 0),
-                    arrowprops=dict(arrowstyle='->', color=color, lw=1.5))
+                    arrowprops=dict(arrowstyle='->', color=color, lw=2))
     ax.set_xlabel('$\\beta$')
     ax.set_ylabel('$T(\\beta) - \\beta$')
-    ax.set_title(name)
     ax.legend(fontsize=9)
 
-plt.suptitle('Phase Diagrams of the Small ODE $\\dot{\\beta} = T(\\beta) - \\beta$',
-             y=1.01, fontsize=13)
 plt.tight_layout()
 plt.show()
 ```
 
-## Two-Dimensional Example: The Investment Model
+## Two-dimensional example: the investment model
 
 The investment-under-uncertainty example is two-dimensional and highlights how
 E-stability of the composite map $T(\beta) = (T_1(\beta_1), T_2(\beta_1, \beta_2))$
 works when the ODE is recursive.
 
 ```{code-cell} ipython3
-def T_invest(beta, b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, rho_w=0.5):
+---
+mystnb:
+  figure:
+    caption: Investment phase portrait
+    name: fig-investment-phase-portrait
+---
+def T_invest(β, b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, ρ_w=0.5):
     """
     Mapping T for the investment model (scalar version of equations 11 in
     Marcet–Sargent 1989).
 
-    beta = [beta1, beta2]
-    T1(beta1) = (1 - beta1*b) / (1 - beta1*b + d^{-1} f^2 A1 N)
-    T2(beta1, beta2) = -N/(d*(1-rho_w*b)) * (1 - beta1*b + f^2 A1 beta2 b*rho_w)
-                       / (1 - beta1*b + d^{-1} f^2 A1 N) * rho_w
+    β = [β1, β2]
+    T1(β1) = (1 - β1*b) / (1 - β1*b + d^{-1} f^2 A1 N)
+    T2(β1, β2) = -N/(d*(1-ρ_w*b)) * (1 - β1*b + f^2 A1 β2 b*ρ_w)
+                       / (1 - β1*b + d^{-1} f^2 A1 N) * ρ_w
     """
-    b1, b2 = beta
+    b1, b2 = β
     denom1 = 1 - b1*b + (1/d)*f**2*A1*N
     T1 = (1 - b1*b) / denom1
-    numer2 = (1 - b1*b + f**2*A1*b2*b*rho_w)
-    T2 = (-N / (d*(1 - rho_w*b))) * (numer2 / denom1) * rho_w
+    numer2 = (1 - b1*b + f**2*A1*b2*b*ρ_w)
+    T2 = (-N / (d*(1 - ρ_w*b))) * (numer2 / denom1) * ρ_w
     return np.array([T1, T2])
 
 
-def ode_invest(t, beta, **kwargs):
-    Tb = T_invest(beta, **kwargs)
-    return Tb - beta
+def ode_invest(t, β, **kwargs):
+    Tb = T_invest(β, **kwargs)
+    return Tb - β
 
 
-# REE: solve T(beta) = beta numerically
+# REE: solve T(β) = β numerically
 from scipy.optimize import fsolve
 
-params = dict(b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, rho_w=0.5)
-beta_f_inv = fsolve(lambda b: T_invest(b, **params) - b, [0.5, 0.1])
-print(f"REE: beta_f = {beta_f_inv}")
+params = dict(b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, ρ_w=0.5)
+β_f_inv = fsolve(lambda b: T_invest(b, **params) - b, [0.5, 0.1])
+print(f"REE: β_f = {β_f_inv}")
 
 # Check E-stability via Jacobian
 from numpy import linalg as la
@@ -910,8 +1019,8 @@ eps = 1e-6
 J = np.zeros((2, 2))
 for j in range(2):
     e = np.zeros(2); e[j] = eps
-    J[:, j] = (T_invest(beta_f_inv + e, **params) -
-               T_invest(beta_f_inv - e, **params)) / (2*eps)
+    J[:, j] = (T_invest(β_f_inv + e, **params) -
+               T_invest(β_f_inv - e, **params)) / (2*eps)
 M = J - np.eye(2)
 eigs = la.eigvals(M)
 print(f"Jacobian M eigenvalues: {eigs}")
@@ -927,8 +1036,8 @@ B1, B2 = np.meshgrid(b1_grid, b2_grid)
 U = np.zeros_like(B1); V_field = np.zeros_like(B2)
 for i in range(B1.shape[0]):
     for j in range(B1.shape[1]):
-        beta_ij = np.array([B1[i,j], B2[i,j]])
-        drift = T_invest(beta_ij, **params) - beta_ij
+        β_ij = np.array([B1[i,j], B2[i,j]])
+        drift = T_invest(β_ij, **params) - β_ij
         U[i,j] = drift[0]; V_field[i,j] = drift[1]
 
 speed = np.sqrt(U**2 + V_field**2)
@@ -940,24 +1049,24 @@ ax.streamplot(b1_grid, b2_grid, U, V_field, color=speed,
 starts = [(0.1, 0.0), (0.9, 0.4), (1.1, -0.6), (0.3, -0.7)]
 colors_traj = ['red', 'darkorange', 'green', 'purple']
 for (b10, b20), col in zip(starts, colors_traj):
-    sol = solve_ivp(ode_invest, [0, 30], [b10, b20],
+    sol = solve_ivp(lambda t, β: ode_invest(t, β, **params),
+                    [0, 30], [b10, b20],
                     t_eval=np.linspace(0, 30, 300),
-                    kwargs=params, method='RK45')
+                    method='RK45')
     ax.plot(sol.y[0], sol.y[1], color=col, lw=2)
     ax.plot(b10, b20, 'o', color=col, ms=7)
 
-ax.plot(*beta_f_inv, 'k*', ms=14, label=f'REE $\\beta_f$')
+ax.plot(*β_f_inv, 'k*', ms=14, label=f'REE $\\beta_f$')
 ax.set_xlabel('$\\beta_1$', fontsize=12)
 ax.set_ylabel('$\\beta_2$', fontsize=12)
-ax.set_title('Investment Model: Phase Portrait of $\\dot{\\beta} = T(\\beta) - \\beta$')
 ax.legend()
 plt.tight_layout()
 plt.show()
 ```
 
-## Necessary Condition: Only REE Can Be Limit Points
+## Necessary condition: only REE can be limit points
 
-Proposition 2(i) of {cite}`MarcetSargent1989jet` shows that **non-REE limit points
+Proposition 2(i) of {cite:t}`MarcetSargent1989jet` shows that **non-REE limit points
 have probability zero**: for any $\hat\beta \neq \beta_f$ in the interior of the
 domain,
 
@@ -974,38 +1083,42 @@ The following simulation makes this vivid by starting agents with an initial
 belief that happens to satisfy $T(\beta_0) \approx \beta_0$ only approximately.
 
 ```{code-cell} ipython3
-# Illustration: starting near a non-fixed-point of T still sends beta to beta_f
+---
+mystnb:
+  figure:
+    caption: Non-REE starts
+    name: fig-non-ree-starts
+---
+# Illustration: starting from a belief that is not a fixed point of T still sends β to β_f
 # (Bray model, stable case b=0.6)
-beta_false_rest = 3.0   # T(3.0) = 1 + 0.6*3 = 2.8 ≠ 3
+β_false_rest = 3.0   # T(3.0) = 1 + 0.6*3 = 2.8 ≠ 3
 paths_from_false = simulate_rls_scalar(
-    T_bray, sigma_bray, beta0=beta_false_rest,
+    T_bray, σ_bray, β0=β_false_rest,
     T_periods=300, N_paths=60, seed=7)
 
 fig, ax = plt.subplots(figsize=(10, 5))
 for i in range(60):
-    ax.plot(paths_from_false[i], color='steelblue', alpha=0.2, lw=0.8)
+    ax.plot(paths_from_false[i], color='steelblue', alpha=0.2, lw=2)
 ax.plot(np.mean(paths_from_false, axis=0), color='navy', lw=2,
         label='cross-path average')
-ax.axhline(beta_f_bray, color='red', ls='--', lw=1.5,
-           label=f'REE $\\beta_f = {beta_f_bray:.2f}$')
-ax.axhline(beta_false_rest, color='gray', ls=':', lw=1.5,
-           label=f'False start $\\beta_0 = {beta_false_rest}$')
+ax.axhline(β_f_bray, color='red', ls='--', lw=2,
+           label=f'REE $\\beta_f = {β_f_bray:.2f}$')
+ax.axhline(β_false_rest, color='gray', ls=':', lw=2,
+           label=f'False start $\\beta_0 = {β_false_rest}$')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
-ax.set_title('RLS from a Non-Equilibrium Start Always Converges to the REE\n'
-             '(Proposition 2(i): only the REE is a possible limit point)')
 ax.legend()
 plt.tight_layout()
 plt.show()
 ```
 
-## Connection to Rational Learning 
+## Connection to rational learning
 
-The {cite}`MarcetSargent1989jet` framework belongs to the programme of learning
+The framework of {cite:t}`MarcetSargent1989jet` belongs to the programme of learning
 *about* a rational expectations equilibrium, as distinct from learning *within*
-one — a distinction emphasised by {cite}`BrayKreps1987`.
+one — a distinction emphasised by {cite:t}`BrayKreps1987`.
 
 **Learning *within* an REE** (the subject of the companion lecture
-{doc}`rational_learning_re`) refers to Bayesian inference inside a correctly
+[](rational_learning_re)) refers to Bayesian inference inside a correctly
 specified model.
 
 In that setting the data-generating process is stationary from
@@ -1018,7 +1131,7 @@ Because the agent's beliefs shift
 the equilibrium price, the data the agent uses to update beliefs are themselves
 generated by a non-stationary process.
 
-As {cite}`MarcetSargent1989jet` note (p.
+As {cite:t}`MarcetSargent1989jet` note (p.
 338, footnote 2):
 
 > *"The models do not incorporate fully optimal behavior or rational expectations,
@@ -1037,16 +1150,25 @@ The E-stability condition thus plays the same role in this literature that the
 prior-support condition plays in the Bayesian learning literature: it tells us
 when the learning algorithm can find its way to the equilibrium.
 
+The paper also marks the limits of the argument.
+
+Stability of the small ODE is a local condition, while global convergence still
+requires the larger ODE and the boundedness or projection assumptions.
+
+The framework also does not directly cover hidden-state or private-information
+models, where agents learn from signals rather than directly observed state
+variables.
+
 
 ## Summary
 
-This lecture has presented the {cite}`MarcetSargent1989jet` framework for analysing
+This lecture has presented the framework of {cite:t}`MarcetSargent1989jet` for analysing
 least squares learning in self-referential linear stochastic models.
 
 Key takeaways:
 
 1. **Self-referential structure**: the actual law of motion depends on the
-   perceived law of motion through the mapping $T$.  A rational expectations
+   perceived law of motion through the mapping $T$, and a rational expectations
    equilibrium is a fixed point $\beta_f = T(\beta_f)$.
 
 2. **Recursive least squares**: agents update their beliefs by running RLS,
@@ -1054,7 +1176,7 @@ Key takeaways:
    assumption that the environment is stationary.
 
 3. **The governing ODE**: the almost-sure limiting behaviour of $\beta_t$ is
-   described by the small ODE $\dot\beta = T(\beta) - \beta$.  Only fixed
+   described by the small ODE $\dot\beta = T(\beta) - \beta$, and only fixed
    points of this ODE (REE) are possible limit points of RLS.
 
 4. **E-stability**: the REE is the almost-sure limit of RLS if and only if
@@ -1068,14 +1190,14 @@ Key takeaways:
 6. **Connection to the rational learning literature**: the RLS algorithm
    studies learning *about* a rational expectations equilibrium; it is
    complementary to the Bayesian learning *within* an REE studied by
-   {cite}`BrayKreps1987`.
+   {cite:t}`BrayKreps1987`.
 
 ## Exercises
 
 ```{exercise}
 :label: ls_ex1
 
-**E-Stability and the Slope of T**
+E-stability and the slope of $T$
 
 Consider the scalar model with $T(\beta) = a + b\beta$.
 
@@ -1085,8 +1207,10 @@ Consider the scalar model with $T(\beta) = a + b\beta$.
 and only if $b < 1$.
 
 (c) Simulate $N = 200$ paths of length $T = 500$ for $a = 1$ and each of
-$b \in \{0.3, 0.7, 0.9, 0.99\}$ (all less than 1).  Plot the cross-path
-average of $\beta_t$ for each $b$ value on the same figure.  Comment on how the
+$b \in \{0.3, 0.7, 0.9, 0.99\}$ (all less than 1).
+
+Plot the cross-path
+average of $\beta_t$ for each $b$ value on the same figure and comment on how the
 rate of convergence changes as $b \to 1$.
 ```
 
@@ -1101,6 +1225,7 @@ $$
 $$
 
 **(b)** The small ODE is $\dot\beta = a + b\beta - \beta = a - (1-b)\beta$.
+
 This is linear with slope $-(1-b)$, so the unique fixed point $\beta_f = a/(1-b)$
 is globally stable iff $1-b > 0$, i.e., $b < 1$.
 
@@ -1113,8 +1238,8 @@ colors_ex = ['steelblue', 'darkorange', 'seagreen', 'purple']
 
 fig, ax = plt.subplots(figsize=(11, 5))
 for b_val, col in zip(b_values, colors_ex):
-    T_fn = lambda beta, bv=b_val: a_ex + bv * beta
-    paths = simulate_rls_scalar(T_fn, sigma_u=1.0, beta0=0.0,
+    T_fn = lambda β, bv=b_val: a_ex + bv * β
+    paths = simulate_rls_scalar(T_fn, σ_u=1.0, β0=0.0,
                                 T_periods=T_ex, N_paths=N_ex, seed=0)
     bf = a_ex / (1 - b_val)
     ax.plot(np.mean(paths, axis=0), color=col, lw=2,
@@ -1137,17 +1262,22 @@ print("return to the fixed point.  Convergence still occurs but takes longer.")
 ```{exercise}
 :label: ls_ex2
 
-**Necessary Condition: Non-REE Limit Points**
+Necessary condition: non-REE limit points
 
-Proposition 2(i) of {cite}`MarcetSargent1989jet` states that $P(\beta_t \to \hat\beta) = 0$
+Proposition 2(i) of {cite:t}`MarcetSargent1989jet` states that $P(\beta_t \to \hat\beta) = 0$
 for any $\hat\beta \neq \beta_f$ in the interior.
 
 (a) Using the Bray model with $a=1$, $b=0.6$, simulate 100 paths of length
-$T = 600$ starting from $\beta_0 = 6$ (far from $\beta_f = 2.5$).  Show that
+$T = 600$ starting from $\beta_0 = 6$ (far from $\beta_f = 2.5$).
+
+Show that
 paths still converge to $\beta_f$.
 
-(b) Now consider the **unstable** case $b = 1.5$.  Simulate 50 paths of length
+(b) Now consider the **unstable** case $b = 1.5$.
+
+Simulate 50 paths of length
 $T = 200$ starting from $\beta_0 = 0.1$ (close to the REE $\beta_f = -2$).
+
 Describe what happens.
 
 (c) For the unstable case, plot the phase diagram and explain geometrically why
@@ -1164,27 +1294,27 @@ the paths diverge.
 fig, axes = plt.subplots(1, 2, figsize=(14, 5))
 
 # (a) far start, stable case
-T_st = lambda beta: 1.0 + 0.6*beta
-paths_far = simulate_rls_scalar(T_st, 1.0, beta0=6.0,
+T_st = lambda β: 1.0 + 0.6*β
+paths_far = simulate_rls_scalar(T_st, 1.0, β0=6.0,
                                 T_periods=600, N_paths=100, seed=1)
 ax = axes[0]
 for i in range(40):
-    ax.plot(paths_far[i], color='steelblue', alpha=0.2, lw=0.8)
+    ax.plot(paths_far[i], color='steelblue', alpha=0.2, lw=2)
 ax.plot(np.mean(paths_far, axis=0), color='navy', lw=2, label='average')
-ax.axhline(2.5, color='red', ls='--', lw=1.5, label='$\\beta_f = 2.5$')
+ax.axhline(2.5, color='red', ls='--', lw=2, label='$\\beta_f = 2.5$')
 ax.set_title('Stable ($b=0.6$): far start still converges')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$'); ax.legend()
 
 # (b) unstable case, start near REE
-T_un = lambda beta: 1.0 + 1.5*beta
-beta_f_un = 1.0 / (1 - 1.5)   # = -2
-paths_un = simulate_rls_scalar(T_un, 1.0, beta0=0.1,
+T_un = lambda β: 1.0 + 1.5*β
+β_f_un = 1.0 / (1 - 1.5)   # = -2
+paths_un = simulate_rls_scalar(T_un, 1.0, β0=0.1,
                                T_periods=200, N_paths=50, seed=2)
 ax = axes[1]
 for i in range(50):
-    ax.plot(paths_un[i], color='crimson', alpha=0.3, lw=0.8)
-ax.axhline(beta_f_un, color='black', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_un}$ (unstable)')
+    ax.plot(paths_un[i], color='crimson', alpha=0.3, lw=2)
+ax.axhline(β_f_un, color='black', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_un}$ (unstable)')
 ax.set_title('Unstable ($b=1.5$): diverges even near REE')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$'); ax.legend()
 
@@ -1195,17 +1325,17 @@ plt.show()
 **(c)** Phase diagram of the unstable case:
 
 ```{code-cell} ipython3
-beta_g = np.linspace(-8, 6, 400)
-drift_un = np.array([1.0 + 1.5*b - b for b in beta_g])
+β_g = np.linspace(-8, 6, 400)
+drift_un = np.array([1.0 + 1.5*b - b for b in β_g])
 
 fig, ax = plt.subplots(figsize=(8, 4))
-ax.plot(beta_g, drift_un, color='crimson', lw=2)
-ax.axhline(0, color='black', lw=0.8)
-ax.axvline(beta_f_un, color='black', ls='--', lw=1.5,
-           label=f'$\\beta_f = {beta_f_un}$')
-ax.fill_between(beta_g, drift_un, 0, where=(drift_un > 0),
+ax.plot(β_g, drift_un, color='crimson', lw=2)
+ax.axhline(0, color='black', lw=2)
+ax.axvline(β_f_un, color='black', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_un}$')
+ax.fill_between(β_g, drift_un, 0, where=(drift_un > 0),
                 color='crimson', alpha=0.15)
-ax.fill_between(beta_g, drift_un, 0, where=(drift_un < 0),
+ax.fill_between(β_g, drift_un, 0, where=(drift_un < 0),
                 color='steelblue', alpha=0.15)
 ax.set_xlabel('$\\beta$'); ax.set_ylabel('$T(\\beta) - \\beta$')
 ax.set_title('Phase Diagram: Unstable REE ($b=1.5$)\n'
@@ -1214,8 +1344,8 @@ ax.legend()
 plt.tight_layout()
 plt.show()
 
-print("Geometrically: the slope dT/d(beta) - 1 = b - 1 = 0.5 > 0 at the REE,")
-print("so the ODE pushes beta AWAY from beta_f in both directions.")
+print("Geometrically: the slope dT/d(β) - 1 = b - 1 = 0.5 > 0 at the REE,")
+print("so the ODE pushes β AWAY from β_f in both directions.")
 ```
 
 ```{solution-end}
@@ -1224,7 +1354,7 @@ print("so the ODE pushes beta AWAY from beta_f in both directions.")
 ```{exercise}
 :label: ls_ex3
 
-**The Present-Value Model: Effect of $\lambda$ on E-Stability**
+The present-value model: effect of $\lambda$ on E-stability
 
 In the present-value model {eq}`eq:pv_model`, $T(\beta) = (\lambda\beta + 1)\rho$
 and the Jacobian is $\mathcal{M} = \lambda\rho - 1$.
@@ -1237,8 +1367,12 @@ and the Jacobian is $\mathcal{M} = \lambda\rho - 1$.
 plot the cross-path average against the ODE solution.
 
 (c) At $\lambda = 1$, $\mathcal{M} = \rho - 1 < 0$ (still E-stable when
-$|\rho| < 1$).  Simulate paths for this case and compare the convergence
-speed with the $\lambda = 0.5$ case.  Provide an intuitive explanation.
+$|\rho| < 1$).
+
+Simulate paths for this case and compare the convergence
+speed with the $\lambda = 0.5$ case.
+
+Provide an intuitive explanation.
 ```
 
 ```{solution-start} ls_ex3
@@ -1248,14 +1382,14 @@ speed with the $\lambda = 0.5$ case.  Provide an intuitive explanation.
 **(a)**
 
 ```{code-cell} ipython3
-rho_ex = 0.9
-lambdas = [0.5, 0.8, 0.95, 1.0]
+ρ_ex = 0.9
+λ_values = [0.5, 0.8, 0.95, 1.0]
 
-print(f"{'lambda':>8}  {'beta_f':>10}  {'M = lam*rho-1':>15}  {'E-stable':>10}")
+print(f"{'lambda':>8}  {'β_f':>10}  {'M = λ*ρ-1':>15}  {'E-stable':>10}")
 print("-" * 50)
-for lv in lambdas:
-    bf = rho_ex / (1 - lv * rho_ex) if abs(lv * rho_ex) < 1 else float('inf')
-    M_jac = lv * rho_ex - 1
+for lv in λ_values:
+    bf = ρ_ex / (1 - lv * ρ_ex) if abs(lv * ρ_ex) < 1 else float('inf')
+    M_jac = lv * ρ_ex - 1
     estab = "YES" if M_jac < 0 else "NO"
     print(f"{lv:>8.2f}  {bf:>10.4f}  {M_jac:>15.4f}  {estab:>10}")
 ```
@@ -1264,42 +1398,39 @@ for lv in lambdas:
 
 ```{code-cell} ipython3
 fig, axes = plt.subplots(2, 2, figsize=(14, 10))
-colors_lam = ['steelblue', 'darkorange', 'seagreen', 'purple']
+colors_λ = ['steelblue', 'darkorange', 'seagreen', 'purple']
 
-for ax, lv, col in zip(axes.flat, lambdas, colors_lam):
-    T_fn = lambda beta, l=lv: (l * beta + 1) * rho_ex
-    ode_fn = lambda beta, l=lv: T_fn(beta, l) - beta
-    bf = rho_ex / (1 - lv * rho_ex) if abs(lv * rho_ex) < 1 else None
+for ax, lv, col in zip(axes.flat, λ_values, colors_λ):
+    T_fn = lambda β, l=lv: (l * β + 1) * ρ_ex
+    ode_fn = lambda β, l=lv: T_fn(β, l) - β
+    bf = ρ_ex / (1 - lv * ρ_ex) if abs(lv * ρ_ex) < 1 else None
 
-    paths_lam = simulate_rls_scalar(T_fn, 1.0, beta0=0.0,
+    paths_λ = simulate_rls_scalar(T_fn, 1.0, β0=0.0,
                                     T_periods=400, N_paths=100, seed=3)
     for i in range(20):
-        ax.plot(paths_lam[i], color=col, alpha=0.2, lw=0.8)
-    ax.plot(np.mean(paths_lam, axis=0), color=col, lw=2, label='RLS average')
+        ax.plot(paths_λ[i], color=col, alpha=0.2, lw=2)
+    ax.plot(np.mean(paths_λ, axis=0), color=col, lw=2, label='RLS average')
 
     if bf is not None:
         # ODE solution
         t_o, sol_o = solve_ode(ode_fn, 0.0, t_span=(0, 400), n_points=400)
-        ax.plot(t_o, sol_o, color='black', ls='--', lw=1.5, label='ODE')
-        ax.axhline(bf, color='red', ls=':', lw=1.2,
+        ax.plot(t_o, sol_o, color='black', ls='--', lw=2, label='ODE')
+        ax.axhline(bf, color='red', ls=':', lw=2,
                    label=f'$\\beta_f={bf:.2f}$')
 
-    M_jac = lv * rho_ex - 1
+    M_jac = lv * ρ_ex - 1
     ax.set_title(f'$\\lambda={lv}$,  $\\mathcal{{M}}={M_jac:.3f}$')
     ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
     ax.legend(fontsize=8)
 
-plt.suptitle('Present-Value Model: Convergence for Different $\\lambda$ Values',
-             y=1.02, fontsize=13)
 plt.tight_layout()
 plt.show()
 
-print("\n(c) When lambda=1, M = rho-1 ≈ -0.1 (small in absolute value).")
-print("    This means the ODE is very 'flat' near beta_f: the restoring force")
+print("\n(c) When lambda=1, M = ρ-1 ≈ -0.1 (small in absolute value).")
+print("    This means the ODE is very 'flat' near β_f: the restoring force")
 print("    is weak and convergence is slow.  When lambda=0.5, M = -0.55,")
 print("    giving a stronger restoring force and faster convergence.")
 ```
 
 ```{solution-end}
 ```
-
diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index 3b291b3f6..cffd1eef6 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -30,124 +30,160 @@ kernelspec:
 
 This lecture explores a classic question in economic theory: can agents **learn** their way to a rational expectations equilibrium?
 
-{cite}`BrayKreps1987` examine this question in a rigorously specified model.
+{cite:t}`BrayKreps1987` examine this question in a rigorously specified model.
 
 In a rational expectations equilibrium, agents use market prices to make inferences about other agents' private information.
 
 Each agent knows the **statistical relationship** between prices and the underlying payoff-relevant variables and that relationship is **correct** given the equilibrium.
 
-But this raises a  question: where does that knowledge come from?
+But this raises a question: where does that knowledge come from?
 
 The **rational learning** approach studied by Bray and Kreps asks whether agents who start with uncertainty about the equilibrium price function can, over time, learn it from observations of past prices.
 
 The key findings are:
 
-* In a benchmark example, a rational (Bayesian) uninformed agent **does learn** the equilibrium price function as data accumulate.
-* The beliefs of the uninformed agent converge (weakly) to a point mass at the true equilibrium parameter.
-* In more general economies, this convergence can fail, especially when **multiple equilibria** exist or when the uninformed agent's model is **misspecified**.
+* In every rational learning model, posterior assessments converge because they are bounded martingales.
+* In the paper's benchmark example, the uninformed agent learns the informed agent's risk tolerance.
+* Correct learning requires identification, smooth equilibrium price maps, and positive prior probability for the true model.
 
-This lecture presents the Bray–Kreps framework, works through their benchmark example in detail, and provides Python code to simulate Bayesian learning dynamics.
+This lecture presents the Bray–Kreps framework, explains their benchmark example, and provides Python code for a simplified Bayesian learning illustration.
 
 
-We focus on  {cite}`BrayKreps1987`, Chapter 19 in *Advances in Economic Theory* (1987), which synthesizes earlier work by {cite}`Bray1982`, {cite}`BraySavin1984`, and the rational expectations literature of {cite}`Radner1979`, {cite}`grossman1976`, and {cite}`Jordan1982`.
+We focus on {cite:t}`BrayKreps1987`, published in *Arrow and the Ascent of Modern Economic Theory*, which synthesizes earlier work by {cite:t}`Bray1982`, {cite:t}`BraySavin1984`, and the rational expectations literature of {cite:t}`Radner1979`, {cite:t}`grossman1976`, and {cite:t}`Jordan1982`.
 
+The local PDF version is the June 1981 Stanford Research Paper version of the same work.
 
 Let's start with the necessary imports.
 
 ```{code-cell} ipython3
 import numpy as np
 import matplotlib.pyplot as plt
-from scipy.stats import norm
-from scipy.integrate import quad
-import warnings
-warnings.filterwarnings('ignore')
 ```
 
-## The Economy
+## The economy
 
-### Agents and Assets
+### Agents and assets
 
-The economy has two types of agents and two assets:
+The paper's example is an infinitely repeated version of the information model in {cite:t}`GrossmanStiglitz1980`.
 
-* A **safe asset** with net return normalized to zero.
-* A **risky asset** traded at date $t$ at spot price $p_t$.
-
-At each date $t = 0, 1, 2, \ldots$ the risky asset yields a gross return $r_t$ that is drawn IID from a distribution with mean zero and variance $\sigma^2$.
+Each date is economically disconnected from the others, so agents start each period afresh.
 
-There are two agents:
+There are two types of agents and two assets:
 
-* **Agent $I$ (Informed)**: observes $r_t$ before trading at date $t$.
-* **Agent $U$ (Uninformed)**: cannot observe $r_t$ directly, but can observe the price $p_t$.
-
-### Preferences
+* A **safe asset** with net return normalized to zero.
+* A **risky asset**, of which each agent is endowed with one unit, traded at date $t$ at spot price $p_t$.
 
-Both agents have von Neumann–Morgenstern utility with coefficient of absolute risk tolerance equal to $2$.
+At each date $t = 0, 1, 2, \ldots$ the risky asset yields a gross return $r_t$ at date $t+1/2$.
 
-Agent $n \in \{I, U\}$ chooses holdings $x^n$ of the risky asset to maximize
+An informed signal $s_t$ satisfies
 
 $$
--e^{-\frac{1}{2}(r_{t+1} + b \cdot y^n_t) x^n}
+r_t = s_t + \epsilon_t,
+\qquad
+\epsilon_t \sim \mathcal N(0,\sigma^2),
 $$
 
-where $y^n_t$ is agent $n$'s information at date $t$, and $b$ is agent $n$'s holding of the safe asset (treated as a budget constraint).
-
-At date $t$:
-* Agent $I$ observes $(r_t, p_t)$, so $y^I_t = (r_t, p_t)$.
-* Agent $U$ observes $p_t$ only, so $y^U_t = p_t$.
+where $\{s_t\}$ and $\{\epsilon_t\}$ are IID normal sequences and are mutually independent.
 
-Net supply of the risky asset is zero, so market clearing requires $x^I_t + x^U_t = 2$ (both normalized to share the supply equally, then net supply equals zero).
+There are two representative agents:
 
-### Rational Expectations Equilibrium
+* **Agent $I$ (informed)** observes $s_t$ before trading at date $t$.
+* **Agent $U$ (uninformed)** observes the equilibrium price $p_t$ but not $s_t$ before trading.
 
-Following {cite}`Radner1979`, a **rational expectations equilibrium** (REE) is a price function $p(\cdot)$ such that:
+Both agents observe the previous return before current trading.
 
-1. Each agent maximizes expected utility given their information (which includes $p_t$).
-2. Markets clear at each date.
-3. The price function is **correct** in the sense that agents' beliefs about the relationship between $p_t$ and $r_t$ are consistent with the actual relationship generated by the equilibrium.
+### Preferences
 
-In this environment, {cite}`grossman1976` showed that under certain conditions a **fully revealing** equilibrium exists in which $p_t$ perfectly reveals $r_t$ to the uninformed agent.
+Agent $n \in \{I, U\}$ has constant absolute risk tolerance $\theta^n$.
 
-The unique rational expectations equilibrium has the linear price function
+If agent $n$ holds $x^n$ units of the risky asset and $y^n$ units of the safe asset between dates $t$ and $t+1/2$, period utility is
 
 $$
-p_t = a + b r_t
-$$ (eq:req_price)
+-\exp\left[-\frac{x^n r_t + y^n}{\theta^n}\right].
+$$
 
-where the coefficients $a$ and $b$ are determined by market clearing and the agents' optimization.
+Thus $1/\theta^n$ is the coefficient of absolute risk aversion.
 
-**Equilibrium coefficient values** (derived from market clearing with risk tolerance $= 2$ and supply normalized to $2$):
+Given the signal $s_t$, the informed agent's demand is
 
 $$
-a = 0, \qquad b = 1
+x^I_t
+=
+\frac{\theta^I}{\sigma^2}(s_t - p_t).
+$$ (eq:bk-informed-demand)
+
+Because each of the two agents is endowed with one unit of the risky asset, market clearing is
+
+$$
+x^I_t + x^U_t = 2.
 $$
 
-so that $p_t = r_t$ (i.e., the price fully reveals the fundamental).
+### Rational expectations equilibrium
 
-More generally, with parameters $(\theta^I, \theta^U)$ denoting risk tolerances and $\sigma^2$ the variance of $r_t$:
+If all agents knew $s_t$, agent $n$ would demand
 
 $$
-b = \frac{\theta^I + \theta^U}{\sigma^2 (\theta^U)^{-1} + \sigma^2 (\theta^I)^{-1}} = \frac{2\sigma^2}{\sigma^2} = 2
+x^n_t = \frac{\theta^n}{\sigma^2}(s_t - p_t).
 $$
 
-Bray and Kreps work with a parametrization in which $b$ takes the value
+With $N$ agents and total risky-asset supply $N$, market clearing gives the **full communication price**
 
 $$
-b = \frac{\theta^U(a_{t-1} + b_{t-1} p_{t-1} - p_{t-1})}{\sigma^2}
+p_t
+=
+s_t
+-
+\frac{N\sigma^2}{\sum_{n=1}^N \theta^n}.
+$$ (eq:bk-full-communication-price)
+
+Thus if $\sum_n \theta^n$ is known, the price fully reveals $s_t$.
+
+Following {cite:t}`Radner1979`, Bray and Kreps call this a full communication rational expectations equilibrium.
+
+The paper's learning problem starts when $\theta^I$ is unknown to agent $U$.
+
+Agent $U$ knows $\sigma^2$ and $\theta^U$, and starts with a prior density over $\theta^I$ on an interval $[a,b] \subset (0,\infty)$.
+
+At a date when agent $U$ has posterior density $f$ over $\theta^I$, his own trade reveals $x^I_t=2-x^U_t$ through market clearing.
+
+Combining this inferred $x^I_t$ with {eq}`eq:bk-informed-demand`, each candidate $\theta^I$ implies
+
 $$
+s_t
+=
+p_t
++
+\frac{\sigma^2 x^I_t}{\theta^I}.
+$$ (eq:bk-signal-implied)
+
+After trading, agent $U$ observes $r_t$.
+
+Bayes' rule then updates the posterior over $\theta^I$ using the normal density of the signal implied by {eq}`eq:bk-signal-implied` conditional on the realized return.
 
-and for concreteness set $\theta^I + \theta^U = \sigma^2 = 1$ so that the equilibrium value is $b^* = 2\sigma^2/(\theta^I + \theta^U) = 2$.
+This is the main object learned in Bray and Kreps' benchmark example.
 
-For the numerical example below we follow Bray–Kreps directly and use:
+They emphasize that the equilibrium can be defined recursively, but closed-form prices are "out of the question" even in this simple case.
 
+## A simplified Gaussian illustration
+
+The code below is a pedagogical simplification of the Bayesian consistency logic.
+
+Instead of solving the full Bray–Kreps equilibrium with a posterior over risk tolerance, it studies a linear observation model
+
+$$
+p_t = b r_t,
 $$
-p_t = a + b r_t, \quad \text{with } a = 0, \; b^* = 2
-$$ (eq:bk_price)
 
-## The Learning Model
+where the single unknown coefficient $b$ plays the role of an identifiable structural parameter.
+
+The point is to illustrate how Bayesian posteriors concentrate when the likelihood is correctly specified and the true parameter is identified by observations.
+
+## The simplified learning model
 
 ### Setup
 
 Agent $U$ **does not know** the equilibrium price function.
+
 Specifically, $U$ does not know $b^*$.
 
 However, $U$ does know:
@@ -157,7 +193,7 @@ However, $U$ does know:
 
 So $U$'s task is to learn the single parameter $b$ from observations of prices and (eventually) returns.
 
-### Observing the Signal
+### Observing the signal
 
 At date $t$, agent $U$ observes $p_t$.
 
@@ -171,7 +207,7 @@ where $b_{t-1}$ is $U$'s current estimate of $b^*$.
 
 After date $t$ trading and before date $t+1$, $U$ observes $r_t$ (the actual return is revealed, say through dividend payments).
 
-### Bayesian Updating
+### Bayesian updating
 
 Agent $U$ begins with a **prior** distribution on $b$:
 
@@ -205,11 +241,11 @@ Equations {eq}`eq:posterior_precision` and {eq}`eq:posterior_mean` follow from t
 Each observation $(r_s, p_s)$ with $p_s = b r_s + 0$ is treated as a noisy signal of $b$ with signal-to-noise ratio $r_s^2 / \sigma^2$.
 
 
-### The Key Convergence Result
+### The simplified convergence result
 
-{cite}`BrayKreps1987` prove the following in their Proposition 1:
+For the simplified Gaussian model, standard Bayesian linear regression implies the following result.
 
-**Proposition (Bray–Kreps):** *For any prior $(μ_0, v_0)$ with $v_0 < \infty$, as $t \to \infty$:*
+**Proposition:** *For any prior $(\mu_0, v_0)$ with $0 < v_0 < \infty$, as $t \to \infty$:*
 
 $$
 \mu_t \xrightarrow{a.s.} b^*, \qquad v_t \xrightarrow{a.s.} 0
@@ -217,6 +253,10 @@ $$
 
 *That is, agent $U$'s posterior distribution on $b$ converges almost surely to a point mass at the true equilibrium value $b^*$.*
 
+This statement is included to make the simulation transparent.
+
+The formal propositions in {cite:t}`BrayKreps1987` are more general martingale convergence results for posterior assessments, and they are discussed below.
+
 The intuition is straightforward:
 
 * Each period adds a new observation $(r_t, p_t)$ with information content proportional to $r_t^2$.
@@ -224,9 +264,9 @@ The intuition is straightforward:
 * Therefore the posterior precision $v_t^{-1} \to \infty$, which means $v_t \to 0$.
 * Since the observations are generated by the true $b^*$, the posterior mean $\mu_t$ converges to $b^*$.
 
-The proof follows from standard results on Bayesian consistency for correctly specified models.
+The proof follows from standard results on Bayesian consistency for correctly specified Gaussian linear models.
 
-## Simulating Bayesian Learning
+## Simulating Bayesian learning
 
 We now implement the Bayesian learning dynamics and verify convergence numerically.
 
@@ -234,86 +274,93 @@ We now implement the Bayesian learning dynamics and verify convergence numerical
 
 ```{code-cell} ipython3
 # True equilibrium parameters
-a_true = 0.0
 b_true = 2.0        # true b* in the REE
 
 # Distribution of fundamentals
-sigma2 = 1.0        # variance of r_t
+σ2 = 1.0            # variance of r_t
 
 # Prior on b
-mu_0  = 0.5         # prior mean (misspecified, true is 2.0)
-v_0   = 2.0         # prior variance (diffuse)
+μ_0 = 0.5           # prior mean (misspecified, true is 2.0)
+v_0 = 2.0           # prior variance (diffuse)
 
 # Simulation settings
-T     = 300         # time periods
-N     = 200         # number of Monte Carlo paths
+T = 300             # time periods
+N = 200             # number of Monte Carlo paths
 
 np.random.seed(42)
 ```
 
-### Bayesian Updating Function
+### Bayesian updating function
 
 ```{code-cell} ipython3
-def simulate_bayesian_learning(b_true, sigma2, mu_0, v_0, T, N):
+def simulate_bayesian_learning(b_true, σ2, μ_0, v_0, T, N):
     """
     Simulate Bayesian learning of the REE slope parameter b*.
 
     Parameters
     ----------
-    b_true  : true equilibrium slope
-    sigma2  : variance of fundamentals r_t
-    mu_0    : prior mean on b
-    v_0     : prior variance on b
-    T       : number of time periods
-    N       : number of Monte Carlo paths
+    b_true : true equilibrium slope
+    σ2     : variance of fundamentals r_t
+    μ_0    : prior mean on b
+    v_0    : prior variance on b
+    T      : number of time periods
+    N      : number of Monte Carlo paths
 
     Returns
     -------
-    mu_paths : array (N, T) of posterior means over time
-    v_paths  : array (N, T) of posterior variances over time
+    μ_paths : array (N, T) of posterior means over time
+    v_paths : array (N, T) of posterior variances over time
     """
     # Draw fundamentals r_t for all paths
-    r = np.random.normal(0, np.sqrt(sigma2), size=(N, T))
+    r = np.random.normal(0, np.sqrt(σ2), size=(N, T))
 
     # Equilibrium prices: p_t = b_true * r_t
     p = b_true * r
 
     # Arrays to store posterior parameters
-    mu_paths = np.empty((N, T))
-    v_paths  = np.empty((N, T))
+    μ_paths = np.empty((N, T))
+    v_paths = np.empty((N, T))
 
     for i in range(N):
         # Initialize prior
         precision = 1.0 / v_0
-        weighted_sum = mu_0 / v_0
+        weighted_sum = μ_0 / v_0
 
         for t in range(T):
             # Each observation: p_s = b * r_s  =>  b = p_s / r_s (when r_s != 0)
-            # Likelihood contribution: precision += r_s^2 / sigma2
-            #                          weighted_sum += r_s * p_s / sigma2
-            precision    += r[i, t]**2 / sigma2
-            weighted_sum += r[i, t] * p[i, t] / sigma2
+            # Likelihood contribution: precision += r_s^2 / σ2
+            #                          weighted_sum += r_s * p_s / σ2
+            precision += r[i, t]**2 / σ2
+            weighted_sum += r[i, t] * p[i, t] / σ2
 
-            v_t  = 1.0 / precision
-            mu_t = v_t * weighted_sum
+            v_t = 1.0 / precision
+            μ_t = v_t * weighted_sum
 
-            mu_paths[i, t] = mu_t
-            v_paths[i, t]  = v_t
+            μ_paths[i, t] = μ_t
+            v_paths[i, t] = v_t
 
-    return mu_paths, v_paths
+    return μ_paths, v_paths
 ```
 
-### Running the Simulation
+### Running the simulation
 
 ```{code-cell} ipython3
-mu_paths, v_paths = simulate_bayesian_learning(
-    b_true, sigma2, mu_0, v_0, T, N
+μ_paths, v_paths = simulate_bayesian_learning(
+    b_true, σ2, μ_0, v_0, T, N
 )
 ```
 
-### Plotting Results
+### Plotting results
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: posterior learning paths
+    name: fig-rle-posterior-learning
+  image:
+    alt: Posterior mean and posterior variance paths over time
+---
 fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
 t_range = np.arange(1, T + 1)
@@ -321,31 +368,29 @@ t_range = np.arange(1, T + 1)
 # --- Left panel: posterior means ---
 ax = axes[0]
 for i in range(min(30, N)):
-    ax.plot(t_range, mu_paths[i, :], color='steelblue', alpha=0.25, lw=0.8)
+    ax.plot(t_range, μ_paths[i, :], color='steelblue', alpha=0.2, lw=2)
 
-ax.plot(t_range, np.mean(mu_paths, axis=0), color='navy', lw=2,
+ax.plot(t_range, np.mean(μ_paths, axis=0), color='navy', lw=2,
         label='cross-path average')
-ax.axhline(b_true, color='red', ls='--', lw=1.5, label=f'$b^* = {b_true}$')
-ax.axhline(mu_0,   color='gray', ls=':',  lw=1.2, label=f'prior mean $= {mu_0}$')
+ax.axhline(b_true, color='red', ls='--', lw=2, label=f'$b^* = {b_true}$')
+ax.axhline(μ_0, color='gray', ls=':', lw=2, label=f'prior mean $= {μ_0}$')
 ax.set_xlabel('$t$')
 ax.set_ylabel('posterior mean $\\mu_t$')
-ax.set_title('Posterior Mean Converges to $b^*$')
 ax.legend()
 
 # --- Right panel: posterior variances ---
 ax = axes[1]
 for i in range(min(30, N)):
-    ax.plot(t_range, v_paths[i, :], color='darkorange', alpha=0.25, lw=0.8)
+    ax.plot(t_range, v_paths[i, :], color='darkorange', alpha=0.2, lw=2)
 
 ax.plot(t_range, np.mean(v_paths, axis=0), color='saddlebrown', lw=2,
         label='cross-path average')
 
-# Theoretical rate: v_t ≈ sigma2 / (t * sigma2) = 1/t  for large t
-ax.plot(t_range, 1.0 / t_range, color='black', ls='--', lw=1.5,
+# Theoretical rate: v_t ≈ σ2 / (t * σ2) = 1/t for large t
+ax.plot(t_range, 1.0 / t_range, color='black', ls='--', lw=2,
         label='$1/t$ (theory)')
 ax.set_xlabel('$t$')
 ax.set_ylabel('posterior variance $v_t$')
-ax.set_title('Posterior Variance Shrinks to 0')
 ax.legend()
 
 plt.tight_layout()
@@ -356,7 +401,7 @@ The left panel shows that regardless of the (misspecified) prior mean, agent $U$
 
 The right panel confirms that the posterior variance vanishes at rate $1/t$, consistent with the formula in {eq}`eq:posterior_precision`.
 
-## The Demand and Equilibrium
+## Demand and equilibrium
 
 To connect the learning story to market equilibrium, we can track how agent $U$'s **equilibrium demand** for the risky asset evolves.
 
@@ -371,23 +416,23 @@ As $\mu_t \to b^*$, this demand function converges to the demand implied by the
 The following code computes the demand trajectories.
 
 ```{code-cell} ipython3
-def compute_demand(mu_t, p_t, sigma2=1.0, theta_U=0.5):
+def compute_demand(μ_t, p_t, σ2=1.0, θ_U=0.5):
     """
-    Compute agent U's demand for the risky asset given beliefs mu_t.
+    Compute agent U's demand for the risky asset given beliefs μ_t.
 
-    x^U = (theta_U / sigma2) * (r_hat - p_t)
-    where r_hat = p_t / mu_t is U's signal extraction.
+    x^U = (θ_U / σ2) * (r_hat - p_t)
+    where r_hat = p_t / μ_t is U's signal extraction.
     """
-    r_hat = p_t / mu_t
-    return (theta_U / sigma2) * (r_hat - p_t)
+    r_hat = p_t / μ_t
+    return (θ_U / σ2) * (r_hat - p_t)
 
 # Single representative path
 i_rep = 0
-r_rep = np.random.normal(0, np.sqrt(sigma2), T)
+r_rep = np.random.normal(0, np.sqrt(σ2), T)
 p_rep = b_true * r_rep
 
 demand_path = np.array([
-    compute_demand(mu_paths[i_rep, t], p_rep[t])
+    compute_demand(μ_paths[i_rep, t], p_rep[t])
     for t in range(T)
 ])
 
@@ -397,79 +442,97 @@ demand_ree = np.array([
     for t in range(T)
 ])
 
+```
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: demand convergence
+    name: fig-rle-demand-convergence
+  image:
+    alt: Learning demand and rational expectations demand over time
+---
 fig, ax = plt.subplots(figsize=(10, 5))
 ax.plot(t_range, demand_path, color='steelblue', alpha=0.7,
-        label="$x^U_t$ (learning)")
-ax.plot(t_range, demand_ree, color='red', ls='--', lw=1.5,
+        lw=2, label="$x^U_t$ (learning)")
+ax.plot(t_range, demand_ree, color='red', ls='--', lw=2,
         label="$x^U_t$ (REE)")
 ax.set_xlabel('$t$')
 ax.set_ylabel("agent $U$'s demand $x^U_t$")
-ax.set_title("Demand Converges to REE Demand as $\\mu_t \\to b^*$")
 ax.legend()
 plt.tight_layout()
 plt.show()
 ```
 
-## When Does Learning Fail?
+## Two toy extensions
 
-The convergence result above relies on several assumptions that may fail in richer environments.
+The next two simulations are not in Bray and Kreps.
 
-{cite}`BrayKreps1987` identify several obstacles:
+They are included as small numerical illustrations of themes that appear in the paper: identification and feedback from beliefs to prices.
 
-### 1. Multiple Equilibria
+### 1. Two possible parameters
 
-When there are multiple rational expectations equilibria, the uninformed agent's beliefs may converge to the **wrong** equilibrium.
+First suppose the simplified linear model can be generated by one of two possible values of $b^*$.
 
-In the example with two potential equilibrium parameters $b_1^*$ and $b_2^*$, the agent's posterior mean can converge to either one depending on the history.
+If the data identify which value is operating, Bayesian learning separates the two cases.
 
-The following code illustrates this with a mixture prior.
+The following code illustrates this point with a mixture prior.
 
 ```{code-cell} ipython3
-def simulate_two_equilibria(b_values, sigma2, T, N, seed=0):
+def simulate_two_parameters(b_values, σ2, T, N, seed=0):
     """
-    Simulate learning when the prior is spread over two possible equilibrium values.
-    Nature uses b_values[0] as the true equilibrium with probability 0.5.
+    Simulate learning when the prior is spread over two possible parameter values.
+    Nature draws the true value from b_values.
     """
     rng = np.random.default_rng(seed)
     b_true_draw = rng.choice(b_values, size=N)
 
-    mu_paths_all = np.empty((N, T))
+    μ_paths_all = np.empty((N, T))
 
     for i in range(N):
         b_i = b_true_draw[i]
-        r = rng.normal(0, np.sqrt(sigma2), T)
+        r = rng.normal(0, np.sqrt(σ2), T)
         p = b_i * r
 
         # Diffuse prior centered between the two equilibria
-        mu_prior    = np.mean(b_values)
-        prec_prior  = 1.0 / 4.0
-        w_sum       = mu_prior * prec_prior
-        prec        = prec_prior
+        μ_prior = np.mean(b_values)
+        prec_prior = 1.0 / 4.0
+        w_sum = μ_prior * prec_prior
+        prec = prec_prior
 
         for t in range(T):
-            prec  += r[t]**2 / sigma2
-            w_sum += r[t] * p[t] / sigma2
-            mu_paths_all[i, t] = w_sum / prec
+            prec += r[t]**2 / σ2
+            w_sum += r[t] * p[t] / σ2
+            μ_paths_all[i, t] = w_sum / prec
 
-    return mu_paths_all, b_true_draw
+    return μ_paths_all, b_true_draw
 
 b_values = [1.0, 3.0]
-mu_two, b_drawn = simulate_two_equilibria(b_values, sigma2=1.0, T=200, N=300)
+μ_two, b_drawn = simulate_two_parameters(b_values, σ2=1.0, T=200, N=300)
+```
 
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: two-parameter learning
+    name: fig-rle-two-parameters
+  image:
+    alt: Posterior mean paths converging to two possible parameter values
+---
 fig, ax = plt.subplots(figsize=(10, 5))
 
 colors = {b_values[0]: 'steelblue', b_values[1]: 'darkorange'}
 for i in range(len(b_drawn)):
     c = colors[b_drawn[i]]
-    ax.plot(np.arange(1, 201), mu_two[i, :], color=c, alpha=0.15, lw=0.6)
+    ax.plot(np.arange(1, 201), μ_two[i, :], color=c, alpha=0.1, lw=2)
 
 for bv, c in colors.items():
     ax.axhline(bv, color=c, ls='--', lw=2, label=f'$b^* = {bv}$')
 
 ax.set_xlabel('$t$')
 ax.set_ylabel('posterior mean $\\mu_t$')
-ax.set_title('Learning with Two Possible Equilibria\n'
-             '(blue paths: true $b^*=1$; orange paths: true $b^*=3$)')
 ax.legend()
 plt.tight_layout()
 plt.show()
@@ -477,293 +540,294 @@ plt.show()
 
 As expected, agent $U$ learns the **correct** equilibrium as long as the model is correctly specified and the true equilibrium generates the data.
 
-The more subtle failure mode, identified by Bray and Kreps, arises when agents' learning rules themselves **change the equilibrium**, creating a feedback loop that may or may not converge.
+The paper's non-identification example is different: with two informed agents, prices can reveal only the sum of their risk tolerances.
 
-### 2. Self-Referential Learning Dynamics
+### 2. A self-referential price rule
+
+The next toy model lets the price at date $t$ depend directly on agent $U$'s current belief $\mu_t$.
 
-In the fully general setting, the price at date $t$ depends on $U$'s current beliefs $\mu_t$.
 But $\mu_t$ is updated based on past prices.
 
 This creates a **self-referential** system: beliefs drive prices, and prices update beliefs.
 
-{cite}`BrayKreps1987` show (their Proposition 2 and Section 5) that this feedback can lead to **non-stationary** dynamics and that convergence to the rational expectations equilibrium requires additional conditions. Essentially, the economy "settles down" to a stationary relationship before agents learn the parameters of that relationship.
+This is a deliberately simple stand-in for the paper's warning that learning changes behavior, which changes the data that agents observe.
 
-The next section illustrates the self-referential dynamics.
+The formal Bray–Kreps model handles this by making the whole price process part of a grand rational expectations equilibrium over an expanded state space.
 
 ```{code-cell} ipython3
-def simulate_self_referential(b_true, sigma2, mu_0, v_0, T, N,
-                              alpha_demand=0.5):
+def simulate_self_referential(b_true, σ2, μ_0, v_0, T, N,
+                              α_demand=0.5):
     """
     Simulate the self-referential learning model where prices depend on
-    current beliefs mu_t.
+    current beliefs μ_t.
 
-    p_t = b_true * r_t + alpha_demand * (mu_t - b_true) * r_t
+    p_t = b_true * r_t + α_demand * (μ_t - b_true) * r_t
 
     This captures the idea that as U's beliefs deviate from b*, the
     equilibrium price is distorted.
     """
     rng = np.random.default_rng(10)
-    r_all = rng.normal(0, np.sqrt(sigma2), (N, T))
+    r_all = rng.normal(0, np.sqrt(σ2), (N, T))
 
-    mu_paths_sr = np.empty((N, T))
-    p_paths_sr  = np.empty((N, T))
+    μ_paths_sr = np.empty((N, T))
+    p_paths_sr = np.empty((N, T))
 
     for i in range(N):
-        prec  = 1.0 / v_0
-        w_sum = mu_0 / v_0
-        mu_t  = mu_0
+        prec = 1.0 / v_0
+        w_sum = μ_0 / v_0
+        μ_t = μ_0
 
         for t in range(T):
             r_t = r_all[i, t]
             # Price is partly driven by current beliefs
-            p_t = b_true * r_t + alpha_demand * (mu_t - b_true) * r_t
+            p_t = b_true * r_t + α_demand * (μ_t - b_true) * r_t
 
             # Update beliefs with this price
-            prec  += r_t**2 / sigma2
-            w_sum += r_t * p_t / sigma2
-            mu_t   = w_sum / prec
+            prec += r_t**2 / σ2
+            w_sum += r_t * p_t / σ2
+            μ_t = w_sum / prec
 
-            mu_paths_sr[i, t] = mu_t
-            p_paths_sr[i, t]  = p_t
+            μ_paths_sr[i, t] = μ_t
+            p_paths_sr[i, t] = p_t
 
-    return mu_paths_sr, p_paths_sr
+    return μ_paths_sr, p_paths_sr
 
-mu_sr, p_sr = simulate_self_referential(
-    b_true, sigma2, mu_0, v_0, T=200, N=100, alpha_demand=0.3
+μ_sr, p_sr = simulate_self_referential(
+    b_true, σ2, μ_0, v_0, T=200, N=100, α_demand=0.3
 )
+```
 
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: self-referential learning
+    name: fig-rle-self-referential
+  image:
+    alt: Self-referential posterior means and price paths over time
+---
 fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
 ax = axes[0]
 for i in range(30):
-    ax.plot(np.arange(1, 201), mu_sr[i, :], color='steelblue', alpha=0.3, lw=0.8)
-ax.plot(np.arange(1, 201), np.mean(mu_sr, axis=0), color='navy', lw=2,
+    ax.plot(np.arange(1, 201), μ_sr[i, :], color='steelblue', alpha=0.2, lw=2)
+ax.plot(np.arange(1, 201), np.mean(μ_sr, axis=0), color='navy', lw=2,
         label='average $\\mu_t$')
-ax.axhline(b_true, color='red', ls='--', lw=1.5, label=f'$b^* = {b_true}$')
+ax.axhline(b_true, color='red', ls='--', lw=2, label=f'$b^* = {b_true}$')
 ax.set_xlabel('$t$')
 ax.set_ylabel('$\\mu_t$')
-ax.set_title('Self-Referential Learning: Posterior Means')
 ax.legend()
 
 ax = axes[1]
 for i in range(30):
-    ax.plot(np.arange(1, 201), p_sr[i, :], color='darkorange', alpha=0.2, lw=0.6)
+    ax.plot(np.arange(1, 201), p_sr[i, :], color='darkorange', alpha=0.15, lw=2)
 ax.plot(np.arange(1, 201), np.mean(np.abs(p_sr), axis=0), color='saddlebrown', lw=2,
         label='average $|p_t|$')
 ax.set_xlabel('$t$')
 ax.set_ylabel('$p_t$')
-ax.set_title('Self-Referential Learning: Price Paths')
 ax.legend()
 
 plt.tight_layout()
 plt.show()
 ```
 
-## Convergence of Beliefs: The General Result
-
-Section 3 of {cite}`BrayKreps1987` proves the following general convergence theorem.
+## Convergence of beliefs
 
-Let $\Omega$ be the set of states of nature, $G$ a probability measure, and $H_t$ the sigma-field generated by the history up to date $t$.
+Section 3 of {cite:t}`BrayKreps1987` proves two general convergence results.
 
-Suppose agent $U$ maintains a prior $P$ over the probability space $(\Omega, G)$.
+Let $\Omega$ be the underlying state space, and let $H_t^n(p)$ be the information generated for agent $n$ by private information and observed equilibrium prices up to date $t$.
 
-**Proposition 2 (Bray–Kreps, Rational Learning of Beliefs):**
-
-*Under regularity conditions, for any event $A$:*
+For any event $A$, the posterior assessment
 
 $$
-P(A \mid H_t) \xrightarrow{a.s.} \mathbf{1}_A
+P^n(A \mid H_t^n(p))
 $$
 
-*That is, conditional probabilities converge almost surely to the truth.*
+is a bounded martingale in $t$.
+
+Their Proposition 1 is therefore
 
-This is the **Bayesian consistency** result: a rational Bayesian agent who assigns positive prior probability to the truth will eventually learn it.
+$$
+P^n(A \mid H_t^n(p))
+\xrightarrow{a.s.}
+P^n(A \mid H_\infty^n(p)),
+\qquad
+H_\infty^n(p)=\bigvee_{t \geq 0} H_t^n(p).
+$$
 
-The key caveat: the agent must assign **positive prior probability** to the true data-generating process.
-If the agent's model is misspecified --- if the true equilibrium is outside the support of the agent's prior --- convergence to the truth is not guaranteed.
+This is convergence of posterior assessments, not yet convergence to the truth.
 
-A corollary to this general result is that for the specific model described above, the uninformed agent's posterior on $b$ converges to the truth as long as the prior assigns positive density to a neighborhood of $b^*$.
+If two agents' priors are mutually singular, the almost-sure statements need not hold on a common objective-probability set.
 
-## Convergence to a Stationary Rational Expectations Equilibrium
+If their priors have the same null sets, simultaneous convergence is obtained outside a common null set.
 
-Section 4 of {cite}`BrayKreps1987` specializes the convergence results to the context of rational expectations equilibria in markets.
+Their Proposition 2 strengthens the result from events to whole posterior distributions.
 
-The main result (Proposition 3) states that even in large general-equilibrium economies with $N$ agents and $M$ assets, agents' beliefs converge weakly to a stationary rational expectations equilibrium, provided that:
+When the parameter space is a complete separable metric space with its Borel sigma-field, regular posterior measures over that parameter space converge weakly almost surely.
 
-1. Agents form **rational (Bayesian) forecasts** given their information.
-2. The equilibrium is **unique** (no multiplicity problem).
-3. The model is **correctly specified** in the sense that the true equilibrium lies in the support of agents' priors.
+Thus rational Bayesian learning always produces a limiting posterior, but additional identification assumptions are needed to say that the limiting posterior is correct.
 
-The formal statement requires some notation.
+## Identification in the Section 2 example
 
-Let $\theta$ be the vector of unknown parameters of the economy (e.g., preferences, endowments), and let $\phi$ be the state space.
+Section 4 returns to the two-agent example in which agent $U$ is uncertain about $\theta^I$.
 
-Denote by $F_t(\cdot; \theta)$ the agents' conditional distribution function for $\theta$ at date $t$.
+Let $F_t$ be agent $U$'s posterior distribution over $\theta^I$ after observing the previous price, allocation, and return data.
 
-**Theorem (Convergence to REE):**
-*If conditions (1)–(3) hold, then $F_t(\cdot; \theta)$ converges weakly (P–a.s.) to a point mass at the true $\theta^*$, and equilibrium prices and allocations converge to those of the REE.*
+By the weak-convergence result of Proposition 2, $F_t$ converges weakly, almost surely, to a limiting distribution $F_\infty$.
 
-The proof involves three steps:
+Bray and Kreps then show why this limiting posterior must be a point mass at the true $\theta^I$ in their example.
 
-* **Step 1A**: The conditional probability $P(A \mid H_t)$ forms a martingale with respect to $H_t$ (by the law of iterated expectations).
-* **Step 1B**: The martingale converges a.s. by Doob's martingale convergence theorem.
-* **Step 2**: The equilibrium price function, which maps $(p, \theta)$ space to prices, is continuous (under a linear model assumption).
-* **Step 3–4**: By combining Step 1 and Step 2, the joint distribution of prices and beliefs converges.
+The argument has three parts.
 
-## Obstacles to Convergence
+First, because current equilibrium demand is continuous in the posterior distribution, prices converge to a limiting price functional
 
-While the positive convergence results are elegant, {cite}`BrayKreps1987` are careful to document when learning **fails** to produce convergence to REE.
+$$
+p_\infty(s_t; F_\infty, \theta^I, \theta^U).
+$$
 
-### Obstacle 1: Multiple Equilibria
+Second, since the signals are IID, the empirical distribution of observed prices converges to the distribution of this limiting price functional.
 
-When the economy admits multiple rational expectations equilibria, agents learning within one equilibrium may receive price signals that are informative about the *current* equilibrium but not necessarily about which equilibrium will prevail in the long run.
+Third, in this example that limiting price distribution is stochastically decreasing in $\theta^I$ when $F_\infty$ and $\theta^U$ are fixed.
 
-A concrete example: suppose there are two spot market equilibria for some payoff-relevant variable $\theta$: one equilibrium at $\theta_1$ and another at $\theta_2$.
+Therefore the long-run distribution of prices identifies the true value of $\theta^I$.
 
-The informed agents choose randomly among these each period (since they are indifferent).
+This is the paper's concrete route from convergence of beliefs to convergence to correct beliefs.
 
-The uninformed agent's posterior mean can never converge to a single value. 
+It relies on smoothness, ergodicity, and identification, rather than on martingale convergence alone.
 
-It will bounce between neighborhoods of $\theta_1$ and $\theta_2$.
+## Obstacles to convergence
 
-### Obstacle 2: Non-Stationarity of Beliefs
+While the positive convergence results are elegant, {cite:t}`BrayKreps1987` are careful to document when learning **fails** to produce convergence to REE.
 
-Even if the economy has a unique REE, if agents' beliefs are updating over time, the **realized** price process is non-stationary.
+### Obstacle 1: price maps might not settle down
 
-In that case, past data provides **biased** information about the future.
+The step from weak convergence of posteriors to convergence of prices requires smoothness of the equilibrium price functional.
 
-This is a **philosophical problem** with the idea of learning in equilibrium: one cannot use data generated by a learning process (in which prices depend on beliefs that are changing) to learn the *stationary* equilibrium relationship.
+Bray and Kreps stress that establishing this smoothness can be hard, because small changes in a price function can produce large changes in the information communicated by prices.
 
-### Obstacle 3: Misspecified Models
+Thus martingale convergence of beliefs does not by itself guarantee that the economy settles into a stationary price relation.
 
-If $U$'s prior assigns zero probability to $b^*$, that is if $U$'s model is misspecified, then convergence to $b^*$ is impossible by Bayesian consistency.
+### Obstacle 2: prices might not identify the full parameter
 
-{cite}`BrayKreps1987` note (p. 622) that this is a subtle but important caveat: convergence is guaranteed only when the "true $\theta$ may lie outside the set of states $\Omega$" to which the agent's prior assigns positive probability is not the case.
+Even if prices settle down, the long-run distribution of prices need not identify every structural parameter.
 
-## Learning *Within* versus Learning *About* a Rational Expectations Equilibrium
+The paper gives a simple variant with two informed agents whose risk tolerances $\theta^{I1}$ and $\theta^{I2}$ are both unknown to the uninformed agent.
 
-One of the deepest conceptual contributions of {cite}`BrayKreps1987` is a distinction they draw in their concluding section between two fundamentally different notions of learning in a rational expectations context.
+In that case, prices reveal only the sum $\theta^{I1}+\theta^{I2}$.
 
-### The Distinction
+The uninformed agent cannot disentangle the two risk tolerances from price data alone.
 
-**Learning *within* a rational expectations equilibrium** is the subject of this lecture.
-The phrase refers to Bayesian inference that takes place *inside* a correctly specified model of the economy.
+For decisions in that example, learning the sum is enough, but it is not learning the full state.
 
-The uninformed agent knows the true structural form of the price function (that it is linear, that $a = 0$), knows the true distribution of fundamentals, and entertains uncertainty only about the single unknown parameter $b^*$.
+### Obstacle 3: the truth might be outside the model
 
-Because the true $b^*$ lies in the support of agent $U$'s prior, the agent's model is **correctly specified**.
+Section 5 compares the paper's rational-learning model with an example of {cite:t}`BlumeEasley1982`.
 
-The Bayesian updating rule is therefore fully rationalized: it is exactly what a rational agent with a correct model would do.
+In that example, agents can converge to an incorrect model because the true stable price relation has zero prior probability under the models they entertain.
 
-Convergence of beliefs to $b^*$ then follows from the standard Bayesian consistency theorem (Proposition 2 of {cite}`BrayKreps1987`).
+Bray and Kreps argue that this cannot occur in their rational-learning formulation except on a prior-null event.
 
-**Learning *about* a rational expectations equilibrium** is a quite different enterprise.
-Here the agent does not know the statistical relationship between prices and fundamentals, and that relationship is itself an *endogenous* object
+The reason is that rational learning puts the possible price relations generated by the expanded state space inside the Bayesian model from the start.
 
-It is determined in equilibrium by the very beliefs the agent is trying to learn.
+## Learning *within* versus learning *about* a rational expectations equilibrium
 
-As Bray and Kreps put it (p. 601):
+One of the deepest conceptual contributions of {cite:t}`BrayKreps1987` is a distinction they draw in their concluding section between two fundamentally different notions of learning in a rational expectations context.
 
-> *"The question is whether this sequence of stationary relationships, and the equilibrium it engenders, will converge to some stationary relationship, and then agents can learn that stationary relationship long enough to hold on to their initial beliefs."*
+### The distinction
 
-The difficulty is that during the learning phase, agents' beliefs are changing, which changes the equilibrium price function, which changes the data used to update beliefs.
+**Learning *within* a rational expectations equilibrium** is the subject of this lecture.
 
-The learning process and the equilibrium are **simultaneously evolving**, so the data are generated by a **non-stationary** process that is itself a function of beliefs.
+The phrase refers to Bayesian inference that takes place *inside* a correctly specified model of the economy.
 
-### Why Learning *About* an REE Requires Non-Bayesian Updating
+In Bray and Kreps' rational-learning formulation, agents are uncertain about parameters such as other agents' risk tolerances.
 
-This simultaneity creates a fundamental obstacle to fully rational Bayesian learning.
+But for every possible parameter realization, they are assumed to know the equilibrium price and allocation maps.
 
-To see why, suppose agent $U$ attempts to learn $b^*$ by treating the problem as Bayesian inference in a fixed, correctly specified model.
+Their Bayesian learning model is therefore a large rational expectations equilibrium over an expanded state space.
 
-For that to be valid, the agent would need to know:
+This is why the martingale convergence theorem can be applied so cleanly.
 
-1. The true structural form of the price function (which depends on the equilibrium).
-2. The distribution of prices conditional on the unknown parameter (which also depends on the equilibrium).
+**Learning *about* a rational expectations equilibrium** is a quite different enterprise.
 
-But both of these are themselves functions of the equilibrium that agent $U$ is trying to learn.
+Here agents do not begin with the equilibrium map already embedded in their model.
 
-If $U$'s beliefs at date $t$ are $\mu_t \neq b^*$, then $U$'s model of the price process is **misspecified**.
+Instead, they try to infer the price-state relation from data generated while beliefs and behavior are changing.
 
-The prices generated in the economy reflect other agents' optimization given the *actual* (possibly non-stationary) beliefs of $U$, not the stationary REE price function that $U$ is treating as fixed.
+This is the original problem mentioned at the start of the paper: learning changes behavior, and behavior changes the price-state relation being learned.
 
-Thus, the agent's model can be correctly specified *only if* the economy is already at the rational expectations equilibrium.
+### Why rational learning has limited reach
 
-But if the economy were already there, there would be nothing to learn.
+Bray and Kreps call the expanded-state-space formulation natural but also identify its main flaw.
 
-Bray and Kreps make this point sharply at the end of Section 5 (p. 620):
+It avoids the question of how agents learn the relation between prices and states by assuming that agents already know the equilibrium for every possible economy in the state space.
 
-> *"Note that it is unnecessary to tell U about the allocation contained in previous and current equilibria information, all the information that U could exceed the amount of information in equilibrium prices* [because] *information contained in those equilibrium prices could reflect more information than all agents put together possess.*"
+In their conclusion, they say that their results do not satisfactorily answer the question "How does a rational expectations equilibrium come about?"
 
-And in their concluding section they observe that the rational-learning model is:
+The reason is not that Bayesian convergence is false.
 
-> *"...concerned with learning* within *and learning* about *an equilibrium, and then the sense of* rational learning *within* ... is equivalent to* rational learning about *in some sense other than as formally equivalent to* rational expectations equilibrium."*
+The reason is that the Bayesian agents must have extraordinary insight into the structure of the economy and the implied probabilities of events.
 
-The distinction is that learning *within* an REE, our Bayesian model above, is consistent with full rationality because the agent's model is correct.
+This is why the paper is useful both as a benchmark and as a warning.
 
-Learning *about* an REE, by contrast, requires the agent to use data generated by a **non-stationary** process as if it were generated by a stationary REE, which is a form of model misspecification that cannot be rationalized as Bayesian updating with a correct prior.
+It gives sharp restrictions on what rational learning can imply, but it does not provide a plausible behavioral story for attaining rational expectations.
 
-### The Role of "Irrational" Learning Algorithms
+### The role of "irrational" learning algorithms
 
-This explains why the literature on learning *about* rational expectations equilibria --- going back to {cite}`Bray1982` and {cite}`BraySavin1984`, and extended in the influential work of {cite}`MarcetSargent1989` --- tends to rely on **ordinary least squares (OLS)** or other adaptive algorithms rather than Bayes' rule.
+This explains why the literature on learning *about* rational expectations equilibria --- going back to {cite:t}`Bray1982` and {cite:t}`BraySavin1984`, and extended in the influential work of {cite:t}`MarcetSargent1989jet` --- tends to rely on **ordinary least squares (OLS)** or other adaptive algorithms rather than Bayes' rule.
 
 ```{note}
-{cite}`MarcetSargent1989` use some theorems about stochastic approximation to extend some of Bray and 
-Savin's results to other settings.
+{cite:t}`MarcetSargent1989jet` use some theorems about stochastic approximation to extend some of Bray and Savin's results to other settings.
 ```
 
-In those models, agent $U$ runs a regression of observed prices on observed fundamentals, updating the estimated coefficient as new data arrive.
+In those models, agents estimate perceived laws of motion from observed data and update the estimates as new observations arrive.
 
-OLS is consistent and computationally tractable, but it is *not* the optimal rule for an agent who knows the true data-generating process.
+Such rules are computationally tractable and can converge in important examples.
 
-It is, as Bray and Kreps call it, a form of **"irrational" learning**: rational in the limited sense of using past data intelligently, but not derivable from Bayes' theorem applied to a correctly specified model.
+But they are **"irrational"** in Bray and Kreps' specific sense.
 
-An OLS learner implicitly assumes the data-generating process is stationary --- that is to say the relationship between prices and fundamentals is the same in every period.
+For an agent who already understood the full equilibrium model, those rules would not generally be the Bayesian optimum.
 
-But during the learning transition, it is not: the price function shifts as beliefs shift.
+The attraction of these rules is precisely that they ask a different question.
 
-OLS ignores this, treating past and present observations as exchangeable draws from a fixed distribution.
+They ask whether agents using standard statistical procedures on the data generated by the model could eventually learn to form rational expectations.
 
-This is a misspecification, and the resulting estimates are biased in finite samples, even if they converge in the long run.
+Bray and Kreps are skeptical that rational Bayesian learning is behaviorally plausible, but they also use it to discipline adaptive learning stories.
 
-Bray and Kreps note (pp. 598–599) that in the models studied by {cite}`Bray1982` and {cite}`BraySavin1984`:
+Their proposed discipline is that a stationary limiting equilibrium should not leave agents' beliefs systematically contradicted by observations.
 
-> *"Agents are doing Bayesian updating, but their model is, almost by construction, wrong --- they are learning as if the environment were stationary when it is not."*
+In the long run, they argue, equilibrium expectations must either keep changing or become rational.
 
 There is a fundamental **epistemic tension** at the heart of learning about rational expectations equilibria:
 
 * A fully rational (Bayesian, correctly specified) learner can only apply Bayes' rule to a model whose structure is *already known* but the structure of the REE is exactly what the agent is trying to learn.
 * A learner who uses an adaptive algorithm (OLS, least-mean-squares, etc.) can potentially converge to the REE, but only by using a rule that cannot be derived from Bayesian rationality applied to a correctly specified model.
 
-The benchmark model in this lecture avoids this tension by assumption: agent $U$ knows the structural form of the price function and needs only to learn one parameter.
+The Bray--Kreps rational-learning model avoids this tension by assumption: agent $U$ knows how each possible risk tolerance would map histories into equilibrium prices and trades.
 
-That is learning *within* an REE, a clean, tractable, and fully rational exercise, but it is also a special case that sidesteps the deeper difficulty of learning *about* an REE from scratch.
+The simplified Gaussian code example avoids it even more directly by replacing the equilibrium calculation with a fixed linear observation equation.
+
+Both devices make Bayesian consistency transparent, but both sidestep the deeper difficulty of learning *about* an REE from scratch.
 
 
 ## Summary
 
-This lecture has discussed  ideas from {cite}`BrayKreps1987`:
+This lecture has discussed ideas from {cite:t}`BrayKreps1987`:
 
-1. **Rational expectations equilibria** require agents to know the statistical relationship between prices and fundamentals but this knowledge is typically assumed, not derived.
+1. **Rational learning** is modeled by expanding the state space to include unknown structural parameters such as risk tolerances.
 
-2. **Rational learning** asks whether Bayesian agents can *learn* the REE from data.
-   In a benchmark linear model, the answer is yes: the uninformed agent's posterior on the slope parameter $b^*$ converges almost surely to the truth.
+2. **Posterior assessments converge** because conditional probabilities form bounded martingales.
 
-3. The convergence relies on **Bayesian consistency**: the uninformed agent accumulates sufficient information to identify $b^*$ from observed prices and returns.
+3. **Posterior measures converge weakly** under standard topological assumptions on the parameter space.
 
-4. Convergence can **fail** when:
-   - There are **multiple equilibria** and agents' learning rules interact with equilibrium selection.
-   - The agent's **model is misspecified** (prior assigns zero weight to the truth).
-   - The learning process generates **non-stationary** prices that contaminate inference.
+4. **Correct learning** requires more than martingale convergence, because the limiting price distribution must identify the true parameter.
 
-5. A **general convergence theorem** guarantees that under correct specification and unique equilibria, Bayesian posteriors converge weakly to a point mass at the truth.
+5. **In the paper's two-agent example**, the uninformed agent learns the informed agent's risk tolerance because the limiting price distribution is monotone in that parameter.
 
-6. **Learning *within* versus *about* an REE** is a crucial distinction.
-   The benchmark model in this lecture exemplifies learning *within* an REE: agent $U$ knows the structural form of the price function and uses a correctly specified Bayesian model.
+6. **Identification can fail** when prices reveal only a composite parameter, such as the sum of two informed agents' risk tolerances.
 
-   Learning *about* an REE, where the equilibrium price function is itself the unknown object, is fundamentally harder, because the data-generating process shifts as beliefs shift.
+7. **Misspecification matters** because a stable price relation outside the learner's prior support cannot be learned by Bayes' rule.
 
-   This non-stationarity means that learning *about* an REE cannot in general be rationalized as Bayes' rule applied to a correctly specified model, which is why the literature on this topic relies on adaptive algorithms such as OLS rather than fully Bayesian updating.
+8. **The simplified Gaussian simulation** illustrates posterior concentration in a fixed correctly specified model, not the full Bray--Kreps equilibrium calculation.
 
 The broader message of Bray and Kreps is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle and the conditions under which learning succeeds are more restrictive than they might appear.
 
@@ -807,6 +871,7 @@ $$
 So $v_t^{-1} \sim t$ and $v_t \to 0$ almost surely.
 
 **(b)** From the above, $t \cdot v_t^{-1} \to 1$ implies $t \cdot v_t \to 1 / 1 = 1 / \sigma^2 \cdot \sigma^2 = 1$ when $\sigma^2 = 1$.
+
 More precisely, $t \cdot v_t \to \sigma^2 / \sigma^2 = 1$ (since $v_t \approx \sigma^2 / (t \sigma^2) = 1/t$ for large $t$ when $\sigma^2 = 1$).
 
 So $t \cdot v_t \to 1$ (when $\sigma^2 = 1$).
@@ -814,35 +879,36 @@ So $t \cdot v_t \to 1$ (when $\sigma^2 = 1$).
 **(c)**
 
 ```{code-cell} ipython3
-sigma2_ex = 1.0
+σ2_ex = 1.0
 T_ex = 500
 v0_ex = 2.0
-mu0_ex = 0.0
 
 np.random.seed(7)
-r_ex = np.random.normal(0, np.sqrt(sigma2_ex), T_ex)
+r_ex = np.random.normal(0, np.sqrt(σ2_ex), T_ex)
 
 precisions = np.empty(T_ex)
 prec = 1.0 / v0_ex
 for t in range(T_ex):
-    prec += r_ex[t]**2 / sigma2_ex
+    prec += r_ex[t]**2 / σ2_ex
     precisions[t] = prec
 
 v_t_ex = 1.0 / precisions
 
 fig, axes = plt.subplots(1, 2, figsize=(12, 4))
 
-axes[0].plot(np.arange(1, T_ex + 1), v_t_ex, label='$v_t$')
+axes[0].plot(np.arange(1, T_ex + 1), v_t_ex, lw=2, label='$v_t$')
 axes[0].plot(np.arange(1, T_ex + 1), 1.0 / np.arange(1, T_ex + 1),
-             '--', label='$1/t$')
-axes[0].set_xlabel('$t$'); axes[0].set_ylabel('$v_t$')
+             '--', lw=2, label='$1/t$')
+axes[0].set_xlabel('$t$')
+axes[0].set_ylabel('$v_t$')
 axes[0].set_title('Posterior Variance Decay')
 axes[0].legend()
 
 axes[1].plot(np.arange(1, T_ex + 1),
-             np.arange(1, T_ex + 1) * v_t_ex, label='$t \\cdot v_t$')
-axes[1].axhline(1.0, color='red', ls='--', label='limit = 1')
-axes[1].set_xlabel('$t$'); axes[1].set_ylabel('$t \\cdot v_t$')
+             np.arange(1, T_ex + 1) * v_t_ex, lw=2, label='$t \\cdot v_t$')
+axes[1].axhline(1.0, color='red', ls='--', lw=2, label='limit = 1')
+axes[1].set_xlabel('$t$')
+axes[1].set_ylabel('$t \\cdot v_t$')
 axes[1].set_title('Normalized Variance Converges to 1')
 axes[1].legend()
 
@@ -873,7 +939,7 @@ Suppose agent $U$ starts with a prior mean $\mu_0$ far from the true value $b^*
 
 ```{code-cell} ipython3
 b_true_ex = 2.0
-sigma2_ex = 1.0
+σ2_ex = 1.0
 T_ex = 400
 N_ex = 100
 t_range_ex = np.arange(1, T_ex + 1)
@@ -882,27 +948,33 @@ t_range_ex = np.arange(1, T_ex + 1)
 fig, axes = plt.subplots(1, 2, figsize=(14, 5))
 
 ax = axes[0]
-for mu0 in [-3, 0, 1, 3, 5]:
-    mu_p, _ = simulate_bayesian_learning(
-        b_true_ex, sigma2_ex, mu0, v_0=1.0, T=T_ex, N=N_ex
+for μ0 in [-3, 0, 1, 3, 5]:
+    μ_p, _ = simulate_bayesian_learning(
+        b_true_ex, σ2_ex, μ0, v_0=1.0, T=T_ex, N=N_ex
     )
-    ax.plot(t_range_ex, np.mean(mu_p, axis=0), label=f'$\\mu_0 = {mu0}$')
+    ax.plot(t_range_ex, np.mean(μ_p, axis=0), lw=2,
+            label=f'$\\mu_0 = {μ0}$')
 
-ax.axhline(b_true_ex, color='black', ls='--', lw=1.5, label=f'$b^* = {b_true_ex}$')
-ax.set_xlabel('$t$'); ax.set_ylabel('$E[\\mu_t]$')
+ax.axhline(b_true_ex, color='black', ls='--', lw=2,
+           label=f'$b^* = {b_true_ex}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$E[\\mu_t]$')
 ax.set_title('Effect of Prior Mean on Convergence')
 ax.legend(fontsize=8)
 
 # (c): different prior variances
 ax = axes[1]
 for v0 in [0.1, 1.0, 10.0]:
-    mu_p, _ = simulate_bayesian_learning(
-        b_true_ex, sigma2_ex, mu_0=0.0, v_0=v0, T=T_ex, N=N_ex
+    μ_p, _ = simulate_bayesian_learning(
+        b_true_ex, σ2_ex, μ_0=0.0, v_0=v0, T=T_ex, N=N_ex
     )
-    ax.plot(t_range_ex, np.mean(mu_p, axis=0), label=f'$v_0 = {v0}$')
+    ax.plot(t_range_ex, np.mean(μ_p, axis=0), lw=2,
+            label=f'$v_0 = {v0}$')
 
-ax.axhline(b_true_ex, color='black', ls='--', lw=1.5, label=f'$b^* = {b_true_ex}$')
-ax.set_xlabel('$t$'); ax.set_ylabel('$E[\\mu_t]$')
+ax.axhline(b_true_ex, color='black', ls='--', lw=2,
+           label=f'$b^* = {b_true_ex}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$E[\\mu_t]$')
 ax.set_title('Effect of Prior Variance on Convergence')
 ax.legend()
 
@@ -952,52 +1024,52 @@ So $E[r_t^2] = \sigma^2 > 0$ and the strong law of large numbers guarantees $\su
 **(b) and (c)**
 
 ```{code-cell} ipython3
-def simulate_learning_mixture(b_true, sigma2, mu_0, v_0, T, N):
+def simulate_learning_mixture(b_true, σ2, μ_0, v_0, T, N):
     """
     Simulate Bayesian learning with mixture fundamentals:
-    r_t = 0 with prob 0.5, else N(0, 2*sigma2) with prob 0.5.
+    r_t = 0 with prob 0.5, else N(0, 2*σ2) with prob 0.5.
     """
     rng = np.random.default_rng(42)
 
-    mu_paths = np.empty((N, T))
-    v_paths  = np.empty((N, T))
+    μ_paths = np.empty((N, T))
+    v_paths = np.empty((N, T))
 
     for i in range(N):
-        prec  = 1.0 / v_0
-        w_sum = mu_0 / v_0
+        prec = 1.0 / v_0
+        w_sum = μ_0 / v_0
 
         for t in range(T):
             # Draw from mixture
             if rng.random() < 0.5:
                 r_t = 0.0
             else:
-                r_t = rng.normal(0, np.sqrt(2 * sigma2))
+                r_t = rng.normal(0, np.sqrt(2 * σ2))
 
             p_t = b_true * r_t
 
-            prec  += r_t**2 / sigma2
-            w_sum += r_t * p_t / sigma2
+            prec += r_t**2 / σ2
+            w_sum += r_t * p_t / σ2
 
-            v_t   = 1.0 / prec
-            mu_t  = v_t * w_sum
+            v_t = 1.0 / prec
+            μ_t = v_t * w_sum
 
-            mu_paths[i, t] = mu_t
-            v_paths[i, t]  = v_t
+            μ_paths[i, t] = μ_t
+            v_paths[i, t] = v_t
 
-    return mu_paths, v_paths
+    return μ_paths, v_paths
 
-sigma2_ex = 1.0
+σ2_ex = 1.0
 T_ex = 500
 N_ex = 50
 
 # Gaussian case
-mu_gauss, v_gauss = simulate_bayesian_learning(
-    b_true=2.0, sigma2=sigma2_ex, mu_0=0.5, v_0=2.0, T=T_ex, N=N_ex
+μ_gauss, v_gauss = simulate_bayesian_learning(
+    b_true=2.0, σ2=σ2_ex, μ_0=0.5, v_0=2.0, T=T_ex, N=N_ex
 )
 
 # Mixture case
-mu_mix, v_mix = simulate_learning_mixture(
-    b_true=2.0, sigma2=sigma2_ex, mu_0=0.5, v_0=2.0, T=T_ex, N=N_ex
+μ_mix, v_mix = simulate_learning_mixture(
+    b_true=2.0, σ2=σ2_ex, μ_0=0.5, v_0=2.0, T=T_ex, N=N_ex
 )
 
 t_range_ex = np.arange(1, T_ex + 1)
@@ -1005,21 +1077,23 @@ t_range_ex = np.arange(1, T_ex + 1)
 fig, axes = plt.subplots(1, 2, figsize=(14, 5))
 
 ax = axes[0]
-ax.plot(t_range_ex, np.mean(mu_gauss, axis=0), label='Gaussian $r_t$',
+ax.plot(t_range_ex, np.mean(μ_gauss, axis=0), label='Gaussian $r_t$',
         color='steelblue', lw=2)
-ax.plot(t_range_ex, np.mean(mu_mix,   axis=0), label='Mixture $r_t$',
+ax.plot(t_range_ex, np.mean(μ_mix, axis=0), label='Mixture $r_t$',
         color='darkorange', lw=2)
-ax.axhline(2.0, color='red', ls='--', lw=1.5, label='$b^* = 2$')
-ax.set_xlabel('$t$'); ax.set_ylabel('$E[\\mu_t]$')
+ax.axhline(2.0, color='red', ls='--', lw=2, label='$b^* = 2$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$E[\\mu_t]$')
 ax.set_title('Posterior Mean: Gaussian vs Mixture')
 ax.legend()
 
 ax = axes[1]
 ax.plot(t_range_ex, np.mean(v_gauss, axis=0), label='Gaussian $r_t$',
         color='steelblue', lw=2)
-ax.plot(t_range_ex, np.mean(v_mix,   axis=0), label='Mixture $r_t$',
+ax.plot(t_range_ex, np.mean(v_mix, axis=0), label='Mixture $r_t$',
         color='darkorange', lw=2)
-ax.set_xlabel('$t$'); ax.set_ylabel('$E[v_t]$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$E[v_t]$')
 ax.set_title('Posterior Variance: Gaussian vs Mixture')
 ax.legend()
 

From b8bb502560bd91c679bdd43a268ae665ca3fd7a4 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Thu, 28 May 2026 23:21:36 +1000
Subject: [PATCH 09/25] updates

---
 lectures/long_run_risk_operator.md | 317 +++++++++++++++++------------
 1 file changed, 185 insertions(+), 132 deletions(-)

diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index ef4e48c2c..4919aca0c 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -477,7 +477,7 @@ ideas, see {doc}`advanced:asset_pricing_lph`; for an estimation perspective
 on Euler-equation-based asset pricing, see {doc}`hansen_singleton_1982`.
 
 The key starting point is that a valuation functional $V$ must satisfy the
-no-arbitrage requirement that $VS$ is a martingale (Definition
+no-arbitrage requirement that $VS$ is a martingale (
 {prf:ref}`lrr-def-valuation-functional`).
 
 We parameterize the stochastic discount factor $S$ and valuation functional
@@ -485,22 +485,39 @@ $V$ as additive functionals with coefficients $(\beta^s,\gamma^s,\kappa^s)$
 and $(\beta^v,\gamma^v,\kappa^v)$ respectively, in the notation of
 {eq}`eq:additive-functional`.
 
-For a generic positive multiplicative functional with parameters
-$(\beta,\gamma,\kappa)$, applying Itô's formula and zeroing out the drift
-gives the **local martingale restriction**
+For a generic positive multiplicative functional $M = \exp(A)$ with
+parameters $(\beta,\gamma,\kappa)$, applying Itô's formula to
+$\exp(A_t)$ and requiring the drift of $dM_t/M_{t-}$ to vanish gives the
+**local martingale restriction**:
 
 $$
-    \beta
-    + \frac{\gamma^\top\gamma}{2}
-    + \int \left(\exp[\kappa(y,\cdot)]-1\right)\eta(dy \mid \cdot)
-    = 0.
+    \beta(x)
+    + \frac{\gamma(x)^\top\gamma(x)}{2}
+    + \int \big(\exp[\kappa(y,x)] - 1\big)\, \eta(dy \mid x)
+    = 0 .
 $$ (eq:local-martingale-restriction)
 
-(The drift coefficient $\beta$, the Itô correction $\gamma^\top \gamma/2$,
-and the jump compensator sum to zero.)
+The three terms correspond to:
+
+* the drift of $A$ itself,
+* the Itô correction from the Brownian part (because $M = e^A$ picks up a
+  quadratic-variation contribution), and
+* the compensated jumps of $M$ at the multiplier $\exp[\kappa(y,x)]$.
+
+The full derivation is the content of Exercise {ref}`lrr_ex_local_mg`.
 
-Applying this to $VS$ --- whose parameters add: $(\beta^v + \beta^s,
-\gamma^v + \gamma^s, \kappa^v + \kappa^s)$ --- gives the **local pricing
+```{note}
+An equivalent way to see {eq}`eq:local-martingale-restriction`: $M$ is a
+martingale iff $E[M_t \mid X_0 = x] = 1$ for all $t$, i.e. the constant
+function $\phi \equiv 1$ is an eigenfunction of the semigroup with
+eigenvalue $0$. Applying the generator formula
+{eq}`eq:extended-generator` (introduced below) to $\phi \equiv 1$ kills
+all the derivative terms and leaves exactly the left-hand side of
+{eq}`eq:local-martingale-restriction`.
+```
+
+Applying this to $VS$, whose parameters add: $(\beta^v + \beta^s,
+\gamma^v + \gamma^s, \kappa^v + \kappa^s)$, gives the **local pricing
 restriction**
 
 $$
@@ -1278,6 +1295,17 @@ print("\nlimit =", limit)
 The rescaled value converges to the same limiting vector regardless of the
 starting state --- exactly what {eq}`eq:long-run-limit` predicts.
 
+```{note}
+The *rate* of convergence is the **spectral gap** of $A$: the difference
+between $\rho$ (the dominant real eigenvalue) and the next-largest real
+part of the spectrum.
+
+This is the operator generalisation of the gap between the leading and
+sub-leading eigenvalues that controls mixing of a stationary Markov chain.
+Exercise {ref}`lrr_ex3` works through a three-state example where the gap
+can be read off directly.
+```
+
 ### Adding jumps
 
 State transitions in this chain are discontinuous, so it is natural to allow
@@ -1608,7 +1636,9 @@ preferences in a different setting is
 {doc}`survival_recursive_preferences`.
 
 The block below derives the SDF coefficients for the unit-elasticity
-recursive specification. You can skip on a first read and come back later
+recursive specification. 
+
+You can skip on a first read and come back later
 --- the numerical example uses the simpler Breeden parameters above.
 
 For the unit-elasticity recursive specification, conjecture a continuation
@@ -2369,123 +2399,6 @@ print(f"finite-difference slope = {finite_difference:.6f}")
 print(f"formula                 = {long_run_price_o:.6f}")
 ```
 
-### Limiting holding-period return
-
-The same machinery gives the limiting one-period holding-period return on a
-claim to far-future cash flows.
-
-This is the *gross return* on holding an asset for a single period when its
-cash flow lies far in the future.
-
-For $D_t=D_0\, G_t\, \psi(X_t)$ and $M=GS$, the principal eigenpair
-$(\rho,\phi)$ implies
-
-$$
-    \lim_{t\to\infty}
-    \frac{E[S_t D_t / S_1 \mid \mathcal F_1]}
-         {E[S_t D_t \mid \mathcal F_0]}
-    =
-    \exp(-\rho)\, G_1\, \frac{\phi(X_1)}{\phi(X_0)} .
-$$ (eq:limiting-holding-period-return)
-
-The limit has three factors:
-
-* a **cash-flow growth** component $G_1$,
-* a **discount** component $\exp(-\rho)$ governed by the principal
-  eigenvalue, and
-* a **state-dependent** component $\phi(X_1)/\phi(X_0)$ governed by the
-  eigenfunction.
-
-A striking feature: the transient payoff shape $\psi$ drops out of the
-limiting return, so the long-run holding-period return on *every* claim to
-a far-future cash flow looks the same up to the cash-flow growth factor.
-
-## Perron-Frobenius dominance
-
-In the finite-state chain, the long-run limit {eq}`eq:long-run-limit` is
-exactly Perron-Frobenius theory in action.
-
-The positive semigroup generated by $A$ in {eq}`eq:finite-a` has a unique
-dominant real eigenvalue, and contributions from the remaining eigenvalues
-decay at an exponential rate equal to the **spectral gap** --- the
-difference between $\rho$ and the next-largest real part.
-
-The rate at which $\exp(-\rho t)\mathbb M_t\psi$ converges to its long-run
-limit is exactly this spectral gap.
-
-```{note}
-In general state spaces, the same intuition holds but the argument is
-substantially more subtle: the martingale component $\hat M$ changes
-probability measure, and *stability* of the twisted process is what
-selects the eigenfunction governing the long-run approximation. The
-finite-state case is a window onto the general theory.
-```
-
-We illustrate the connection on a three-state chain: compute the spectral
-gap directly, then show convergence happens at that rate.
-
-```{code-cell} ipython3
-state_names = ["expansion", "normal", "contraction"]
-
-U3 = np.array([[-0.40,  0.30,  0.10],
-               [ 0.20, -0.50,  0.30],
-               [ 0.10,  0.20, -0.30]])
-
-r3 = np.array([0.06, 0.04, 0.01])
-κ3 = np.zeros((3, 3))
-
-A3 = build_generator(U3, r3, κ3)
-ρ3, φ3 = principal_eigenpair(A3)
-A3_hat = twisted_generator(A3, ρ3, φ3)
-ς3 = stationary_distribution(A3_hat)
-
-print(f"ρ = {ρ3:.6f}")
-print(f"φ = {φ3}")
-print(f"ς_hat = {ς3}")
-
-eigs3 = np.sort(eig(A3, right=False).real)[::-1]
-print("eigenvalues by real part:")
-print(np.round(eigs3, 6))
-```
-
-```{code-cell} ipython3
-ψ_list = {
-    "$\\psi=(1,0,0)$": np.array([1.0, 0.0, 0.0]),
-    "$\\psi=(0,1,0)$": np.array([0.0, 1.0, 0.0]),
-    "$\\psi=(1,2,3)$": np.array([1.0, 2.0, 3.0]),
-}
-
-t_grid = np.linspace(0, 35, 220)
-colors = ["C0", "C1", "C2"]
-
-fig, axes = plt.subplots(1, 3, figsize=(14, 4))
-
-for ax, (label, ψ) in zip(axes, ψ_list.items()):
-    limit = φ3 * np.sum((ψ / φ3) * ς3)
-    values = np.array([
-        np.exp(-ρ3 * t_val) * expm(t_val * A3) @ ψ
-        for t_val in t_grid
-    ])
-
-    for i, color in enumerate(colors):
-        ax.plot(t_grid, values[:, i], color=color, lw=1.5,
-                label=state_names[i])
-        ax.axhline(limit[i], color=color, ls="--", lw=1)
-
-    ax.set_title(label)
-    ax.set_xlabel("$t$")
-    ax.set_ylabel("$\\exp(-\\rho t)\\mathbb{M}_t\\psi$")
-
-axes[0].legend()
-plt.tight_layout()
-plt.show()
-```
-
-For each choice of $\psi$ and each initial state, the rescaled value
-$\exp(-\rho t)\mathbb M_t \psi$ converges to the dashed horizontal line --- the
-long-run limit $\phi \int (\psi/\phi)\, d\hat\varsigma$ --- at a rate
-controlled by the spectral gap.
-
 ## Assumptions behind the scenes
 
 The examples above make the eigenfunction calculation look mechanical.
@@ -2799,9 +2712,23 @@ long-run price converges to the local price.
 ```{exercise}
 :label: lrr_ex3
 
-Using the three-state example, let $\psi=(3,1,2)$.
+Consider a three-state chain with states {expansion, normal, contraction},
+intensity matrix
 
-a. Compute the theoretical limit
+$$
+U =
+\begin{pmatrix}
+    -0.40 &  0.30 &  0.10 \\
+     0.20 & -0.50 &  0.30 \\
+     0.10 &  0.20 & -0.30
+\end{pmatrix},
+$$
+
+decay-rate vector $r = (0.06, 0.04, 0.01)$, and no jumps in the
+multiplicative functional. Let $\psi=(3,1,2)$.
+
+a. Compute the principal eigenpair $(\rho,\phi)$ and twisted stationary
+distribution $\hat\varsigma$, and report the theoretical limit
 
 $$
     \phi \sum_i \frac{\psi_i}{\phi_i}\hat\varsigma_i .
@@ -2831,6 +2758,20 @@ second-largest real parts of the eigenvalues of $A$.
 Here is one solution:
 
 ```{code-cell} ipython3
+state_names = ["expansion", "normal", "contraction"]
+
+U3 = np.array([[-0.40,  0.30,  0.10],
+               [ 0.20, -0.50,  0.30],
+               [ 0.10,  0.20, -0.30]])
+
+r3 = np.array([0.06, 0.04, 0.01])
+κ3 = np.zeros((3, 3))
+
+A3 = build_generator(U3, r3, κ3)
+ρ3, φ3 = principal_eigenpair(A3)
+A3_hat = twisted_generator(A3, ρ3, φ3)
+ς3 = stationary_distribution(A3_hat)
+
 ψ = np.array([3.0, 1.0, 2.0])
 limit = φ3 * np.sum((ψ / φ3) * ς3)
 
@@ -2867,6 +2808,118 @@ above.
 ```{solution-end}
 ```
 
+```{exercise}
+:label: lrr_ex_local_mg
+
+Derive the local martingale restriction
+{eq}`eq:local-martingale-restriction` from Itô's formula.
+
+Let $M = \exp(A)$ for the additive functional $A$ in
+{eq}`eq:additive-functional`, with parameters $(\beta,\gamma,\kappa)$.
+
+a. Decompose $A_t = A_t^c + A_t^j$ into its continuous and pure-jump parts
+and write down $dA_t^c$ and the jump magnitudes $\Delta A_t$.
+
+b. Apply Itô's formula for semimartingales to $f(a) = e^a$ to show that
+
+$$
+    dM_t
+    =
+    M_{t-}\, dA_t^c
+    + \tfrac{1}{2}\, M_{t-}\, d\langle A^c, A^c\rangle_t
+    + M_{t-}\big(\exp[\Delta A_t] - 1\big)\quad\text{at jumps}.
+$$
+
+c. Use $d\langle A^c, A^c\rangle_t = \gamma^\top\gamma\, dt$ and rewrite the
+jump term as an integral against the random counting measure $\zeta$.
+
+d. Split $\zeta$ into its compensator $\eta(dy \mid X_{t-})\, dt$ and the
+compensated martingale measure
+$\tilde\zeta = \zeta - \eta(dy\mid X_{t-})\, dt$.
+
+e. Collect drift (predictable) and martingale terms and conclude that $M$ is
+a local martingale iff the drift vanishes at every state, which gives
+{eq}`eq:local-martingale-restriction`.
+```
+
+```{solution-start} lrr_ex_local_mg
+:class: dropdown
+```
+
+Here is one solution.
+
+*a.* From the parameterization {eq}`eq:additive-functional`,
+
+$$
+    dA_t^c = \beta(X_t)\, dt + \gamma(X_{t-})^\top\, dB_t,
+    \qquad
+    \Delta A_t = \kappa(X_t, X_{t-}) \text{ at a jump time}.
+$$
+
+*b.* For $f(a) = e^a$ we have $f'(a) = f''(a) = e^a$, so $f'(A_{t-}) =
+f''(A_{t-}) = M_{t-}$. Itô's formula for a semimartingale gives
+
+$$
+    dM_t
+    =
+    f'(A_{t-})\, dA_t^c
+    + \tfrac{1}{2}\, f''(A_{t-})\, d\langle A^c, A^c\rangle_t
+    + \big[f(A_t) - f(A_{t-})\big] .
+$$
+
+Since $A_t = A_{t-} + \Delta A_t$ at a jump, $f(A_t) - f(A_{t-}) =
+M_{t-}\big(\exp[\Delta A_t] - 1\big)$, which is the stated expression.
+
+*c.* Substituting $d\langle A^c, A^c\rangle_t = \gamma(X_{t-})^\top
+\gamma(X_{t-})\, dt$ and rewriting the jump contribution as an integral
+against the random counting measure $\zeta$ of $(X, A)$ gives
+
+$$
+\begin{aligned}
+\frac{dM_t}{M_{t-}}
+&=
+\beta(X_t)\, dt
++ \gamma(X_{t-})^\top dB_t
++ \tfrac{1}{2}\,\gamma(X_{t-})^\top\gamma(X_{t-})\, dt
+\\
+&\quad
++ \int \big(\exp[\kappa(y, X_{t-})] - 1\big)\, \zeta(dy, dt) .
+\end{aligned}
+$$
+
+*d.* Writing $\zeta = \tilde\zeta + \eta(dy\mid X_{t-})\, dt$ separates the
+jump integral into a martingale and a predictable drift contribution:
+
+$$
+\int \big(\exp[\kappa(y,X_{t-})] - 1\big)\zeta(dy,dt)
+=
+\int\big(\exp[\kappa(y,X_{t-})] - 1\big)\tilde\zeta(dy,dt)
++ \int\big(\exp[\kappa(y,X_{t-})] - 1\big)\eta(dy\mid X_{t-})\, dt .
+$$
+
+*e.* Collecting drift and martingale terms,
+
+$$
+\begin{aligned}
+\frac{dM_t}{M_{t-}}
+&=
+\bigg[\beta(X_t)
++ \tfrac{1}{2}\gamma^\top\gamma
++ \int\big(\exp[\kappa(y,X_{t-})] - 1\big)\,\eta(dy\mid X_{t-})\bigg] dt
+\\
+&\quad
++ \gamma^\top dB_t
++ \int\big(\exp[\kappa(y,X_{t-})] - 1\big)\,\tilde\zeta(dy,dt) .
+\end{aligned}
+$$
+
+The Brownian and compensated-jump terms are local martingales, so $M$ is a
+local martingale iff the bracketed drift vanishes for every state $x$,
+giving {eq}`eq:local-martingale-restriction`.
+
+```{solution-end}
+```
+
 ```{exercise}
 :label: lrr_ex4
 

From 01021f7c80c7a21e8ec06364a837b9e7d7641dc9 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Fri, 29 May 2026 15:25:49 +1000
Subject: [PATCH 10/25] updates

---
 lectures/long_run_risk_operator.md | 629 ++++++++++++++++++-----------
 1 file changed, 391 insertions(+), 238 deletions(-)

diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index 4919aca0c..cb0c34546 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -31,7 +31,7 @@ kernelspec:
 How should we value a cash flow that pays off thirty years from now?
 
 Standard short-horizon asset pricing tells us how investors are compensated
-for tiny, instantaneous exposures to shocks --- the *short end* of the term
+for tiny, instantaneous exposures to shocks, the *short end* of the term
 structure of risk prices.
 
 But many of the most interesting asset-pricing questions, the equity
@@ -104,7 +104,7 @@ The plan is:
 1. Set up positive multiplicative functionals $M$ (discount factors, returns,
    stochastic growth) and the valuation semigroups they generate.
 
-2. Introduce the **generator** of a semigroup --- the local operator whose
+2. Introduce the **generator** of a semigroup, the local operator whose
    eigenvalue problem controls long-run behaviour.
 
 3. Find the principal eigenfunction $\phi$ and derive the factorization.
@@ -164,7 +164,7 @@ $$
 $$
 
 where $\zeta$ is the random counting measure of jumps and
-$\eta(dy \mid X_{t-})\, dt$ is its compensator --- the rate at which $X$
+$\eta(dy \mid X_{t-})\, dt$ is its compensator, the rate at which $X$
 jumps from $X_{t-}$ to a region $dy$.
 
 We also impose two simplifying assumptions:
@@ -173,7 +173,7 @@ We also impose two simplifying assumptions:
   occur on any bounded interval, which keeps integrals against the jump
   measure well-defined and finite.
 * **Sufficient rank in $\Gamma$** so that the Brownian shocks relevant for
-  pricing can be recovered from the state history --- this is what makes the
+  pricing can be recovered from the state history, which is what makes the
   Markov state $X$ "rich enough" to be a sufficient statistic for valuation.
 
 They let us write the generator
@@ -194,7 +194,7 @@ constructed from the history of $X$, so that $M_t$ is
 $\mathcal F_t$-measurable for each $t$.
 ```
 
-We will always work with the **càdlàg** version of a functional --- the
+We will always work with the **càdlàg** version of a functional, the
 French acronym for "right-continuous with left limits".
 
 Concretely, for almost every sample path $\omega$,
@@ -257,7 +257,7 @@ $$
 
 For the price to depend only on the current Markov state $X_\tau$ (and not on
 the entire history up to $\tau$), the ratio $S_t/S_\tau$ must be a function
-only of the Markov path *after* $\tau$ --- that is,
+only of the Markov path *after* $\tau$. That is,
 $S_{\tau+u}/S_\tau = S_u(\theta_\tau)$, which is exactly
 {eq}`eq:multiplicative`.
 
@@ -338,7 +338,7 @@ You should read $\mathbb M_t \psi(x)$ as "the date-$0$ value, starting from
 state $x$, of a date-$t$ payoff $\psi(X_t)$", weighted by $M_t$.
 
 The family of operators $\{\mathbb M_t\}_{t \geq 0}$ has a key compositional
-structure --- the *semigroup property*.
+structure, the *semigroup property*.
 
 ```{prf:definition} One-Parameter Semigroup
 :label: lrr-def-one-parameter-semigroup
@@ -441,7 +441,9 @@ lecture: it tells us how current prices value cash-flow growth risk that
 materializes far in the future.
 
 ```{note}
-The split $D_t=D_0 G_t \psi(X_t)$ is not unique --- for any positive function
+The factorization $D_t=D_0 G_t \psi(X_t)$ is not unique.
+
+For any positive function
 $\varphi$,
 
 $$
@@ -473,12 +475,13 @@ That is the standard instantaneous risk-return relation.
 This will give us a benchmark to compare long-run risk prices against later.
 
 For a textbook discrete-time treatment of the same SDF-based asset-pricing
-ideas, see {doc}`advanced:asset_pricing_lph`; for an estimation perspective
+ideas, see {doc}`advanced:asset_pricing_lph`. 
+
+For an estimation perspective
 on Euler-equation-based asset pricing, see {doc}`hansen_singleton_1982`.
 
 The key starting point is that a valuation functional $V$ must satisfy the
-no-arbitrage requirement that $VS$ is a martingale (
-{prf:ref}`lrr-def-valuation-functional`).
+no-arbitrage requirement that $VS$ is a martingale ({prf:ref}`lrr-def-valuation-functional`).
 
 We parameterize the stochastic discount factor $S$ and valuation functional
 $V$ as additive functionals with coefficients $(\beta^s,\gamma^s,\kappa^s)$
@@ -504,17 +507,7 @@ The three terms correspond to:
   quadratic-variation contribution), and
 * the compensated jumps of $M$ at the multiplier $\exp[\kappa(y,x)]$.
 
-The full derivation is the content of Exercise {ref}`lrr_ex_local_mg`.
-
-```{note}
-An equivalent way to see {eq}`eq:local-martingale-restriction`: $M$ is a
-martingale iff $E[M_t \mid X_0 = x] = 1$ for all $t$, i.e. the constant
-function $\psi \equiv 1$ is an eigenfunction of the semigroup with
-eigenvalue $0$. Applying the generator formula
-{eq}`eq:extended-generator` (introduced below) to $\phi \equiv 1$ kills
-all the derivative terms and leaves exactly the left-hand side of
-{eq}`eq:local-martingale-restriction`.
-```
+We ask you to verify this in {ref}`lrr_ex_local_mg`.
 
 Applying this to $VS$, whose parameters add: $(\beta^v + \beta^s,
 \gamma^v + \gamma^s, \kappa^v + \kappa^s)$, gives the **local pricing
@@ -575,15 +568,16 @@ the same exposure units as $\gamma^v(x)$.
 
 Jump risk is priced through the function $\kappa^s$.
 
-This local relation is one end of the term structure of risk prices; the
-eigenvalue calculations below describe the other end.
+This local relation is one end of the term structure of risk prices. 
+
+The eigenvalue calculations below describe the other end.
 
 ## The generator
 
 So far we have a family of operators $\{\mathbb M_t\}_{t \geq 0}$, one for each
 horizon $t$.
 
-That is more information than we can analyze directly --- and what we really
+That is more information than we can analyze directly, and what we really
 want is the behaviour of $\mathbb M_t \psi$ as $t \to \infty$.
 
 The **generator** $\mathbb A$ compresses the entire semigroup into one
@@ -609,7 +603,9 @@ $$
     E\left[M_1 \psi(X_1) \mid X_0=x\right].
 $$
 
-Iterating gives the $n$-period operator $K^n$ --- exactly the logic by which a
+Iterating gives the $n$-period operator $K^n$.
+
+This is exactly the logic by which a
 transition matrix $P$ produces $n$-step probabilities through $P^n$, except
 that $K$ also carries the payoff weight $M_1$.
 
@@ -647,65 +643,116 @@ $$
     - \sum_{j=0}^{n-1} M_j (K\psi - \psi)(X_j) .
 $$
 
-Two roles, one operator: $K-I$ is the *local* rate of change of
+Here $K-I$ is the *local* rate of change of
 $M_n \psi(X_n)$, and through $K^n$ it also controls long-run growth.
 
 ### From discrete to continuous time
 
 Continuous time keeps the same logic.
 
-The natural replacement for $K-I$ is the derivative of the semigroup at zero:
+The natural replacement for $K-I$ is the **infinitesimal generator** of the
+semigroup $\{\mathbb M_t\}$, the time derivative at zero:
+
+$$
+    \mathbb A \psi(x)
+    :=
+    \lim_{h \downarrow 0}
+    \frac{\mathbb M_h \psi(x) - \psi(x)}{h},
+$$
+
+so that for small $h>0$,
 
 $$
     \mathbb M_h \psi(x)
     \approx
-    \psi(x) + h \mathbb A \psi(x)
-    \quad \text{for small } h > 0.
+    \psi(x) + h\, \mathbb A \psi(x) .
 $$
 
-The operator $\mathbb A$ is local in time and conditional on the current state,
-with the jump term integrating over possible post-jump states rather than over
-a realized path on $[0,t]$.
+```{note}
+When $M \equiv 1$, the
+multiplicative semigroup reduces to the standard Markov transition semigroup
+$\mathbb M_t \psi(x) = E[\psi(X_t) \mid X_0=x]$, and $\mathbb A$ becomes the
+familiar infinitesimal generator $\mathcal L$ of $X$ from textbook stochastic
+calculus.
+
+For general $M$, $\mathbb A$ is the same kind of object but it carries extra terms that
+encode the instantaneous "yield" of $M$: discount rates, Brownian risk
+prices, jump multipliers.
+
+That is exactly what we will see when we write down its closed form in
+{eq}`eq:extended-generator` below.
+```
+
+Why is this the right object? 
 
-If $\mathbb A\phi = \rho \phi$, then
+Because if $\mathbb A\phi = \rho \phi$, then
+iterating gives
 
 $$
-    \mathbb M_t \phi = \exp(\rho t)\phi ,
+    \mathbb M_t \phi = \exp(\rho t)\, \phi ,
 $$
 
 the continuous-time analogue of $K^n \phi = \lambda^n \phi$.
 
-So the long-run behaviour of $\mathbb M_t$ is encoded in an eigenvalue problem
-for the local operator $\mathbb A$.
+So the long-run behaviour of $\mathbb M_t$ is encoded in an eigenvalue
+problem for the local operator $\mathbb A$.
 
 ### Extended generator
 
-For the Markov processes we use, the derivative form above is heuristic --- it
-may not be well-defined for every $\psi$ of interest.
+There is a catch with the limit definition above.
+
+To make the limit $h \downarrow 0$ rigorous, the textbook definition
+requires $(\mathbb M_h\psi - \psi)/h$ to converge to $\mathbb A\psi$ in a
+chosen norm, typically uniform convergence over bounded continuous
+functions, where the semigroup is a contraction.
+
+But the functions we care about most are the principal eigenfunctions
+$\phi$ solving $\mathbb A\phi = \rho\phi$, and these typically *grow* with
+the state $X$ (in the affine-Gaussian benchmark, $\phi$ is
+exponential-affine in $x$). 
+
+So they do not lie in this space, and the
+limit need not converge for them.
+
+{cite:t}`HansenScheinkman2009` sidestep this by characterizing $\mathbb A$
+through a *Doob-Meyer style* semimartingale decomposition of
+$M_t \psi(X_t)$, a pathwise condition that does not require any norm.
+
+This is the continuous-time analogue of writing $K - I$ as the predictable
+rate of change of $M_n\psi(X_n)$ in discrete time.
+
+The resulting **extended generator** admits unbounded $\psi$, has a
+strictly larger domain than the textbook generator, and agrees with it
+where both are defined.
 
-We instead define $\mathbb A$ through the *martingale decomposition*, which
-mirrors the discrete-time identity in which $K-I$ is the predictable rate of
-change of $M_n\psi(X_n)$.
+Concretely:
 
 ```{prf:definition} Extended Generator
 :label: lrr-def-extended-generator
 
-A Borel function $\psi$ belongs to the domain of the **extended generator**
-$\mathbb A$ of the multiplicative functional $M$ if there is a Borel function
-$\chi$ such that
+Fix a Borel function $\psi$, and look for a second Borel function $\chi$ that
+will play the role of "the instantaneous rate of change of $M_t \psi(X_t)$
+at the current state". Precisely, we ask whether there exists $\chi$ such
+that
 
 $$
     N_t
     =
     M_t \psi(X_t)
     - \psi(X_0)
-    - \int_0^t M_s \chi(X_s) ds
+    - \int_0^t M_s \chi(X_s)\, ds
 $$
 
 is a local martingale.
 
-In this case, the extended generator assigns $\chi$ to $\psi$, and we write
-$\mathbb A \psi = \chi$.
+We then say $\psi$
+lies in the domain of the **extended generator** $\mathbb A$ of $M$, and we
+*define* the operator by
+
+$$
+\mathbb A \psi := \chi.
+$$
+
 ```
 
 The three terms play the same roles as in discrete time:
@@ -726,7 +773,7 @@ state.
   = \lim_{t \downarrow 0} t^{-1}\bigl[E\psi(X_t) - \psi(x)\bigr]$.
 
 * When $X$ is a jump diffusion, Itô's formula applied to $M_t\psi(X_t)$
-  produces the closed-form expression for $\mathbb A\psi$ below.
+  produces the closed-form expression for $\mathbb A\psi$ in {eq}`eq:extended-generator` below.
 
 ### A closed form for jump diffusions
 
@@ -776,22 +823,22 @@ $$ (eq:extended-generator)
 
 The four terms have transparent interpretations:
 
-1. The first term is the standard Markov drift, modified by $\Gamma\gamma$
-   --- a *covariance correction* between the Brownian shocks driving $X$ and
+1. The first term is the standard Markov drift, modified by $\Gamma\gamma$,
+   a *covariance correction* between the Brownian shocks driving $X$ and
    those driving $M$.
 2. The second is the standard diffusion (Itô) term.
 3. The third integrates $\phi$ against the jump-compensated transition rates,
    reweighted by the jump multiplier $\exp[\kappa(y,x)]$.
-4. The fourth is a multiplicative *yield-like* term --- it multiplies $\phi(x)$
+4. The fourth is a multiplicative *yield-like* term that multiplies $\phi(x)$
    itself and combines the drift of $M$, the Brownian Itô correction, and the
    compensated jumps.
 
 ```{note}
 When $M=S$ is a stochastic discount factor, the term multiplying $\phi(x)$
-in the fourth line encodes local prices of Brownian and jump risk --- the
+in the fourth line encodes local prices of Brownian and jump risk, the
 short-end of the term structure we will revisit later.
 
-Derivation of {eq}`eq:extended-generator` is the content of Exercise
+We ask readers to verify the derivation of {eq}`eq:extended-generator` in
 {ref}`lrr_ex4`.
 ```
 
@@ -828,92 +875,100 @@ denominators throughout: it has to be safe to divide by it.
 Why does an eigenfunction of $\mathbb A$ give us the multiplicative
 factorization {eq}`eq:hs-factorization`?
 
-The cleanest way to see it is again through the discrete-time analogy.
-
-If $K\phi = \lambda\phi$ in discrete time, then
+The discrete-time analogy points the way. If $K\phi = \lambda\phi$, then
 
 $$
     \lambda^{-n}\, M_n\, \frac{\phi(X_n)}{\phi(X_0)}
 $$
 
-is a martingale --- the eigenvalue equation exactly cancels the one-step
-drift of $M_n\phi(X_n)$ once we divide by $\lambda^n$.
+is a martingale: the eigenvalue equation absorbs the one-step drift of
+$M_n\phi(X_n)$ exactly.
 
-In continuous time, $\lambda^n$ is replaced by $\exp(\rho t)$, and our
-candidate martingale becomes
+In continuous time, $\lambda^n$ becomes $\exp(\rho t)$, and the candidate
+martingale is
 
 $$
     \hat M_t
     =
-    \exp(-\rho t)\, M_t\,
-    \frac{\phi(X_t)}{\phi(X_0)} .
+    \exp(-\rho t)\, M_t\, \frac{\phi(X_t)}{\phi(X_0)} .
 $$ (eq:mhat)
 
-The eigenfunction equation $\mathbb A\phi = \rho\phi$ is exactly what we
-need to make this candidate work.
-
-To verify, apply the definition of the extended generator to $M_t\phi(X_t)$:
-
-$$
-    M_t \phi(X_t) - \phi(X_0) - \int_0^t M_s\, \mathbb A\phi(X_s)\, ds
-$$
-
-is a local martingale.
+Let's check that $\hat M$ is indeed a local martingale.
 
-Substituting $\mathbb A\phi = \rho \phi$ reduces this to
+By the definition of $\mathbb A$, the semimartingale
+$Z_t := M_t\phi(X_t)$ admits the decomposition
 
 $$
-    M_t \phi(X_t) - \phi(X_0) - \rho \int_0^t M_s \phi(X_s)\, ds,
+    dZ_t = M_t\, \mathbb A\phi(X_t)\, dt + dN_t ,
 $$
 
-so the predictable drift of $M_t\phi(X_t)$ is $\rho M_t \phi(X_t)\, dt$.
+where $N$ is a local martingale. 
 
-For $Z_t := M_t \phi(X_t)$, integration by parts gives
+The eigenvalue equation
+$\mathbb A\phi = \rho\phi$ replaces the drift by $\rho Z_t\, dt$, and
+integration by parts gives
 
 $$
     d\bigl(\exp(-\rho t) Z_t\bigr)
-    = -\rho \exp(-\rho t) Z_t\, dt + \exp(-\rho t)\, dZ_t ,
+    = \exp(-\rho t)\,\bigl[dZ_t - \rho Z_t\, dt\bigr]
+    = \exp(-\rho t)\, dN_t .
 $$
 
-and the drift term in $dZ_t$ is exactly $\rho Z_t\, dt$, so the two drift
-contributions cancel.
-
-Hence $\hat M_t$ has zero drift and is a local martingale.
+So $\exp(-\rho t) Z_t$ is a local martingale, and dividing by $\phi(X_0)$
+shows the same for $\hat M$.
 
-Rearranging {eq}`eq:mhat` for $M_t$ gives the factorization
-{eq}`eq:hs-factorization` from the overview, with $\hat M$ playing the role
-of the promised martingale component.
+Rearranging {eq}`eq:mhat` for $M_t$ recovers the factorization
+{eq}`eq:hs-factorization`, with $\hat M$ in the role of the martingale
+component.
 
 ```{prf:definition} Martingale Component and Twisted Measure
 :label: lrr-def-martingale-component
 
-When $\hat M$ in {eq}`eq:mhat` is a martingale, it is the **martingale
-component** associated with $(\rho,\phi)$ and defines the **twisted probability
-measure** by weighting date-$t$ events with $\hat M_t$.
-
-For $F \in \mathcal F_t$, the twisted conditional probability is
+When $\hat M$ is a martingale (not merely a local martingale), it is the
+**martingale component** associated with $(\rho,\phi)$ and defines the
+**twisted probability measure** by weighting date-$t$ events with
+$\hat M_t$:
 
 $$
     \widehat{\Pr}(F \mid X_0=x)
     =
-    E[\hat M_t 1_F \mid X_0=x].
+    E[\hat M_t \mathbf 1_F \mid X_0=x],
+    \qquad F \in \mathcal F_t.
 $$
 ```
 
-The candidate $\hat M$ is always a nonnegative local martingale, hence a
-supermartingale.
-Therefore
+The verification only showed that $\hat M$ is a *local* martingale, but
+the definition above and the change-of-measure interpretation of
+$\hat M$ both require it to be a martingale.
+
+We close this gap by adopting Assumption 6.1 of
+{cite:t}`HansenScheinkman2009`:
+
+> The local martingale $\hat M$ defined in {eq}`eq:mhat` is a martingale,
+> i.e. $E[\hat M_t \mid X_0 = x] = 1$ for every $t \geq 0$ and $x$.
+
+We carry this assumption from here on.
+
+Even without closing the gap we still get one-sided control.
+
+Since $\hat M$ is nonnegative with $\hat M_0 = 1$, it is a supermartingale,
+so $E[\hat M_t \mid X_0=x] \leq 1$.
+
+Taking expectations in {eq}`eq:mhat`,
 
 $$
-    \mathbb M_t \phi \leq \exp(\rho t)\phi.
+    \mathbb M_t \phi(x) \leq \exp(\rho t)\, \phi(x) ,
 $$
 
-When $\hat M$ is a true martingale, taking expectations on both sides of
-{eq}`eq:mhat` and using $E\hat M_t = 1$ gives the semigroup eigenvalue
-equation
+so $\rho$ is at least an upper bound on the long-run growth rate of
+$\mathbb M_t \phi$.
+
+When $\hat M$ is in fact a martingale, $E\hat M_t = 1$, the inequality
+becomes equality, and the local condition $\mathbb A\phi = \rho\phi$ lifts
+to the semigroup eigenvalue equation
 
 $$
-    \mathbb M_t \phi = \exp(\rho t)\phi,
+    \mathbb M_t \phi = \exp(\rho t)\, \phi,
     \qquad t \geq 0.
 $$ (eq:semigroup-eigen)
 
@@ -923,19 +978,19 @@ We now have a factorization {eq}`eq:hs-factorization` for *any* principal
 eigenfunction.
 
 But for $(\rho,\phi)$ to actually describe **long-run** behaviour of
-$\mathbb M_t$ --- not just produce a valid algebraic identity --- the twisted
+$\mathbb M_t$, the twisted
 process must settle into a stationary regime as $t \to \infty$.
 
-If it doesn't, the transient factor $\phi(X_0)/\phi(X_t)$ will not wash out
+If it doesn't, the transient factor $\phi(X_0)/\phi(X_t)$ will not vanish,
 and we cannot read off the asymptotics from $\rho$ alone.
 
-We need three conditions, each ruling out a specific failure mode.
+We need three conditions, each ruling out a specific way in which the twisted process can fail to settle down.
 
 Let $\hat E$ and $\widehat{\Pr}$ denote expectation and probability under the
 twisted measure, and let $\hat{\mathbb A}$ be the generator of $X$ under
 that measure.
 
-**Condition 1: a stationary distribution exists.**
+*Condition 1: a stationary distribution exists.*
 
 ```{prf:definition} Stationary Distribution of the Twisted Process
 :label: lrr-def-stationary-distribution
@@ -950,11 +1005,13 @@ $$
 for every $\psi$ in the $L^\infty$ domain of $\hat{\mathbb A}$.
 ```
 
-*Why we need it:* $\hat\varsigma$ is the candidate long-run distribution. If
+$\hat\varsigma$ is the candidate long-run distribution.
+
+If
 it doesn't exist, the twisted process has no steady state for $X_t$ to settle
 into, and the long-run limit cannot be expressed as a state-space integral.
 
-**Condition 2: every important region is reachable.**
+*Condition 2: every important region is reachable.*
 
 ```{prf:definition} Irreducible Skeleton
 :label: lrr-def-irreducible-skeleton
@@ -973,12 +1030,11 @@ $$
 $$
 ```
 
-*Why we need it:* Without it, the long-run distribution could depend on the
-starting state --- different basins of attraction would give different
-limits. The discrete sampling (with spacing $\Delta$) avoids period-2-style
-pathologies that can arise in continuous time.
+Without irreducibility, the long-run distribution could depend on the
+starting state; different basins of attraction would give different
+limits. 
 
-**Condition 3: every important region is visited infinitely often.**
+*Condition 3: every important region is visited infinitely often.*
 
 ```{prf:definition} Harris Recurrence
 :label: lrr-def-harris-recurrence
@@ -995,9 +1051,11 @@ $$
 $$
 ```
 
-*Why we need it:* Reachability (Condition 2) is not enough --- a region
+Reachability (Condition 2) is not enough. A region
 might be reachable but visited only with small probability, so time averages
-fail to converge to $\hat\varsigma$-averages. Harris recurrence is the
+fail to converge to $\hat\varsigma$-averages. 
+
+Harris recurrence is the
 continuous-state replacement for "recurrent state" in a finite chain.
 
 Bundling these together:
@@ -1027,9 +1085,9 @@ $$ (eq:long-run-limit)
 Read this as follows:
 
 * The factor $\exp(\rho t)$ captures the exponential growth or decay of the
-  semigroup. After we strip it off, what remains has a finite limit.
-* The state dependence in that limit is *entirely* in $\phi(x)$ --- this is
-  the sense in which $\phi$ is the long-run shape of the state dependence.
+  semigroup. 
+  - After we divide it out, what remains has a finite limit.
+* The state dependence in that limit is *entirely* captured by $\phi(x)$.
 * The scalar $\int (\psi/\phi)\, d\hat\varsigma$ is the **long-run intensity**
   of the payoff $\psi$, weighted by $1/\phi$ and averaged against the
   twisted stationary distribution.
@@ -1043,14 +1101,11 @@ The mode of convergence depends on how nice $\psi$ is:
   bounded.
 
 ```{note}
-Strict positivity of $\phi$ is also why uniqueness can fail in general state
-spaces: there can be more than one positive eigenfunction yielding a true
+As we noted before, there can be more than one positive eigenfunction yielding a true
 martingale $\hat M$.
 
-What stochastic stability buys is *selection*: among all candidate
-eigenpairs, the principal eigenfunction selected by stochastic stability is
-the one whose eigenvalue is smallest, and any other positive eigenfunction
-with that eigenvalue is proportional to $\phi$, $\hat\varsigma$-a.s.
+Stochastic stability selects the one that matters for long-run behaviour, and rules out any other positive eigenfunction with the same eigenvalue that fails to be
+proportional to $\phi$, $\hat\varsigma$-a.s.
 
 This is the analogue of "the Perron-Frobenius eigenvector is unique up to
 scaling" in finite dimensions.
@@ -1058,11 +1113,15 @@ scaling" in finite dimensions.
 
 ## A finite-state Markov chain
 
-To see the whole framework in action, we start with the simplest possible
+Now we are all set to apply the framework to a concrete example!
+
+We start with the simplest possible
 case: a finite-state Markov chain.
 
 For background on finite Markov chains in discrete time, see
-{doc}`finite_markov`; for the asset-pricing applications of finite-state
+{doc}`finite_markov`. 
+
+For the asset-pricing applications of finite-state
 chains that motivate the construction here, see {doc}`markov_asset`.
 
 Here, every abstract object collapses to a familiar one:
@@ -1076,8 +1135,10 @@ Here, every abstract object collapses to a familiar one:
 | Principal eigenvalue $\rho$ | dominant real eigenvalue of $A$ |
 | Stationary distribution $\hat\varsigma$ | left eigenvector of twisted generator |
 
-So the long-run analysis is exactly Perron-Frobenius theory --- nothing more,
-nothing less.
+So the long-run analysis reduces to Perron-Frobenius theory.
+
+(For an introduction to the Perron-Frobenius theorem, see
+{doc}`intro:eigen_II`.)
 
 ### Setup
 
@@ -1125,7 +1186,7 @@ The semigroup is then just $\mathbb M_t = \exp(tA)$.
 For an irreducible chain with strictly positive jump multipliers, the
 principal eigenvalue $\rho$ is the unique real eigenvalue of $A$ with
 largest real part, and the associated right eigenvector is strictly
-positive --- this is the Perron-Frobenius theorem.
+positive.
 
 The twisted generator under the principal eigenpair $(\rho,\phi)$ is
 
@@ -1133,8 +1194,12 @@ $$
     \hat A = D_\phi^{-1} A D_\phi - \rho I,
 $$
 
-where $D_\phi = \operatorname{diag}(\phi)$. The row sums of $\hat A$ vanish,
-so $\hat A$ is itself a valid intensity matrix; the stationary distribution
+where $D_\phi = \operatorname{diag}(\phi)$. 
+
+The row sums of $\hat A$ vanish,
+so $\hat A$ is itself a valid intensity matrix. 
+
+The stationary distribution
 $\hat\varsigma$ solves $\hat\varsigma^\top \hat A = 0$.
 
 The helper functions below implement these three calculations.
@@ -1169,7 +1234,8 @@ def principal_eigenpair(A):
         if np.min(φ) > -1e-10:
             φ = np.maximum(φ, 0)
         else:
-            raise ValueError("Dominant eigenvector is not strictly positive.")
+            raise ValueError(
+                "Dominant eigenvector is not strictly positive.")
 
     φ = φ / φ.mean()
     return ρ, φ
@@ -1243,8 +1309,8 @@ for t in [1.0, 5.0, 25.0]:
     print(f"t = {t:4.1f}, error = {err:.2e}")
 ```
 
-The error decays towards zero --- the equation holds to machine precision
-(small errors are floating-point noise from the eigendecomposition).
+The error is at machine precision for every $t$, so the equation holds
+exactly up to floating-point noise from the eigendecomposition.
 
 Next we compute the twisted generator $\hat A$ and the stationary
 distribution $\hat\varsigma$ of the chain under the twisted measure.
@@ -1264,10 +1330,10 @@ print(f"  boom      {ς_hat[0]:.4f}")
 print(f"  recession {ς_hat[1]:.4f}")
 ```
 
-This twisted stationary distribution --- not the original chain's
-stationary distribution --- is what determines long-horizon valuations.
+This twisted stationary distribution (not the original chain's stationary
+distribution) is what determines long-horizon valuations.
 
-It differs from the original distribution because the eigenfunction $\phi$
+It differs from the stationary distribution of the original chain because the eigenfunction $\phi$
 reweights states by how persistently they affect the multiplicative
 functional.
 
@@ -1292,29 +1358,54 @@ for t in [1, 5, 20, 80]:
 print("\nlimit =", limit)
 ```
 
-The rescaled value converges to the same limiting vector regardless of the
-starting state --- exactly what {eq}`eq:long-run-limit` predicts.
+The numerical values converge to the limit; the next plot shows the same
+convergence pictorially.
+
+```{code-cell} ipython3
+t_grid = np.linspace(0.01, 80, 400)
+rescaled = np.array([np.exp(-ρ * t) * expm(t * A) @ ψ for t in t_grid])
+
+fig, ax = plt.subplots()
+ax.plot(t_grid, rescaled[:, 0], lw=2, label="starting from boom")
+ax.plot(t_grid, rescaled[:, 1], lw=2, label="starting from recession")
+ax.axhline(limit[0], color="C0", ls="--", lw=1, alpha=0.7)
+ax.axhline(limit[1], color="C1", ls="--", lw=1, alpha=0.7)
+ax.set_xlabel("$t$")
+ax.set_ylabel(r"$e^{-\rho t}\,(\mathbb{M}_t \psi)(x)$")
+ax.set_title("Convergence of the rescaled semigroup to its long-run limit")
+ax.legend()
+plt.show()
+```
+
+The two solid curves trace the rescaled values starting from each state,
+and the dashed horizontal lines mark the limits predicted by
+{eq}`eq:long-run-limit`.
+
+Both curves settle onto their predicted limits, confirming that the
+long-run behaviour depends on the starting state only through $\phi$.
 
 ```{note}
-The *rate* of convergence is the **spectral gap** of $A$: the difference
-between $\rho$ (the dominant real eigenvalue) and the next-largest real
-part of the spectrum.
+The *rate* of convergence is governed by the **spectral gap** of $A$: the difference between $\rho$ and the next-largest real part of the spectrum.
 
 This is the operator generalisation of the gap between the leading and
-sub-leading eigenvalues that controls mixing of a stationary Markov chain.
-Exercise {ref}`lrr_ex3` works through a three-state example where the gap
-can be read off directly.
+sub-leading eigenvalues that controls mixing of a stationary Markov chain. 
+
+{ref}`lrr_ex3` works through a three-state example where the gap
+can be checked directly.
 ```
 
 ### Adding jumps
 
 State transitions in this chain are discontinuous, so it is natural to allow
-the multiplicative functional to jump at the transition times --- the
-analogue of the $\kappa$ function in the jump-diffusion parameterization.
+the multiplicative functional to jump at the transition times. 
 
-A natural case for a stochastic discount factor: it jumps *up* when the
-economy moves from recession into boom (good news, marginal utility falls)
-and *down* on the reverse transition.
+These
+jumps are the analogue of the $\kappa$ function in the jump-diffusion
+parameterization.
+
+A natural example arises with a stochastic discount factor that jumps
+*up* when the economy moves from recession into boom (good news, marginal
+utility falls) and *down* on the reverse transition.
 
 The matrix `κ_jump` below encodes this.
 
@@ -1365,10 +1456,10 @@ We now move to a continuous-state model.
 We will use a two-factor affine specification that captures the two main
 empirical features of asset returns:
 
-* **stochastic volatility** --- the dispersion of shocks is itself a state
-  variable, and
-* **predictable growth** --- there is a small, persistent state variable
-  shifting expected growth rates.
+* **stochastic volatility**, where the dispersion of shocks is itself a
+  state variable, and
+* **predictable growth**, where a small, persistent state variable shifts
+  expected growth rates.
 
 This is the kind of state process used in long-run risk models like
 {cite:t}`Bansal_Yaron_2004`.
@@ -1427,16 +1518,12 @@ and proportional to $\sqrt{X^f}$ in the $B^f$ direction.
 
 ### Why exponential-affine eigenfunctions work
 
-The key observation is a closure property: when the state is affine and
-the drift of $A$ is affine, applying the generator to an
-exponential-affine function $\phi(x^f,x^o) = \exp(c_f x^f + c_o x^o)$
-returns another exponential-affine function.
-
-The eigenvalue equation $\mathbb A\phi = \rho\phi$ then collapses to a
-small number of algebraic equations in $(c_f, c_o, \rho)$.
+When the state is affine and the drift of $A$ is affine, applying the
+generator to an exponential-affine function 
+$\phi(x^f,x^o) = \exp(c_f x^f + c_o x^o)$ returns another exponential-affine function.
 
-This is the *continuous-state analogue* of the matrix Perron-Frobenius
-problem: we replace eigenvectors with exponential-affine eigenfunctions.
+This closure property turns the eigenvalue equation 
+$\mathbb A\phi = \rho\phi$ into a small system of algebraic equations in $(c_f, c_o, \rho)$.
 
 ```{prf:definition} Exponential-Affine Eigenfunction
 :label: lrr-def-exponential-affine-eigenfunction
@@ -1498,11 +1585,12 @@ $$
 + c_o^2 \frac{\sigma_o^2}{2}.
 $$ (eq:affine-rho)
 
-**Picking the right root.** Equation {eq}`eq:cf-roots` gives two candidate
+Equation {eq}`eq:cf-roots` gives two candidate
 values of $c_f$, and we need to know which one is the principal
 eigenfunction.
 
-This is where the stochastic-stability condition does real work.
+This is where stochastic stability
+({prf:ref}`lrr-def-stochastic-stability`) does real work.
 
 Under the twisted measure, the drift of $X^f$ is
 
@@ -1521,7 +1609,7 @@ If this is positive, the twisted square-root process stays stationary; if
 it is negative, the twisted process is explosive and the eigenfunction is
 not the long-run-relevant one.
 
-So we **pick the root that keeps the twisted process mean-reverting** ---
+So we **pick the root that keeps the twisted process mean-reverting**,
 exactly the way stochastic stability selects the principal eigenfunction
 in the abstract theory.
 
@@ -1635,7 +1723,7 @@ A QuantEcon lecture that studies long-run dynamics under recursive
 preferences in a different setting is
 {doc}`survival_recursive_preferences`.
 
-The block below derives the SDF coefficients for the unit-elasticity
+This section derives the SDF coefficients for the unit-elasticity
 recursive specification. 
 
 You can skip on a first read and come back later
@@ -1793,7 +1881,7 @@ The long-run zero-coupon yield $-\rho_s$ represents the asymptotic decay
 rate in the SDF expectation $E[S_t]$.
 
 We can also check that the rejected root for $c_f$ would have produced a
-non-stationary twisted process --- a clear example of stochastic stability
+non-stationary twisted process, a clear example of stochastic stability
 selecting one of two algebraically valid eigenfunctions.
 
 ```{code-cell} ipython3
@@ -1838,7 +1926,7 @@ $$
 $$
 
 The first line is the Brownian integral (the "exponential martingale" piece);
-the second is the Itô correction making it a true martingale.
+the second is the Itô correction needed to make it a martingale.
 
 Under the twisted measure induced by $\hat M$, the drifts of the state
 variables shift to
@@ -1961,23 +2049,25 @@ plt.tight_layout()
 plt.show()
 ```
 
+Indeed the factorization holds up to numerical noise, and we can see how the three components evolve over time.
+
 ## Long-run risk prices
 
 We can now use the factorization to compute long-run analogues of the
 instantaneous risk prices that come out of standard continuous-time asset
 pricing.
 
-The economic question is sharp:
+We can ask:
 
 > If an investor takes on a small exposure to a shock today, how much extra
-> expected return do they need --- as compensation --- when we measure that
+> expected return do they need, as compensation, when we measure that
 > compensation as a long-horizon rate rather than as an instantaneous one?
 
-The two answers --- local and long-run --- need not agree.
+The two answers, local and long-run, need not agree.
 
-The reason: a shock that moves a persistent state variable has a small
-*immediate* effect on the cash flow but a large *cumulative* effect on
-future expected growth and discounting.
+A shock that moves a persistent state variable has a small *immediate*
+effect on the cash flow but a large *cumulative* effect on future expected
+growth and discounting, which is why the two answers come apart.
 
 So the long-run risk price is the local price *plus a persistence
 correction.*
@@ -1997,7 +2087,7 @@ $-\gamma^v_i \gamma^s_i$ per unit time, so a unit of exposure $\gamma^v_i$
 is priced at $-\gamma^s_i$.
 ```
 
-The local price is easy: read it directly off the SDF coefficients.
+For the local price we just read off the SDF coefficients.
 
 ```{prf:definition} Long-Run Risk Price
 :label: lrr-def-long-run-risk-price
@@ -2011,21 +2101,26 @@ the $GS$ semigroup and $\delta$ is the trend growth rate, held fixed.
 ```
 
 Computing the long-run price requires solving the principal eigenvalue
-problem --- it captures how a shock propagates through the persistent state
-component.
+problem, which captures how a shock propagates through the persistent
+state component.
 
 ### Two frontiers
 
 We will see *two* related ways to vary risk exposure, each leading to a
 slightly different long-run risk price:
 
-1. **Valuation-functional frontier.** Hold the SDF $S$ fixed and vary the
-   asset's Brownian exposures $(\gamma^v_f, \gamma^v_o)$. Use the local
+1. **Valuation-functional frontier:** 
+   - Hold the SDF $S$ fixed and vary the
+   asset's Brownian exposures $(\gamma^v_f, \gamma^v_o)$. 
+   
+   - Use the local
    pricing restriction to determine the drift $\beta^v$, then compute
    $\rho^v$ for the $V$-semigroup.
 
-2. **Cash-flow frontier.** Hold the SDF $S$ fixed and vary the cash-flow's
-   growth exposures $(\gamma^g_f, \gamma^g_o)$. Set $M = GS$ and compute
+2. **Cash-flow frontier:** 
+   - Hold the SDF $S$ fixed and vary the cash-flow's
+   growth exposures $(\gamma^g_f, \gamma^g_o)$. 
+   - Set $M = GS$ and compute
    the principal eigenvalue $\rho$ of the cash-flow valuation semigroup.
 
 These two frontiers coincide in simple log-normal examples, but they can
@@ -2106,7 +2201,7 @@ It arises because:
   on future SDF growth scales like $1/\xi_o$.
 
 As $\xi_o$ shrinks, persistence grows and the long-run price diverges from
-the local one --- which is the central economic content of long-run risk
+the local one, which is the central economic content of long-run risk
 models.
 
 The local price of $B^f$ exposure is state dependent (it scales with
@@ -2145,10 +2240,13 @@ ax.legend()
 plt.show()
 ```
 
+As we can see, the long-run price of $B^o$ exposure is sensitive to
+the mean-reversion speed $\xi_o$, but the local price is a constant.
+
 ### Changing valuation functionals
 
 Now we work out the long-run risk price formula by varying the asset's
-exposure --- the **valuation-functional frontier** introduced above.
+exposure, the **valuation-functional frontier** introduced above.
 
 We hold the SDF $S$ fixed and pick Brownian exposures
 $(\gamma_f^v,\gamma_o^v)$ for the asset return, parameterizing the
@@ -2229,8 +2327,9 @@ $$
     - \frac{\beta_o^s}{\xi_o}\sigma_o .
 $$ (eq:valuation-long-run-price-o)
 
-This matches the formula {eq}`eq:long-run-price-o` we previewed above ---
-the local price plus the persistence correction $-(\beta_o^s/\xi_o)\sigma_o$.
+This matches the formula {eq}`eq:long-run-price-o` we previewed above,
+namely the local price plus the persistence correction
+$-(\beta_o^s/\xi_o)\sigma_o$.
 
 Let's verify the formula numerically by finite-differencing the eigenvalue
 computation.
@@ -2316,12 +2415,12 @@ $$
 $$
 
 which keeps the growth-twisted square-root volatility process from hitting
-zero --- i.e., it preserves stochastic stability under the growth-twisted
+zero, so stochastic stability is preserved under the growth-twisted
 measure.
 
 ```{note}
 This Feller restriction is a concrete instance of a general point we
-flagged earlier: changing growth risk can destroy stability and invalidate
+flagged earlier: changing growth risk can violate stability and invalidate
 the long-run approximation, so the choice of $(\gamma_f^g, \gamma_o^g)$
 isn't free.
 ```
@@ -2403,7 +2502,7 @@ print(f"formula                 = {long_run_price_o:.6f}")
 
 The examples above make the eigenfunction calculation look mechanical.
 
-For finite-state chains and the affine model, it really is mechanical ---
+For finite-state chains and the affine model, it really is mechanical;
 Perron-Frobenius theory and closed-form algebra handle every requirement.
 
 But in a general state space, three things can go wrong, and each
@@ -2411,14 +2510,15 @@ corresponds to one of the assumptions we have been carrying along.
 
 This section walks through what they are and why they matter.
 
-### Issue 1: $\hat M$ might fail to be a true martingale
+### Issue 1: $\hat M$ might fail to be a martingale
 
 A positive eigenfunction $\phi$ gives us a candidate martingale $\hat M$
-from {eq}`eq:mhat`, but $\hat M$ is *automatically* only a nonnegative local
-martingale --- hence a supermartingale.
+from {eq}`eq:mhat`, but $\hat M$ is only a nonnegative
+local martingale, hence a supermartingale.
 
 A supermartingale is not enough to define a probability measure: we need
-$E\hat M_t = 1$, i.e. a *true* martingale.
+$E\hat M_t = 1$, i.e. a genuine martingale, the content of Assumption 6.1
+in {cite:t}`HansenScheinkman2009`.
 
 A standard way to verify this is a two-sided **Girsanov construction**:
 write the drift and jump distortion induced by $\hat M$, check that the
@@ -2427,17 +2527,16 @@ density (the inverse of $\hat M$) is locally integrable.
 
 ### Issue 2: the twisted process might fail to be stable
 
-Even with $\hat M$ a true martingale, the long-run limit
+Even with $\hat M$ a martingale, the long-run limit
 {eq}`eq:long-run-limit` requires that the twisted process actually settles
 into a steady state.
 
-This is where stochastic stability --- our trio of stationary distribution,
-irreducibility of the skeleton, and Harris recurrence --- does real work.
+This is where stochastic stability (our trio of stationary distribution,
+irreducibility of the skeleton, and Harris recurrence) does real work.
 
-The affine example illustrates this concretely: we *rejected* one of the
-two algebraically valid eigenfunctions because it implied an explosive
-twisted square-root process. The math admitted two roots; stochastic
-stability picked the right one.
+The affine example illustrates this concretely: there we *rejected* one
+of the two algebraically valid eigenfunctions because it implied an
+explosive twisted square-root process.
 
 ### Issue 3: a principal eigenfunction might not exist at all
 
@@ -2452,7 +2551,9 @@ $$
     \frac{\mathbb A V}{V} \leq a_0 .
 $$
 
-Roughly: $V$ doesn't grow too fast under the semigroup. With this in hand,
+Roughly: $V$ doesn't grow too fast under the semigroup. 
+
+With this in hand,
 for any $\alpha > a_0$ define the **resolvent operator**
 
 $$
@@ -2473,28 +2574,33 @@ The existence proof then proceeds in three steps:
 
 1. **Irreducibility for the resolvent.** There exists a reference measure
    $\nu$ such that $F_\alpha\mathbf 1_\Lambda(x) > 0$ for every $x$
-   whenever $\nu(\Lambda) > 0$ --- so the resolvent doesn't "miss" any
+   whenever $\nu(\Lambda) > 0$, so the resolvent doesn't "miss" any
    region of state space.
 
 2. **Nummelin minorization.** Irreducibility yields a lower bound
-   $F_\alpha \psi \geq s\int \psi\, d\nu$ for nonnegative $\psi$. This is a
+   $F_\alpha \psi \geq s\int \psi\, d\nu$ for nonnegative $\psi$. 
+   - This is a
    classical tool from general-state-space Markov-chain theory; the
    constant $s>0$ is the *minorization strength*.
 
 3. **Eigenfunction extraction.** The minorization, combined with additional
    boundedness or strengthened drift assumptions, identifies a critical
    spectral value for $F_\alpha$ and an associated positive
-   eigenfunction. Inverting the resolvent transform produces a positive
+   eigenfunction. 
+   
+   - Inverting the resolvent transform produces a positive
    eigenfunction for the original semigroup.
 
-### Summary of the assumption hierarchy
+These steps are all nontrivial and are beyond the scope of this lecture.
+
+The details are in Section 9 of {cite:t}`HansenScheinkman2009`.
 
 We can summarize the chain of conditions as:
 
 | Want | Need |
 |:---|:---|
 | A factorization {eq}`eq:hs-factorization` | A positive eigenfunction $\phi$ |
-| $\hat M$ to define a probability measure | $\hat M$ is a true martingale |
+| $\hat M$ to define a probability measure | $\hat M$ is a martingale (Assumption 6.1) |
 | The long-run limit {eq}`eq:long-run-limit` | Stochastic stability of the twisted process |
 | A unique principal eigenfunction | Stability selects among positive eigenfunctions |
 
@@ -2560,14 +2666,14 @@ $$
 Let the multiplicative functional have decay rate $r_1>0$ in state 1, decay
 rate $r_2=0$ in state 2, and no jumps.
 
-a. Write down the generator matrix $A$.
+1. Write down the generator matrix $A$.
 
-b. Find the principal eigenvalue $\rho$ in terms of $\lambda$, $\mu$, and
+2. Find the principal eigenvalue $\rho$ in terms of $\lambda$, $\mu$, and
 $r_1$.
 
-c. Verify numerically with $\lambda=0.4$, $\mu=0.6$, and $r_1=0.05$.
+3. Verify numerically with $\lambda=0.4$, $\mu=0.6$, and $r_1=0.05$.
 
-d. Show that $-r_1 < \rho < 0$.
+4. Show that $-r_1 < \rho < 0$.
 ```
 
 ```{solution-start} lrr_ex1
@@ -2576,7 +2682,7 @@ d. Show that $-r_1 < \rho < 0$.
 
 Here is one solution:
 
-*a.* The generator is
+*1.* The generator is
 
 $$
 A =
@@ -2586,7 +2692,7 @@ A =
 \end{pmatrix}.
 $$
 
-*b.* The characteristic equation is
+*2.* The characteristic equation is
 
 $$
     \rho^2 + (\lambda+\mu+r_1)\rho + \mu r_1 = 0.
@@ -2603,7 +2709,7 @@ $$
 }{2}.
 $$
 
-*c.* Numerical verification:
+*3.* Numerical verification:
 
 ```{code-cell} ipython3
 λ, μ, r1 = 0.4, 0.6, 0.05
@@ -2624,7 +2730,7 @@ print(f"numeric  ρ = {ρ_numeric:.8f}")
 print(f"difference   = {abs(ρ_formula-ρ_numeric):.2e}")
 ```
 
-*d.* Let
+*4.* Let
 
 $$
 q(x)=x^2+(\lambda+\mu+r_1)x+\mu r_1.
@@ -2727,14 +2833,14 @@ $$
 decay-rate vector $r = (0.06, 0.04, 0.01)$, and no jumps in the
 multiplicative functional. Let $\psi=(3,1,2)$.
 
-a. Compute the principal eigenpair $(\rho,\phi)$ and twisted stationary
+1. Compute the principal eigenpair $(\rho,\phi)$ and twisted stationary
 distribution $\hat\varsigma$, and report the theoretical limit
 
 $$
     \phi \sum_i \frac{\psi_i}{\phi_i}\hat\varsigma_i .
 $$
 
-b. Plot
+2. Plot
 
 $$
     \max_i
@@ -2747,7 +2853,7 @@ $$
 
 on a logarithmic scale.
 
-c. Compare the convergence rate to the spectral gap between the largest and
+3. Compare the convergence rate to the spectral gap between the largest and
 second-largest real parts of the eigenvalues of $A$.
 ```
 
@@ -2817,10 +2923,10 @@ Derive the local martingale restriction
 Let $M = \exp(A)$ for the additive functional $A$ in
 {eq}`eq:additive-functional`, with parameters $(\beta,\gamma,\kappa)$.
 
-a. Decompose $A_t = A_t^c + A_t^j$ into its continuous and pure-jump parts
+1. Decompose $A_t = A_t^c + A_t^j$ into its continuous and pure-jump parts
 and write down $dA_t^c$ and the jump magnitudes $\Delta A_t$.
 
-b. Apply Itô's formula for semimartingales to $f(a) = e^a$ to show that
+2. Apply Itô's formula for semimartingales to $f(a) = e^a$ to show that
 
 $$
     dM_t
@@ -2830,14 +2936,14 @@ $$
     + M_{t-}\big(\exp[\Delta A_t] - 1\big)\quad\text{at jumps}.
 $$
 
-c. Use $d\langle A^c, A^c\rangle_t = \gamma^\top\gamma\, dt$ and rewrite the
+3. Use $d\langle A^c, A^c\rangle_t = \gamma^\top\gamma\, dt$ and rewrite the
 jump term as an integral against the random counting measure $\zeta$.
 
-d. Split $\zeta$ into its compensator $\eta(dy \mid X_{t-})\, dt$ and the
+4. Split $\zeta$ into its compensator $\eta(dy \mid X_{t-})\, dt$ and the
 compensated martingale measure
 $\tilde\zeta = \zeta - \eta(dy\mid X_{t-})\, dt$.
 
-e. Collect drift (predictable) and martingale terms and conclude that $M$ is
+5. Collect drift (predictable) and martingale terms and conclude that $M$ is
 a local martingale iff the drift vanishes at every state, which gives
 {eq}`eq:local-martingale-restriction`.
 ```
@@ -2848,7 +2954,7 @@ a local martingale iff the drift vanishes at every state, which gives
 
 Here is one solution.
 
-*a.* From the parameterization {eq}`eq:additive-functional`,
+*1.* From the parameterization {eq}`eq:additive-functional`,
 
 $$
     dA_t^c = \beta(X_t)\, dt + \gamma(X_{t-})^\top\, dB_t,
@@ -2856,8 +2962,10 @@ $$
     \Delta A_t = \kappa(X_t, X_{t-}) \text{ at a jump time}.
 $$
 
-*b.* For $f(a) = e^a$ we have $f'(a) = f''(a) = e^a$, so $f'(A_{t-}) =
-f''(A_{t-}) = M_{t-}$. Itô's formula for a semimartingale gives
+*2.* For $f(a) = e^a$ we have $f'(a) = f''(a) = e^a$, so $f'(A_{t-}) =
+f''(A_{t-}) = M_{t-}$. 
+
+The [(generalized) Itô formula](https://almostsuremath.com/2010/01/25/the-generalized-ito-formula/) for a semimartingale gives
 
 $$
     dM_t
@@ -2870,7 +2978,7 @@ $$
 Since $A_t = A_{t-} + \Delta A_t$ at a jump, $f(A_t) - f(A_{t-}) =
 M_{t-}\big(\exp[\Delta A_t] - 1\big)$, which is the stated expression.
 
-*c.* Substituting $d\langle A^c, A^c\rangle_t = \gamma(X_{t-})^\top
+*3.* Substituting $d\langle A^c, A^c\rangle_t = \gamma(X_{t-})^\top
 \gamma(X_{t-})\, dt$ and rewriting the jump contribution as an integral
 against the random counting measure $\zeta$ of $(X, A)$ gives
 
@@ -2887,7 +2995,7 @@ $$
 \end{aligned}
 $$
 
-*d.* Writing $\zeta = \tilde\zeta + \eta(dy\mid X_{t-})\, dt$ separates the
+*4.* Writing $\zeta = \tilde\zeta + \eta(dy\mid X_{t-})\, dt$ separates the
 jump integral into a martingale and a predictable drift contribution:
 
 $$
@@ -2897,7 +3005,7 @@ $$
 + \int\big(\exp[\kappa(y,X_{t-})] - 1\big)\eta(dy\mid X_{t-})\, dt .
 $$
 
-*e.* Collecting drift and martingale terms,
+*5.* Collecting drift and martingale terms,
 
 $$
 \begin{aligned}
@@ -2946,10 +3054,11 @@ $$
     N_t = M_t\phi(X_t) - \phi(X_0) - \int_0^t M_s \chi(X_s)\, ds
 $$
 
-is a local martingale, so the task is to identify the predictable drift of
-$M_t\phi(X_t)$ and read off $\chi$.
+is a local martingale. The task therefore has two pieces: identify the
+predictable drift of $M_t\phi(X_t)$ to read off a candidate $\chi$, and
+verify that the residual $N_t$ really is a local martingale.
 
-a. Apply Itô's formula to $Y_t = \exp(A_t)\phi(X_t)$ between jumps and
+(1) Apply Itô's formula to $Y_t = \exp(A_t)\phi(X_t)$ between jumps and
 show that the continuous part of $dY_t$ has drift
 
 $$
@@ -2965,7 +3074,7 @@ $$
     \right] dt .
 $$
 
-b. Show that at a jump time $t$ with $X_{t-}=x$ and $X_t=y$,
+(2) Show that at a jump time $t$ with $X_{t-}=x$ and $X_t=y$,
 
 $$
     \Delta Y_t = M_{t-}\big[\exp[\kappa(y,x)]\phi(y) - \phi(x)\big] ,
@@ -2982,7 +3091,7 @@ $$
         \eta(dy \mid x)\, dt .
 $$
 
-c. Decompose
+(3) Decompose
 
 $$
     \exp[\kappa(y,x)]\phi(y) - \phi(x)
@@ -2992,8 +3101,24 @@ $$
     + \big[\exp[\kappa(y,x)] - 1\big]\phi(x),
 $$
 
-combine the result with part a., and read off $\mathbb A \phi$ to recover
+combine with part (1) to obtain the full predictable drift coefficient
+$\chi(x)$, and check that it matches the closed form
 {eq}`eq:extended-generator`.
+
+(4) Identify the continuous Itô
+integral from part (1) as a local martingale $N^c_t$ and the compensated
+jump sum from part (2) as a local martingale $N^j_t$. 
+
+Show that the
+semimartingale decomposition of $Y_t = M_t\phi(X_t)$ implies
+
+$$
+    M_t\phi(X_t) - \phi(X_0) - \int_0^t M_s\, \chi(X_s)\, ds
+    = N^c_t + N^j_t ,
+$$
+
+which is a local martingale. Conclude via
+{prf:ref}`lrr-def-extended-generator` that $\mathbb A \phi = \chi$.
 ```
 
 ```{solution-start} lrr_ex4
@@ -3002,7 +3127,7 @@ combine the result with part a., and read off $\mathbb A \phi$ to recover
 
 Here is one solution:
 
-*a.* Set $g(a, x) = e^a \phi(x)$, so that $Y_t = g(A_t, X_t)$.
+*(1)* Set $g(a, x) = e^a \phi(x)$, so that $Y_t = g(A_t, X_t)$.
 
 Between jumps, the continuous parts of $A$ and $X$ are
 
@@ -3036,7 +3161,8 @@ $$
     \partial_{xx} g = e^a \frac{\partial^2 \phi}{\partial x \partial x^\top} .
 $$
 
-Itô's formula yields a continuous martingale part plus the drift
+Itô's formula yields a continuous local martingale $N^c_t$ (the $dB_t$
+part) plus the drift
 
 $$
 \begin{aligned}
@@ -3052,7 +3178,7 @@ $$
 
 Grouping the gradient terms gives the expression in the question.
 
-*b.* At a jump time $t$,
+*(2)* At a jump time $t$,
 $\Delta A_t = \kappa(X_t, X_{t-}) = \kappa(y, x)$, so
 $M_t = M_{t-}\exp[\kappa(y,x)]$ and
 
@@ -3062,10 +3188,11 @@ $$
     = M_{t-}\big[\exp[\kappa(y,x)]\phi(y) - \phi(x)\big] .
 $$
 
-Compensating these jumps against the predictable intensity $\eta(dy \mid x)$
-gives the stated predictable drift.
+The sum of jumps minus its predictable compensator is a local martingale
+$N^j_t$, and the compensator itself contributes the predictable drift
+quoted in the question.
 
-*c.* Adding the jump drift from b. to the continuous drift from a., the
+*(3)* Adding the jump drift from (2) to the continuous drift from (1), the
 predictable drift of $Y_t = M_t \phi(X_t)$ at state $x$ is $M_t\, \chi(x)\, dt$
 with
 
@@ -3086,7 +3213,33 @@ $$
 $$
 
 Collecting the terms multiplying $\phi(x)$ recovers
-{eq}`eq:extended-generator`, so $\chi = \mathbb A \phi$.
+{eq}`eq:extended-generator`.
+
+*(4)* Putting the two martingale
+pieces together and integrating the drift, steps (1) and (2) give the
+semimartingale decomposition
+
+$$
+    M_t \phi(X_t)
+    =
+    \phi(X_0)
+    + \int_0^t M_s\, \chi(X_s)\, ds
+    + N^c_t + N^j_t .
+$$
+
+Rearranging,
+
+$$
+    N_t
+    :=
+    M_t \phi(X_t) - \phi(X_0) - \int_0^t M_s\, \chi(X_s)\, ds
+    = N^c_t + N^j_t ,
+$$
+
+which is a local martingale because it is a sum of local martingales.
+
+This is exactly the property required by
+{prf:ref}`lrr-def-extended-generator`, so $\chi = \mathbb A \phi$.
 
 ```{solution-end}
 ```

From 7c968648bdb9e1f5620078d58a551dc303ff2a65 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Fri, 29 May 2026 15:29:00 +1000
Subject: [PATCH 11/25] updates

---
 lectures/long_run_risk_operator.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index cb0c34546..f800d7f1f 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -20,7 +20,7 @@ kernelspec:
 </div>
 ```
 
-# Long-term risk: an operator approach
+# Long-term Risk: An Operator Approach
 
 ```{contents} Contents
 :depth: 2

From 9c628b457345d998d3d4f501aa2f379b49c1c93e Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Fri, 29 May 2026 16:45:37 +1000
Subject: [PATCH 12/25] updates

---
 lectures/_static/quant-econ.bib    |  12 ++
 lectures/long_run_risk_operator.md | 265 ++++++++++++++++++++++-------
 2 files changed, 212 insertions(+), 65 deletions(-)

diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index cadf56cd4..71ceac6b9 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -4053,3 +4053,15 @@ @article{Breeden1979
   pages     = {265--296},
   doi       = {10.1016/0304-405X(79)90016-3}
 }
+
+@book{Nummelin_1984,
+  author    = {Nummelin, Esa},
+  title     = {General Irreducible {Markov} Chains and Non-Negative
+               Operators},
+  series    = {Cambridge Tracts in Mathematics},
+  number    = {83},
+  publisher = {Cambridge University Press},
+  address   = {Cambridge},
+  year      = {1984},
+  doi       = {10.1017/CBO9780511526237}
+}
diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index f800d7f1f..917d8efd5 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -160,12 +160,18 @@ $$
 and the pure-jump component as
 
 $$
-    dX_t^j = \int y\, \zeta(dy, dt),
+    dX_t^j = \int \bigl(y - X_{t-}\bigr)\, \zeta(dy, dt),
 $$
 
-where $\zeta$ is the random counting measure of jumps and
-$\eta(dy \mid X_{t-})\, dt$ is its compensator, the rate at which $X$
-jumps from $X_{t-}$ to a region $dy$.
+where $\zeta$ is the random counting measure of jumps indexed by the
+*post-jump state* $y$, so that $\zeta(B, [0,t])$ counts the number of
+jumps in $[0,t]$ landing in the Borel set $B$.
+
+Its compensator is $\eta(dy \mid X_{t-})\, dt$, the rate at which $X$
+jumps from $X_{t-}$ into a region $dy$.
+
+We will use this "$y$ = post-jump state" convention throughout, matching
+the form $\phi(y) - \phi(x)$ that appears in the generator below.
 
 We also impose two simplifying assumptions:
 
@@ -683,19 +689,30 @@ That is exactly what we will see when we write down its closed form in
 {eq}`eq:extended-generator` below.
 ```
 
-Why is this the right object? 
+Why is this the right object?
+
+If $\mathbb A\phi = \rho\phi$, then the candidate
+
+$$
+    \hat M_t := \exp(-\rho t)\, M_t\, \frac{\phi(X_t)}{\phi(X_0)}
+$$
+
+is automatically a *local* martingale.
 
-Because if $\mathbb A\phi = \rho \phi$, then
-iterating gives
+When $\hat M$ is in fact a martingale (the content of Assumption 6.1 in
+{cite:t}`HansenScheinkman2009`), taking expectations gives the
+continuous-time analogue of $K^n\phi = \lambda^n\phi$:
 
 $$
-    \mathbb M_t \phi = \exp(\rho t)\, \phi ,
+    \mathbb M_t \phi = \exp(\rho t)\, \phi .
 $$
 
-the continuous-time analogue of $K^n \phi = \lambda^n \phi$.
+Without that upgrade we get only the supermartingale inequality
+$\mathbb M_t \phi \le \exp(\rho t)\, \phi$, which we will revisit below.
 
 So the long-run behaviour of $\mathbb M_t$ is encoded in an eigenvalue
-problem for the local operator $\mathbb A$.
+problem for the local operator $\mathbb A$, together with the
+martingale property of $\hat M$.
 
 ### Extended generator
 
@@ -834,9 +851,17 @@ The four terms have transparent interpretations:
    compensated jumps.
 
 ```{note}
-When $M=S$ is a stochastic discount factor, the term multiplying $\phi(x)$
-in the fourth line encodes local prices of Brownian and jump risk, the
-short-end of the term structure we will revisit later.
+When $M=S$ is a stochastic discount factor, the local risk prices are
+spread across the four pieces of the generator rather than concentrated in
+the level term.
+
+* Brownian factor prices enter through the drift modification
+  $\xi \to \xi + \Gamma\gamma$ in the first line.
+* Jump-risk prices enter through the tilted jump measure
+  $\eta \to \exp[\kappa]\,\eta$ in the third line.
+* The level term in the fourth line contains the instantaneous risk-free
+  rate component (the *short end* of the term structure) plus the
+  jump-compensator contribution $\int(\exp[\kappa]-1)\,\eta$.
 
 We ask readers to verify the derivation of {eq}`eq:extended-generator` in
 {ref}`lrr_ex4`.
@@ -1092,23 +1117,47 @@ Read this as follows:
   of the payoff $\psi$, weighted by $1/\phi$ and averaged against the
   twisted stationary distribution.
 
-The mode of convergence depends on how nice $\psi$ is:
+The two precise cases of Proposition 7.1 of
+{cite:t}`HansenScheinkman2009` are:
 
-* **Almost-everywhere along a sampling grid.** For any fixed $\Delta>0$,
-  convergence along $t=\Delta j$ holds for almost every initial state when
-  $\int |\psi|/\phi\, d\hat\varsigma < \infty$.
-* **Pointwise for all continuous $t$.** Stronger but needs $\psi/\phi$
-  bounded.
+* **Sampled grid, $\hat\varsigma$-almost every starting state.** Fix any
+  $\Delta>0$ and assume $\int |\psi|/\phi\, d\hat\varsigma < \infty$;
+  then the limit holds along $t = \Delta j$ for $\hat\varsigma$-almost
+  every $x \in \mathcal D_0$.
+* **Continuous $t$, every starting state.** If $\psi/\phi$ is bounded
+  then the limit holds for every $x \in \mathcal D_0$ as $t \to \infty$
+  through real values, with no continuity assumption on $\psi$.
 
 ```{note}
-As we noted before, there can be more than one positive eigenfunction yielding a true
-martingale $\hat M$.
+There can be more than one positive eigenfunction of $\mathbb A$ for which
+the associated $\hat M$ is a martingale, possibly with different
+eigenvalues.
+
+Stochastic stability selects a particular one.
+
+By Proposition 7.2 of {cite:t}`HansenScheinkman2009`, the stable
+eigenfunction's eigenvalue $\rho$ is the **smallest** among all
+eigenvalues of $\mathbb A$ that admit a strictly positive eigenfunction
+satisfying the maintained conditions.
+
+Any other positive eigenfunction sharing this $\rho$ is proportional to
+$\phi$ $\hat\varsigma$-almost surely.
 
-Stochastic stability selects the one that matters for long-run behaviour, and rules out any other positive eigenfunction with the same eigenvalue that fails to be
-proportional to $\phi$ $\hat\varsigma$-a.s.
+The finite-state section below states the same selection in the more
+familiar Perron-Frobenius language, calling $\rho$ the eigenvalue with the
+*largest* real part among all eigenvalues of $A$.
 
-This is the analogue of "the Perron-Frobenius eigenvector is unique up to
-scaling" in finite dimensions.
+These two descriptions identify the same eigenvalue because in the
+irreducible finite-state case only one eigenvalue of $A$ admits a
+strictly positive eigenvector.
+
+In the affine example below the two sets pull apart, since the quadratic
+{eq}`eq:cf-roots` has two roots $c_f$ that both give strictly positive
+exponential-affine eigenfunctions with distinct eigenvalues.
+
+Stochastic
+stability picks the smaller $\rho$ by rejecting the root that produces an
+explosive twisted process.
 ```
 
 ## A finite-state Markov chain
@@ -1265,11 +1314,11 @@ def stationary_distribution(Q):
 
 Consider a boom-recession economy.
 
-State 1 is a *boom* (low short rate $r_1=0.05$, switching to recession at
-rate $\lambda_1 = 0.30$).
+State 1 is a *boom* (higher short rate $r_1=0.05$, switching to recession
+at rate $\lambda_1 = 0.30$).
 
-State 2 is a *recession* (lower short rate $r_2=0.02$, switching to boom at
-rate $\lambda_2 = 0.50$).
+State 2 is a *recession* (lower short rate $r_2=0.02$, switching to boom
+at rate $\lambda_2 = 0.50$).
 
 For now we set the jump multipliers to zero, so the SDF only changes
 continuously through the in-state decay rates.
@@ -1385,13 +1434,15 @@ Both curves settle onto their predicted limits, confirming that the
 long-run behaviour depends on the starting state only through $\phi$.
 
 ```{note}
-The *rate* of convergence is the **spectral gap** of $A$.
+The asymptotic exponential rate of convergence is governed by the gap
+between the *real part* of the leading eigenvalue $\rho$ and the largest
+real part among the remaining eigenvalues of $A$.
 
-This is the operator generalisation of the gap between the leading and
-sub-leading eigenvalues that controls mixing of a stationary Markov chain. 
+For an irreducible Metzler matrix the leading eigenvalue is real and its
+real part is strictly larger than the others, so this gap is well defined.
 
-{ref}`lrr_ex3` works through a three-state example where the gap
-can be checked directly.
+{ref}`lrr_ex3` works through a three-state example where the gap can be
+checked directly.
 ```
 
 ### Adding jumps
@@ -1404,14 +1455,19 @@ jumps are the analogue of the $\kappa$ function in the jump-diffusion
 parameterization.
 
 A natural example arises with a stochastic discount factor that jumps
-*up* when the economy moves from recession into boom (good news, marginal
-utility falls) and *down* on the reverse transition.
+*down* when the economy moves from recession into boom and *up* on the reverse transition.
+
+The matrix `κ_jump` below encodes this. 
 
-The matrix `κ_jump` below encodes this.
+We use the convention
+`κ[j, i]` = log jump multiplier of $M$ for the transition $i \to j$, with
+state index 0 = boom and state index 1 = recession.
 
 ```{code-cell} ipython3
-κ_jump = np.array([[0.0,  0.30],
-                   [-0.20, 0.0]])
+# recession (1) -> boom (0): SDF jumps down on good news (exp(-0.20))
+# boom (0) -> recession (1): SDF jumps up on bad news (exp(+0.30))
+κ_jump = np.array([[ 0.0, -0.20],
+                   [ 0.30,  0.0]])
 
 A_jump = build_generator(U, r, κ_jump)
 ρ_jump, φ_jump = principal_eigenpair(A_jump)
@@ -1432,7 +1488,7 @@ recession-to-boom multiplier varies.
 
 for n, k in enumerate(κ_grid):
     κ_temp = np.array([[0.0, k],
-                           [-0.2, 0.0]])
+                       [0.30, 0.0]])
     A_temp = build_generator(U, r, κ_temp)
     ρ_grid[n], _ = principal_eigenpair(A_temp)
 
@@ -1446,8 +1502,12 @@ ax.set_title("Jumps and the Long-Run Growth Rate")
 plt.show()
 ```
 
-Larger upward jumps on recession-to-boom transitions raise $\rho$ because the
-functional jumps up on those transitions.
+The principal eigenvalue is monotonically increasing in the recession-to-boom
+log multiplier: as that multiplier rises, $M$ jumps less downward (or more
+upward) on good news, which mechanically pushes $\rho$ up.
+
+The economically sensible SDF region is to the left of zero, where the
+log multiplier is negative.
 
 ## The affine diffusion example
 
@@ -1839,9 +1899,10 @@ same operator calculation applies once the SDF parameters are replaced by
 
 Let's set up parameters and solve for the principal eigenpair.
 
-We use plausible monthly-frequency parameters: a mean-reverting volatility
-factor $X^f$ with mean $0.04$, a slower-moving predictable-growth factor
-$X^o$ with mean $0.02$, risk aversion $a=4$, time discount rate $b=0.03$.
+We use parameters in the standard long-run-risk neighbourhood: a
+mean-reverting volatility factor $X^f$ with mean $0.04$, a slower-moving
+predictable-growth factor $X^o$ with mean $0.02$, risk aversion $a=4$, and
+a time discount rate $b=0.03$.
 
 ```{code-cell} ipython3
 params_state = {
@@ -1948,7 +2009,16 @@ $$
 The drift distortions are exactly the Girsanov shifts induced by the
 Brownian loadings of $\hat M$.
 
-Let's now simulate the state and verify the factorization holds.
+Let's now simulate the state and check the factorization numerically.
+
+The first check is the *algebraic identity*: once we define $\hat M$
+through {eq}`eq:mhat`, the equation
+$M_t = \exp(\rho t)\,\hat M_t\, \phi(X_0)/\phi(X_t)$ is automatic for any
+choice of $(\rho,\phi)$, and the error below is just floating-point round-off.
+
+The second, substantive, check is whether the eigenpair $(\rho,\phi)$ we
+solved for really makes $\hat M$ a martingale, which we approximate by
+computing $E[\hat M_t]$ across many simulated paths.
 
 ```{code-cell} ipython3
 def brownian_increments(n, dt, seed=1234):
@@ -2017,7 +2087,49 @@ M_hat = np.exp(-ρ_s * t) * M * φ_t / φ_0
 transient = φ_0 / φ_t
 
 identity_error = np.max(np.abs(M - np.exp(ρ_s * t) * M_hat * transient))
-print(f"maximum factorization error = {identity_error:.2e}")
+print(f"algebraic identity error = {identity_error:.2e}")
+```
+
+The error above is at the level of machine precision, as expected.
+
+Next we estimate $E[\hat M_t \mid X_0 = \bar x]$ over a Monte Carlo sample
+of paths.
+
+If $\hat M$ is a martingale, the population mean is exactly $1$ at every
+$t$, and the sample mean should lie within a few standard errors of $1$.
+
+```{code-cell} ipython3
+def simulate_M_hat(params, ρ, cf, co, n_paths=2000, T=20.0, dt=0.01, seed=2024):
+    """Simulate paths of hat M_t = exp(-ρ t) M_t φ(X_t)/φ(X_0); return (t, paths)."""
+    rng = np.random.default_rng(seed)  # used only to draw one seed per path
+    n = int(T / dt)  # number of time steps on [0, T]
+    t = np.linspace(0, T, n + 1)  # returned grid; presumably matches each t_k (TODO confirm)
+    M_hat_paths = np.empty((n_paths, n + 1))  # one row per path; assumes n + 1 points per path
+
+    for k in range(n_paths):
+        seed_k = rng.integers(1, 10**9)  # fresh seed for path k
+        t_k, Xf_k, Xo_k, dBf_k, dBo_k = simulate_states(
+            params, T=T, dt=dt, seed=int(seed_k)
+        )
+        A_k = additive_log_M(params, t_k, Xf_k, Xo_k, dBf_k, dBo_k)  # presumably log M_t; verify
+        φ_t_k = np.exp(cf * Xf_k + co * Xo_k)  # φ(X_t) along the path
+        φ_0_k = np.exp(cf * Xf_k[0] + co * Xo_k[0])  # φ(X_0)
+        M_hat_paths[k] = np.exp(-ρ * t_k) * np.exp(A_k) * φ_t_k / φ_0_k
+
+    return t, M_hat_paths
+
+
+t_mc, M_hat_paths = simulate_M_hat(params_sdf, ρ_s, cf_s, co_s)
+M_hat_mean = M_hat_paths.mean(axis=0)
+M_hat_se = M_hat_paths.std(axis=0, ddof=1) / np.sqrt(M_hat_paths.shape[0])
+
+print("   t      mean       se      (mean - 1) / se")
+for t_check in [1.0, 5.0, 10.0, 20.0]:
+    idx = np.argmin(np.abs(t_mc - t_check))
+    mean = M_hat_mean[idx]
+    se = M_hat_se[idx]
+    z = (mean - 1.0) / se
+    print(f"{t_mc[idx]:5.2f}   {mean:7.4f}  {se:7.4f}   {z:+6.2f}")
 ```
 
 ```{code-cell} ipython3
@@ -2049,7 +2161,7 @@ plt.tight_layout()
 plt.show()
 ```
 
-Indeed the factorization holds up to numerical noise, and we can see how the three components evolve over time.
+We can see how the three components evolve over time.
 
 ## Long-run risk prices
 
@@ -2405,18 +2517,27 @@ A_t^g
 \end{aligned}
 $$ (eq:growth-functional)
 
-The last line makes $\exp(A_t^g-\delta t) = \hat G_t$ a martingale, with
-$\delta$ the constant trend growth rate.
+The last line is the Itô compensator that makes
+$\exp(A_t^g-\delta t) = \hat G_t$ a *local* martingale, with $\delta$ the
+constant trend growth rate.
+
+Stochastic stability of the growth-twisted process needs three conditions.
 
-For the cash-flow exposure to $B^f$ we also need the Feller-type restriction
+The **Feller-type nonattainment** inequality
 
 $$
-    2(\xi_f+\sigma_f\gamma_f^g)\bar x_f \geq \sigma_f^2 ,
+    2(\xi_f+\sigma_f\gamma_f^g)\bar x_f \geq \sigma_f^2
 $$
 
-which keeps the growth-twisted square-root volatility process from hitting
-zero, so stochastic stability is preserved under the growth-twisted
-measure.
+keeps the twisted $X^f$ from hitting zero.
+
+*Mean reversion* of the twisted $X^f$ is picked by the same root-selection
+argument we used for the SDF in {eq}`eq:cf-roots`.
+
+$\hat G$ itself must be a martingale, the Assumption-6.1 analogue for the
+growth twist.
+
+The Feller inequality is necessary but not sufficient on its own.
 
 ```{note}
 This Feller restriction is a concrete instance of a general point we
@@ -2471,7 +2592,7 @@ required_returns = np.array([
     required_return_for_growth_exposure(g) for g in γ_g_o_grid
 ])
 
-local_line = (-params_sdf["β_bar"]
+local_line = (required_return_for_growth_exposure(0.0)
               + local_price_o * γ_g_o_grid)
 
 fig, ax = plt.subplots()
@@ -2486,7 +2607,14 @@ ax.legend()
 plt.show()
 ```
 
-The slope of the long-run line is the risk price in {eq}`eq:long-run-price-o`.
+The slope of the long-run line is the risk price in
+{eq}`eq:long-run-price-o`, and the dashed line shares the same value at
+$\gamma_o^g = 0$ but with slope equal to the local Brownian risk price
+$-\gamma_o^s$.
+
+The gap between the two slopes is the *persistence correction*; the
+dashed line is a slope comparator, not the actual Breeden local
+expected-return frontier (which is state dependent in $X^f$ and $X^o$).
 
 ```{code-cell} ipython3
 finite_difference = (
@@ -2577,11 +2705,16 @@ The existence proof then proceeds in three steps:
    whenever $\nu(\Lambda) > 0$, so the resolvent doesn't "miss" any
    region of state space.
 
-2. **Nummelin minorization.** Irreducibility yields a lower bound
-   $F_\alpha \psi \geq s\int \psi\, d\nu$ for nonnegative $\psi$. 
-   - This is a
-   classical tool from general-state-space Markov-chain theory; the
-   constant $s>0$ is the *minorization strength*.
+2. **Nummelin minorization.** Irreducibility yields a *bounded nonnegative
+   function* $s$ on the state space, with $\int s\, d\nu > 0$, such that
+   for every nonnegative $\psi$,
+
+   $$
+       F_\alpha \psi(x)\, \geq\, s(x) \int \psi\, d\nu .
+   $$
+
+   The function $s$ (often called the *minorization function*) measures
+   how strongly the resolvent dominates a fixed reference measure $\nu$.
 
 3. **Eigenfunction extraction.** The minorization, combined with additional
    boundedness or strengthened drift assumptions, identifies a critical
@@ -2591,7 +2724,7 @@ The existence proof then proceeds in three steps:
    - Inverting the resolvent transform produces a positive
    eigenfunction for the original semigroup.
 
-These steps are all nontrivial and is out of the scope of this lecture. 
+These steps are all nontrivial and are out of scope for this lecture. 
 
 The details are in Section 9 of {cite:t}`HansenScheinkman2009`.
 
@@ -2625,8 +2758,10 @@ The main steps are:
 2. Build the semigroup
    $\mathbb M_t\psi(x)=E[M_t\psi(X_t)\mid X_0=x]$.
 
-3. Solve the local pricing restriction when $M$ is a valuation or
-   cash-flow valuation object.
+3. When $M = VS$ is the product of a valuation functional and an SDF,
+   impose the local pricing restriction that $VS$ is a martingale; for
+   cash-flow valuation semigroups $\mathbb Q_t$ built from $M = GS$, the
+   pricing restriction is on $S$ alone, and $G$ enters only as a growth twist.
 
 4. Solve the principal eigenvalue problem
    $\mathbb A\phi=\rho\phi$.

From 6e42713d5fa79ea96aa68872643ed5c9c4a02f73 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Sat, 30 May 2026 23:27:30 +1000
Subject: [PATCH 13/25] updates

---
 lectures/ls_learning.md          | 460 ++++++++++++++++---------------
 lectures/rational_learning_re.md | 168 ++++-------
 2 files changed, 306 insertions(+), 322 deletions(-)

diff --git a/lectures/ls_learning.md b/lectures/ls_learning.md
index a6ba0e438..c00813af8 100644
--- a/lectures/ls_learning.md
+++ b/lectures/ls_learning.md
@@ -32,15 +32,15 @@ This lecture is a companion to {doc}`rational_learning_re`, which presents the
 Bray–Kreps perspective on rational learning. 
 
 The present lecture examines the
-closely related but distinct question of whether **least squares** learning
+closely related but distinct question of whether *least squares* learning
 converges to a rational expectations equilibrium in self-referential models.
 
 
 This lecture presents the framework of {cite:t}`MarcetSargent1989jet` for studying
 **least squares learning** in a class of **self-referential** linear stochastic models.
 
-A self-referential model is one in which the **actual** law of motion for the
-economy depends on the **perceived** law of motion held by the agents within
+A self-referential model is one in which the *actual* law of motion for the
+economy depends on the *perceived* law of motion held by the agents within
 it. 
 
 In a rational expectations equilibrium (REE) the two coincide: the
@@ -51,15 +51,15 @@ from equilibrium and update their beliefs by running least squares regressions,
 will they converge to the REE?
 
 {cite:t}`MarcetSargent1989jet` answer this question by exploiting a powerful
-technique from systems-control engineering: the **differential equation
-approach** of {cite:t}`Ljung1977`.
+technique from systems-control engineering: the differential equation
+approach of {cite:t}`Ljung1977`.
 
 The key insight is that the stochastic
 difference equation describing how beliefs evolve can be approximated, in the
-limit, by a deterministic **ordinary differential equation** (ODE).
+limit, by a deterministic ordinary differential equation (ODE).
 
 Almost-sure
-convergence of least squares to the REE is then equivalent to **local stability**
+convergence of least squares to the REE is then equivalent to *local stability*
 of the REE as a fixed point of that ODE.
 
 The framework unifies and extends earlier work by {cite:t}`Bray1982` and
@@ -90,49 +90,25 @@ to solve the associated ODE.
 def simulate_rls_scalar(T_map, σ_u, β0, T_periods=500, N_paths=100,
                         a_seq=None, seed=0):
     """
-    Simulate recursive least squares in a scalar self-referential model.
-
-    The perceived law of motion is:  z1_t = β_t * z2_{t-1} + u_t
-    The actual law of motion is:     z1_t = T(β_t) * z2_{t-1} + V * u_t
-
-    For the scalar examples here z2_t = 1 (constant), so agents learn about
-    the mean of a process that depends on their own expectation.
-
-    Parameters
-    ----------
-    T_map    : callable, the mapping T: β -> T(β)
-    σ_u  : float, std of innovations
-    β0    : float, initial belief
-    T_periods: int, simulation length
-    N_paths  : int, number of Monte Carlo paths
-    a_seq    : None or array of length T_periods (forgetting factors)
-    seed     : int, random seed
-
-    Returns
-    -------
-    β_paths : ndarray, shape (N_paths, T_periods)
+    Simulate recursive least squares for the scalar model
+    z1_t = T(β_t) + u_t with constant regressor z2_t = 1.
     """
     rng = np.random.default_rng(seed)
     if a_seq is None:
-        a_seq = np.ones(T_periods)          # standard OLS
+        a_seq = np.ones(T_periods)
 
     β_paths = np.empty((N_paths, T_periods))
 
     for i in range(N_paths):
         β = β0
-        R = 1.0          # scalar moment estimate
-        prec = 1.0 / R   # use precision for numerical stability
+        R = 1.0
 
         for t in range(T_periods):
             α_t = a_seq[t]
-            # z2 = 1 (constant regressor), so z2*z2' = 1
             z2 = 1.0
             u_t = rng.normal(0, σ_u)
-
-            # Actual z1 given current β
             z1 = T_map(β) * z2 + u_t
 
-            # RLS update (lagged: use previous β to form z1, then update)
             R = R + (α_t / (t + 1)) * (z2**2 - R / α_t)
             R = max(R, 1e-8)
             β = β + (α_t / (t + 1)) / R * z2 * (z1 - β * z2)
@@ -143,7 +119,7 @@ def simulate_rls_scalar(T_map, σ_u, β0, T_periods=500, N_paths=100,
 
 
 def solve_ode(f_ode, β0, t_span=(0, 80), n_points=1000):
-    """Solve scalar ODE d(β)/dt = f_ode(β) from β0."""
+    """Solve the scalar ODE dβ/dt = f_ode(β) from β0."""
     sol = solve_ivp(lambda t, y: [f_ode(y[0])], t_span, [β0],
                     t_eval=np.linspace(*t_span, n_points), method='RK45',
                     max_step=0.1)
@@ -170,7 +146,7 @@ $\eta_t$ is orthogonal to all past $z_2$'s.
 Because agents optimise (or behave) on the basis of this belief, their actions
 feed back into the economy.
 
-The **actual** law of motion for the full state
+The actual law of motion for the full state
 vector $z_t = (z_{1t}, z_{1t}^c)^\top$ is
 
 $$
@@ -181,8 +157,8 @@ $$ (eq:actual_lom)
 
 where $u_t$ is i.i.d. white noise with covariance $\Sigma$.
 
-The mapping $T$ is the key object: it maps the **perceived** coefficient $\beta$
-to the coefficient that **actually** governs $z_{1t}$ in equilibrium.
+The mapping $T$ is the key object: it maps the *perceived* coefficient $\beta$
+to the coefficient that *actually* governs $z_{1t}$ in equilibrium.
 
 A
 **rational expectations equilibrium** is a fixed point $\beta_f = T(\beta_f)$.
@@ -214,10 +190,10 @@ recursively.
 
 ### Lagged and contemporaneous data
 
-The recursion above is written with **lagged information**, so the estimate
+The recursion above is written with *lagged information*, so the estimate
 $\beta_t$ uses observations available through date $t-1$.
 
-Section 3 of {cite:t}`MarcetSargent1989jet` also treats a **contemporaneous-data**
+{cite:t}`MarcetSargent1989jet` also treat a *contemporaneous-data*
 version in which agents update using $z_{1t}$ and $z_{2,t-1}$ at date $t$.
 
 That timing creates simultaneous determination, because $z_t$ depends on the
@@ -226,8 +202,7 @@ same estimate $\beta_t$ that is being updated from $z_t$.
 The extra requirement is that the date-$t$ system have a unique solution
 $(\beta_t, R_t, z_t)$ for each history.
 
-Under that uniqueness condition, Proposition 4 of {cite:t}`MarcetSargent1989jet`
-shows that the same full ODE {eq}`eq:full_ode` and small ODE {eq}`eq:small_ode`
+Under that uniqueness condition, the same full ODE {eq}`eq:full_ode` and small ODE {eq}`eq:small_ode`
 govern convergence.
 
 Thus the stability criterion below is not an artifact of the one-period lag in
@@ -240,7 +215,7 @@ during the learning transition the data-generating process is non-stationary —
 beliefs shift the equilibrium, which shifts the data.
 
 The algorithm is
-"irrational" in the sense that it acts as if the environment were stationary,
+*"irrational"* in the sense that it acts as if the environment were stationary,
 when it is not.
 ```
 
@@ -252,54 +227,120 @@ when it is not.
 characterise the almost-sure limiting behaviour of the stochastic system
 {eq}`eq:rls_beta`–{eq}`eq:rls_R`.
 
-The central result is that the **only possible limit points** of $\beta_t$ are
-fixed points of the ODE
+The central object is the *small ODE*
 
 $$
-\frac{d\beta}{dt} = T(\beta) - \beta .
+\frac{d\beta}{dt} = T(\beta) - \beta ,
 $$ (eq:small_ode)
 
-This is the **small ODE** (equation (6) in {cite:t}`MarcetSargent1989jet`).
-
-Its
-fixed points are exactly the rational expectations equilibria.
+whose fixed points are exactly the rational expectations equilibria.
 
-The full ODE system associated with the joint process $(\beta_t, R_t)$ is
+The *full ODE* associated with the joint process $(\beta_t, R_t)$ is
 
 $$
 \frac{d}{dt}\begin{bmatrix} \beta \\ R \end{bmatrix}
 = \begin{bmatrix} R^{-1} M_{z_2}(\beta)\,[T(\beta) - \beta]^\top \\ M_{z_2}(\beta) - R \end{bmatrix} ,
 $$ (eq:full_ode)
 
-where $M_{z_2}(\beta) = E z_{2t}z_{2t}^\top$ evaluated at the stationary distribution
-induced by $\beta$.
+where $M_{z_2}(\beta) = E z_{2t}z_{2t}^\top$ evaluated at the stationary
+distribution induced by $\beta$.
+
+The fixed point of {eq}`eq:full_ode` is $(\beta_f, R_f)$ where
+$R_f = M_{z_2}(\beta_f)$.
+
+### Regularity and boundedness assumptions
+
+The convergence theorems below presuppose the following conditions on the
+operator $T$, the shocks $u_t$, the gain sequence $\{\alpha_t\}$, and the
+domain of the algorithm.
+
+Let $D_s \subset \mathbb{R}^{n_1 \times n_2}$ be the set on which $T(\beta)$,
+$A(\beta)$, $B(\beta)$, $V(\beta)$ are well defined and the eigenvalues of
+$\bigl[\begin{smallmatrix}0 & T(\beta)\\ A(\beta) & \end{smallmatrix}\bigr]$
+are less than unity in modulus.
+
+```{prf:assumption} A.1 (unique fixed point)
+:label: ass-ms-a1
+
+The operator $T$ has a unique fixed point $\beta_f = T(\beta_f)$ with
+$\beta_f \in D_s$.
+```
+
+```{prf:assumption} A.2 (smoothness)
+:label: ass-ms-a2
+
+$T$ is twice differentiable and $A, B, V$ each have one derivative in $D_s$.
+```
+
+```{prf:assumption} A.3 (nonsingular covariance)
+:label: ass-ms-a3
+
+The covariance matrix $M_{z_2}(\beta_f)$ is nonsingular.
+```
+
+```{prf:assumption} A.4 (gain sequence)
+:label: ass-ms-a4
+
+For all $t$, $\alpha_t > 0$; $\alpha_t$ is non-decreasing in $t$; $\alpha_t \to 1$
+as $t \to \infty$; and $\limsup_{t \to \infty} t\,|\alpha_t - \alpha_{t-1}| = K < \infty$.
+```
+
+```{prf:assumption} A.5 (shocks)
+:label: ass-ms-a5
+
+The vector $u_t$ is serially independent, and $E|u_{it}|^p < \infty$ for all
+$p > 1$ and all $i = 1, \ldots, m$.
+```
+
+```{prf:assumption} A.6 (boundedness along a subsequence)
+:label: ass-ms-a6
+
+There exist a set $\Omega_0$ with $P(\Omega_0) = 1$, random variables
+$C_1(\omega)$ and $C_2(\omega)$, and a subsequence $\{t_k(\omega)\}$ such that
+
+$$
+|z_{2t_k}(\omega)| < C_1(\omega) \quad\text{and}\quad |R_{t_k}(\omega)| < C_2(\omega)
+$$
+
+for all $\omega \in \Omega_0$ and all $k = 1, 2, \ldots$.
+```
+
+```{prf:assumption} A.7 (projection or compactness)
+:label: ass-ms-a7
+
+Either
 
-The fixed point of {eq}`eq:full_ode` is $(\beta_f, R_f)$
-where $R_f = M_{z_2}(\beta_f)$.
+- **(A.7.1)** $D_1 = D_2 = \mathbb{R}^{n_1 \times (n_2)^3}$, and given the
+  set $\Omega_0$ and subsequence $\{t_k\}$ from {prf:ref}`ass-ms-a6`, there
+  exists a compact $D' \subset D_s$ with $\beta_{t_k}(\omega) \in D'$ for all
+  $k$ and all $\omega \in \Omega_0$; moreover, for any initial condition
+  $(\beta(0), R(0))$ with $\beta(0) \in D'$ and $|R(0)| < C_2(\omega)$,
+  trajectories of {eq}`eq:full_ode` never leave a closed subset of $D_s$;
 
-### What the assumptions do
+- **or (A.7.2)** $D_2$ is closed, $D_1$ is open and bounded, $\beta \in D_s$
+  for every $(\beta, R) \in D_1$, and trajectories of {eq}`eq:full_ode` with
+  initial conditions in $D_2$ never leave a closed subset of $D_1$.
+```
 
-The sufficient conditions in {cite:t}`MarcetSargent1989jet` divide naturally into
-regularity assumptions and boundedness assumptions.
+Let $D_A$ denote the domain of attraction of the unique equilibrium
+$(\beta_f, R_f)$ of {eq}`eq:full_ode`.
 
-The regularity assumptions require a unique fixed point, smooth maps
-$T, A, B, V$, a nonsingular second-moment matrix at the fixed point,
-well-behaved gain sequence $\alpha_t/t$, and shocks with enough moments.
+### Convergence of least squares
 
-The harder assumptions are the boundedness conditions A.6--A.7.
+```{prf:proposition}
+:label: prop-ms-convergence
 
-Assumption A.6 requires the regressors and estimates to return to bounded sets
-along a subsequence with probability one.
+Assume (A.1)–(A.6). If either
 
-Assumption A.7 requires either an unrestricted algorithm whose ODE paths stay in
-a compact part of the stationarity region, or a projection facility whose ODE
-paths point back toward the interior of the projection set.
+- (A.7.1) is satisfied and $D' \subset D_A$, or
+- (A.7.2) is satisfied and $D_1 \subset D_A$,
 
-When $z_{2t}$ contains only exogenous ergodic variables, A.6 is usually
-automatic.
+then $\beta_t \to \beta_f$ almost surely as $t \to \infty$.
+```
 
-When $z_{2t}$ contains endogenous variables, as in the investment example below,
-the boundedness argument is a separate and more delicate part of the proof.
+{prf:ref}`prop-ms-convergence` reduces almost-sure convergence of recursive
+least squares to *stability* of the ODE {eq}`eq:full_ode` at $(\beta_f, R_f)$
+plus a boundedness guarantee for the sample path.
 
 ### Stability governs convergence
 
@@ -307,37 +348,53 @@ Let $\mathcal{M}$ be the Jacobian matrix of $T(\beta) - \beta$ evaluated at the
 REE $\beta_f$:
 
 $$
-\mathcal{M} = \frac{d\,\text{col}(T(\beta) - \beta)}{d\,\text{col}(\beta)^\top}\Bigg|_{\beta=\beta_f} .
+\mathcal{M} = \frac{d\,\text{col}(T(\beta) - \beta)}{d\,\text{col}(\beta)^\top}\Bigg|_{\beta=\beta_f} ,
 $$ (eq:jacobian)
 
-**Proposition 3** of {cite:t}`MarcetSargent1989jet` establishes that the Jacobian of
-the full system {eq}`eq:full_ode` at $(\beta_f, R_f)$ has $n_2^2$ repeated
-eigenvalues equal to $-1$ (from the $R$ equation), plus the eigenvalues of
-$\mathcal{M}$ (from the $\beta$ equation).
+and let $h(\beta, R)$ denote the Jacobian of the right-hand side of the full
+ODE {eq}`eq:full_ode` after stacking $(\beta, R)$ into a column vector.
+
+```{prf:proposition}
+:label: prop-ms-jacobian-eigenvalues
+
+The matrix $h(\beta_f, R_f)$ has $(n_2)^2$ repeated eigenvalues equal to
+$-1$; its remaining $n_1 n_2$ eigenvalues coincide with the
+eigenvalues of $\mathcal{M}$.
+```
 
 Consequently:
 
-* If all eigenvalues of $\mathcal{M}$ have **strictly negative real parts**, both
-  {eq}`eq:small_ode` and {eq}`eq:full_ode` are locally stable.
+* If all eigenvalues of $\mathcal{M}$ have *strictly negative real parts*, both
+  {eq}`eq:small_ode` and {eq}`eq:full_ode` are locally stable, and
+  {prf:ref}`prop-ms-convergence` then yields $\beta_t \to \beta_f$ almost
+  surely.
 
-  Under suitable
-  boundedness conditions, Proposition 1 guarantees $\beta_t \to \beta_f$ **almost
-  surely**.
+* If any eigenvalue of $\mathcal{M}$ has *positive real part*, then the next
+  proposition shows that convergence is impossible.
 
-* If any eigenvalue of $\mathcal{M}$ has **positive real part**, then
-  $P(\beta_t \to \beta_f) = 0$ — convergence is **impossible**.
+```{prf:proposition}
+:label: prop-ms-necessity
 
-The stability condition $\text{Re}(\lambda_i(\mathcal{M})) < 0$ for all $i$ is
-what the E-stability literature (see {cite:t}`Evans1985`) calls **E-stability**: the
-REE is a stable rest point of the "expectational dynamics" $\dot\beta = T(\beta) - \beta$.
+Assume (A.1)–(A.5).
+
+1. Let $\hat\beta \neq \beta_f$ and suppose $M_{z_2}(\hat\beta)$ is positive
+   definite and $\hat\beta \in \mathrm{int}(D_2)$. Then $P(\beta_t \to \hat\beta) = 0$.
+
+2. If $h(\beta_f, R_f)$ has at least one eigenvalue with strictly positive real
+   part, then $P(\beta_t \to \beta_f) = 0$.
+```
+
+The stability condition $\mathrm{Re}(\lambda_i(\mathcal{M})) < 0$ for all $i$ is
+what the E-stability literature (see {cite:t}`Evans1985`) calls **E-stability**:
+the REE is a stable rest point of the expectational dynamics
+$\dot\beta = T(\beta) - \beta$.
 
 ### The projection facility
 
 E-stability is necessary but not quite sufficient for almost-sure convergence.
 
 Ljung's theorem requires the sample path $(\beta_t, R_t)$ to remain in a
-**bounded region** with probability one (assumptions A.6–A.7 of
-{cite:t}`MarcetSargent1989jet`).
+*bounded region* with probability one.
 
 This boundedness is the job of the **projection
 facility**.
@@ -366,8 +423,8 @@ The set $D_2
 \subset D_1$ is a slightly smaller "safe" region to which the algorithm is
 retracted whenever it threatens to leave $D_1$.
 
-The facility can be thought of as forcing agents to **discard observations that
-are inconsistent with their priors** — a form of bounded rationality that is
+The facility can be thought of as forcing agents to *discard observations that
+are inconsistent with their priors*, a form of bounded rationality that is
 necessary for the mathematical argument but innocuous in practice.
 
 #### Why it is needed
@@ -381,27 +438,38 @@ the algorithm to revisit a compact set infinitely often; the projection facility
 guarantees this by construction.
 
 Formally, {cite:t}`MarcetSargent1989jet` require that the ODE trajectories
-originating in $D_1$ point **inward** at the boundary $\partial D_1$ — that is,
+originating in $D_1$ point *inward* at the boundary $\partial D_1$; that is,
 the vector field $T(\beta) - \beta$ must point back into $D_1$ everywhere on its
 boundary.
 
-When this holds (Assumption A.7.2), the projection is **invoked only
-finitely many times** with probability one, and after the last invocation the
-algorithm runs as plain RLS.
+When this holds, the projection is *invoked only finitely many times* with
+probability one, and after the last invocation the algorithm runs as plain RLS.
+
+```{prf:corollary}
+:label: cor-ms-projection-dichotomy
+
+Assume (A.1)–(A.6), that $(\beta, R) \in D_1$ implies $\beta \in D_s$, and
+that $D_1$ is open and bounded with $D_1 \subset D_A$. Then for some
+subsequence $\{t_k(\omega)\}$,
+
+$$
+P(\beta_t \to \beta_f) + P\bigl(\beta_{t_k} \to (D_1 \setminus D_2)\bigr) = 1.
+$$
+```
 
-Corollary 1 of {cite:t}`MarcetSargent1989jet`
-formalises this: either $\beta_t \to \beta_f$ a.s., or $\beta_t$ clusters on the
-boundary $\partial D_1 \setminus D_2$ — but the latter event has probability zero
-when the ODE trajectories point inward.
+The second event has probability zero whenever the ODE trajectories point
+inward at $\partial D_1$, in which case
+{prf:ref}`cor-ms-projection-dichotomy` reduces to $\beta_t \to \beta_f$ almost
+surely.
 
-#### The exogenous-regressor case (Corollary 2)
+#### The exogenous-regressor case
 
-When the regressors $z_{2t}$ are **exogenous** — so that $M_{z_2}(\beta) \equiv M$
-does not depend on $\beta$ — a particularly clean sufficient condition for
-convergence is available (Corollary 2 of {cite:t}`MarcetSargent1989jet`).
+When the regressors $z_{2t}$ are *exogenous*, so that $E(z_{2t}z_{2t}^\top) =
+M_{z_2}(\beta) \equiv M$ does not depend on $\beta$, the verification of the
+boundary condition becomes routine.
 
-In the notation of the paper, let $H(\beta)$ describe the mean-value slope of
-the small-ODE drift:
+Let $H(\beta)$ be the mean-value slope of the small-ODE drift, i.e. the matrix
+satisfying
 
 $$
 \operatorname{col}\{[T(\beta)-\beta]-[T(\beta_f)-\beta_f]\}
@@ -409,14 +477,37 @@ $$
 H(\beta)\operatorname{col}(\beta-\beta_f).
 $$ (eq:corollary2_cond)
 
-For the scalar linear examples, this reduces to the familiar requirement that
-the slope of $T(\beta)-\beta$ be negative.
+```{prf:corollary}
+:label: cor-ms-exogenous
+
+Consider the algorithm defined by {eq}`eq:rls_beta`–{eq}`eq:rls_R` with
+projection rule {eq}`eq:projection`. Choose $0 < K' < K < \infty$ and assume
+
+1. (A.1)–(A.5) hold;
+2. $z_{2t}$ is exogenous, so that $E(z_{2t}z_{2t}^\top) = M_{z_2}(\beta) \equiv M$;
+3. the small ODE $\dot\beta = T(\beta) - \beta$ is globally stable in
+   $\mathbb{R}^{n_1 \times n_2}$;
+4. there exists $\bar\varepsilon > 0$ such that for all
+   $0 < \varepsilon \leq \bar\varepsilon$ and all $\beta$ with
+   $|\beta - \beta_f| = K$, every eigenvalue of
+   $[I(1-\varepsilon) + \varepsilon H(\beta)]^\top
+   [I(1-\varepsilon) + \varepsilon H(\beta)]$ has modulus less than
+   $\alpha^2$ with $\alpha < 1$.
+
+Take
+$D_1 = \{(\beta, R) : |\beta - \beta_f| < K\}$ and
+$D_2 = \{(\beta, R) : |\beta - \beta_f| \leq K'\}$, and let the projection
+rule retract to any value with $|\beta - \beta_f| \leq K'$.
+
+Then $\beta_t \to \beta_f$ almost surely.
+```
 
-Under this condition one can take $D_1$ to be a ball of radius $K$ around
-$\beta_f$, and the boundary condition is automatically satisfied.
+For the scalar linear examples below, condition (4) reduces to the familiar
+requirement that the slope of $T(\beta) - \beta$ be negative.
 
 For the first four examples below, $T$ is linear and $M_{z_2}$ is independent of
-$\beta$, so Corollary 2 reduces to checking stability of the small ODE.
+$\beta$, so {prf:ref}`cor-ms-exogenous` reduces to checking stability of the
+small ODE.
 
 ```{note}
 In the scalar self-referential examples studied here (Bray, Bray–Savin,
@@ -450,84 +541,68 @@ mystnb:
 def simulate_rls_with_projection(T_map, σ_u, β0, K_proj,
                                  T_periods=500, N_paths=50, seed=0):
     """
-    Simulate RLS with a scalar projection facility.
-
-    The facility keeps β_t in [-K_proj, K_proj].  Whenever the unconstrained
-    update would push β outside this interval, β is retracted to 0
-    (an arbitrary point in D2 = {|β| <= K_proj/2}).
-
-    Returns
-    -------
-    β_paths      : (N_paths, T_periods) array of belief paths
-    n_projections   : (N_paths,) array counting projection invocations per path
-    first_proj_free : (N_paths,) array of first period with no further projections
+    Simulate RLS with a projection facility that retracts β_t to 0
+    whenever the update would push it outside [-K_proj, K_proj].
     """
     rng = np.random.default_rng(seed)
-    β_paths    = np.empty((N_paths, T_periods))
+    β_paths = np.empty((N_paths, T_periods))
     n_projections = np.zeros(N_paths, dtype=int)
-    last_proj     = np.full(N_paths, -1, dtype=int)
+    last_proj = np.full(N_paths, -1, dtype=int)
 
     for i in range(N_paths):
         β = β0
-        R    = 1.0
+        R = 1.0
 
         for t in range(T_periods):
             u_t = rng.normal(0, σ_u)
-            z1  = T_map(β) + u_t          # z2 = 1 (constant regressor)
+            z1 = T_map(β) + u_t
 
-            # Unconstrained RLS update
-            R_new    = R    + (1.0 / (t + 1)) * (1.0 - R)
+            R_new = R + (1.0 / (t + 1)) * (1.0 - R)
             β_new = β + (1.0 / (t + 1)) / R_new * (z1 - β)
 
-            # Projection facility: retract to D2 = {0} if outside D1
             if abs(β_new) > K_proj:
-                β_new = 0.0           # retract to interior of D2
+                β_new = 0.0
                 n_projections[i] += 1
                 last_proj[i] = t
 
             β = β_new
-            R    = max(R_new, 1e-8)
+            R = max(R_new, 1e-8)
             β_paths[i, t] = β
 
-    # First period after which no further projections occur
-    first_proj_free = last_proj + 1   # -1 + 1 = 0 if never projected
+    first_proj_free = last_proj + 1
 
     return β_paths, n_projections, first_proj_free
 
 
-# Run the simulation
 a_bray_pf, b_bray_pf, σ_pf = 1.0, 0.6, 1.5
-T_bray_pf  = lambda β: a_bray_pf + b_bray_pf * β
-β_f_pf  = a_bray_pf / (1 - b_bray_pf)
-β0_far  = 8.0    # well outside D1 = {|β| < 5}
-K_pf       = 5.0
-T_pf_sim   = 600
-N_pf_sim   = 80
+T_bray_pf = lambda β: a_bray_pf + b_bray_pf * β
+β_f_pf = a_bray_pf / (1 - b_bray_pf)
+β0_far = 8.0
+K_pf = 5.0
+T_pf_sim = 600
+N_pf_sim = 80
 
 paths_pf, n_proj, first_free = simulate_rls_with_projection(
     T_bray_pf, σ_pf, β0_far, K_pf,
     T_periods=T_pf_sim, N_paths=N_pf_sim)
 
-# Also run without projection for comparison
 paths_no_pf = simulate_rls_scalar(
     T_bray_pf, σ_pf, β0_far,
     T_periods=T_pf_sim, N_paths=N_pf_sim, seed=0)
 
 fig = plt.figure(figsize=(15, 10))
-gs  = GridSpec(2, 2, figure=fig)
+gs = GridSpec(2, 2, figure=fig)
 
-# Top left: paths with projection
 ax1 = fig.add_subplot(gs[0, 0])
 for i in range(min(30, N_pf_sim)):
     ax1.plot(paths_pf[i], color='steelblue', alpha=0.25, lw=2)
 ax1.plot(np.mean(paths_pf, axis=0), color='navy', lw=2, label='average')
 ax1.axhline(β_f_pf, color='red', ls='--', lw=2,
             label=f'$\\beta_f={β_f_pf:.1f}$')
-ax1.axhline( K_pf, color='gray', ls=':', lw=2, label=f'$D_1$ boundary ($K={K_pf}$)')
+ax1.axhline(K_pf, color='gray', ls=':', lw=2, label=f'$D_1$ boundary ($K={K_pf}$)')
 ax1.axhline(-K_pf, color='gray', ls=':', lw=2)
 ax1.set_xlabel('$t$'); ax1.set_ylabel('$\\beta_t$'); ax1.legend(fontsize=8)
 
-# Top right: paths without projection
 ax2 = fig.add_subplot(gs[0, 1])
 for i in range(min(30, N_pf_sim)):
     ax2.plot(paths_no_pf[i], color='darkorange', alpha=0.25, lw=2)
@@ -536,14 +611,12 @@ ax2.axhline(β_f_pf, color='red', ls='--', lw=2,
             label=f'$\\beta_f={β_f_pf:.1f}$')
 ax2.set_xlabel('$t$'); ax2.set_ylabel('$\\beta_t$'); ax2.legend(fontsize=8)
 
-# Bottom left: histogram of projection counts
 ax3 = fig.add_subplot(gs[1, 0])
 ax3.hist(n_proj, bins=range(0, int(n_proj.max()) + 2),
          color='steelblue', edgecolor='white', alpha=0.8)
 ax3.set_xlabel('number of projections invoked')
 ax3.set_ylabel('number of paths')
 
-# Bottom right: period of last projection
 ax4 = fig.add_subplot(gs[1, 1])
 ax4.hist(first_free[n_proj > 0], bins=20,
          color='darkorange', edgecolor='white', alpha=0.8)
@@ -559,9 +632,9 @@ print(f"Max number of projections:           {n_proj.max()}")
 print(f"Mean last-projection period:         {first_free[n_proj>0].mean():.1f}")
 ```
 
-The simulation illustrates the key theoretical point from Corollary 1: the
-projection is invoked only a **finite number of times** on almost every sample
-path.
+The simulation illustrates the key theoretical point from
+{prf:ref}`cor-ms-projection-dichotomy`: the projection is invoked only a
+*finite number of times* on almost every sample path.
 
 After the last invocation the algorithm runs as unconstrained RLS and
 converges to $\beta_f$ at the usual rate.
@@ -572,13 +645,13 @@ theorem requires.
 
 ## Five illustrative examples
 
-We now work through five examples from Section 4 of {cite:t}`MarcetSargent1989jet`,
+We now work through five examples from {cite:t}`MarcetSargent1989jet`,
 computing the ODE, finding the REE, checking E-stability, and simulating the RLS
 learning path.
 
 ### Example 1: ordinary linear stochastic difference equations
 
-The first example in Section 4 has no self-referential component.
+The first example has no self-referential component.
 
 Let the actual law of motion be fixed, with $T(\beta)=\Gamma$ for a stable
 matrix $\Gamma$ and with $V(\beta)=I$.
@@ -587,7 +660,7 @@ The REE is $\beta_f=\Gamma$.
 
 Since $T$ is constant, $H(\beta)=-I$ and the small ODE is globally stable.
 
-Corollary 2 then implies that recursive least squares converges almost surely
+{prf:ref}`cor-ms-exogenous` then implies that recursive least squares converges almost surely
 to the true law of motion.
 
 This benchmark shows that the Marcet-Sargent machinery nests ordinary strong
@@ -715,24 +788,20 @@ mystnb:
     caption: Bray learning dynamics
     name: fig-bray-learning-dynamics
 ---
-# ------------------------------------------------------------------
-# Bray's cobweb model: T(β) = a + b*β,  REE = a/(1-b)
-# ------------------------------------------------------------------
 a_bray, b_bray, σ_bray = 1.0, 0.6, 1.0
 T_bray = lambda β: a_bray + b_bray * β
 β_f_bray = a_bray / (1 - b_bray)
 
-β0_bray = 0.0   # start well below the REE
+β0_bray = 0.0
 T_sim = 400
 N_sim = 80
 
 β_paths_bray = simulate_rls_scalar(T_bray, σ_bray, β0_bray,
                                       T_periods=T_sim, N_paths=N_sim)
 
-# ODE solution for two starting values
 ode_bray = lambda β: a_bray + b_bray * β - β
-t_ode, sol_low  = solve_ode(ode_bray, 0.0)
-_,     sol_high = solve_ode(ode_bray, 4.5)
+t_ode, sol_low = solve_ode(ode_bray, 0.0)
+_, sol_high = solve_ode(ode_bray, 4.5)
 
 fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
@@ -771,9 +840,6 @@ mystnb:
     caption: Bray-Savin learning dynamics
     name: fig-bray-savin-learning-dynamics
 ---
-# ------------------------------------------------------------------
-# Bray–Savin: T(β) = m + a*β,  REE = m/(1-a)
-# ------------------------------------------------------------------
 m_bs, a_bs, σ_bs = 0.5, 0.7, 1.0
 T_bs = lambda β: m_bs + a_bs * β
 β_f_bs = m_bs / (1 - a_bs)
@@ -782,8 +848,8 @@ T_bs = lambda β: m_bs + a_bs * β
                                     T_periods=T_sim, N_paths=N_sim)
 
 ode_bs = lambda β: T_bs(β) - β
-t_ode_bs, sol_bs_low  = solve_ode(ode_bs, 0.0)
-_,         sol_bs_high = solve_ode(ode_bs, 4.0)
+t_ode_bs, sol_bs_low = solve_ode(ode_bs, 0.0)
+_, sol_bs_high = solve_ode(ode_bs, 4.0)
 
 fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
@@ -820,10 +886,6 @@ mystnb:
     caption: Present-value learning dynamics
     name: fig-present-value-learning-dynamics
 ---
-# ------------------------------------------------------------------
-# Present-value model: T(β) = (lambda*β + 1)*ρ
-# REE = ρ / (1 - lambda*ρ)
-# ------------------------------------------------------------------
 λ, ρ_pv, σ_pv = 0.8, 0.9, 1.0
 T_pv = lambda β: (λ * β + 1) * ρ_pv
 β_f_pv = ρ_pv / (1 - λ * ρ_pv)
@@ -832,8 +894,8 @@ T_pv = lambda β: (λ * β + 1) * ρ_pv
                                     T_periods=T_sim, N_paths=N_sim)
 
 ode_pv = lambda β: T_pv(β) - β
-t_ode_pv, sol_pv_low  = solve_ode(ode_pv, 0.0, t_span=(0, 50))
-_,         sol_pv_high = solve_ode(ode_pv, 10.0, t_span=(0, 50))
+t_ode_pv, sol_pv_low = solve_ode(ode_pv, 0.0, t_span=(0, 50))
+_, sol_pv_high = solve_ode(ode_pv, 10.0, t_span=(0, 50))
 
 fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
@@ -872,12 +934,9 @@ mystnb:
     caption: Unstable Bray dynamics
     name: fig-unstable-bray-dynamics
 ---
-# ------------------------------------------------------------------
-# Unstable case: Bray's model with b > 1
-# ------------------------------------------------------------------
 b_unstable = 1.4
 T_unstable = lambda β: a_bray + b_unstable * β
-β_f_unstable = a_bray / (1 - b_unstable)   # negative
+β_f_unstable = a_bray / (1 - b_unstable)
 
 β_paths_unstable = simulate_rls_scalar(
     T_unstable, σ_bray, β0=0.0,
@@ -885,7 +944,6 @@ T_unstable = lambda β: a_bray + b_unstable * β
 
 ode_unstable = lambda β: T_unstable(β) - β
 
-# Phase diagram: plot drift for β in [-5, 5]
 β_grid = np.linspace(-5, 5, 300)
 drift = np.array([ode_unstable(b) for b in β_grid])
 
@@ -921,10 +979,10 @@ print(f"Jacobian M = b - 1 = {b_unstable - 1:.2f}  (> 0: NOT E-stable)")
 The E-stability condition has a clean geometric interpretation.
 
 At the REE
-$\beta_f$, the small ODE {eq}`eq:small_ode` must have trajectories **pointing
-inward**.
+$\beta_f$, the small ODE {eq}`eq:small_ode` must have trajectories *pointing
+inward*.
 
-This requires the slope $dT/d\beta - 1$ to be **negative** at $\beta_f$.
+This requires the slope $dT/d\beta - 1$ to be *negative* at $\beta_f$.
 
 The figure below plots the phase diagrams for all three scalar examples side by
 side.
@@ -955,7 +1013,6 @@ for ax, (name, ode_fn, bf, color) in zip(axes, models):
                     color=color, alpha=0.12)
     ax.fill_between(β_vec, drift, 0, where=(drift < 0),
                     color=color, alpha=0.12)
-    # Draw arrows showing direction of drift
     for bv in np.linspace(β_vec[20], β_vec[-20], 7):
         d = ode_fn(bv)
         ax.annotate('', xy=(bv + 0.3*np.sign(d), 0),
@@ -983,15 +1040,7 @@ mystnb:
     name: fig-investment-phase-portrait
 ---
 def T_invest(β, b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, ρ_w=0.5):
-    """
-    Mapping T for the investment model (scalar version of equations 11 in
-    Marcet–Sargent 1989).
-
-    β = [β1, β2]
-    T1(β1) = (1 - β1*b) / (1 - β1*b + d^{-1} f^2 A1 N)
-    T2(β1, β2) = -N/(d*(1-ρ_w*b)) * (1 - β1*b + f^2 A1 β2 b*ρ_w)
-                       / (1 - β1*b + d^{-1} f^2 A1 N) * ρ_w
-    """
+    """Mapping T for the investment model with β = [β1, β2]."""
     b1, b2 = β
     denom1 = 1 - b1*b + (1/d)*f**2*A1*N
     T1 = (1 - b1*b) / denom1
@@ -1005,14 +1054,12 @@ def ode_invest(t, β, **kwargs):
     return Tb - β
 
 
-# REE: solve T(β) = β numerically
 from scipy.optimize import fsolve
 
 params = dict(b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, ρ_w=0.5)
 β_f_inv = fsolve(lambda b: T_invest(b, **params) - b, [0.5, 0.1])
 print(f"REE: β_f = {β_f_inv}")
 
-# Check E-stability via Jacobian
 from numpy import linalg as la
 
 eps = 1e-6
@@ -1026,10 +1073,8 @@ eigs = la.eigvals(M)
 print(f"Jacobian M eigenvalues: {eigs}")
 print(f"E-stable: {np.all(eigs.real < 0)}")
 
-# Solve ODE from several initial conditions
 fig, ax = plt.subplots(figsize=(8, 6))
 
-# Plot the vector field
 b1_grid = np.linspace(-0.1, 1.2, 20)
 b2_grid = np.linspace(-0.8, 0.5, 20)
 B1, B2 = np.meshgrid(b1_grid, b2_grid)
@@ -1045,7 +1090,6 @@ speed[speed == 0] = 1e-8
 ax.streamplot(b1_grid, b2_grid, U, V_field, color=speed,
               cmap='Blues', density=1.3, linewidth=1)
 
-# Plot trajectories from several starts
 starts = [(0.1, 0.0), (0.9, 0.4), (1.1, -0.6), (0.3, -0.7)]
 colors_traj = ['red', 'darkorange', 'green', 'purple']
 for (b10, b20), col in zip(starts, colors_traj):
@@ -1066,18 +1110,10 @@ plt.show()
 
 ## Necessary condition: only REE can be limit points
 
-Proposition 2(i) of {cite:t}`MarcetSargent1989jet` shows that **non-REE limit points
-have probability zero**: for any $\hat\beta \neq \beta_f$ in the interior of the
-domain,
-
-$$
-P(\beta_t \to \hat\beta) = 0 .
-$$
-
-This is a converse: RLS either converges to the REE or it diverges.
+{prf:ref}`prop-ms-necessity` is a converse to {prf:ref}`prop-ms-convergence`:
+RLS either converges to the REE or fails to converge at all.
 
-It
-cannot converge to a non-equilibrium fixed point.
+It cannot converge to a non-equilibrium fixed point.
 
 The following simulation makes this vivid by starting agents with an initial
 belief that happens to satisfy $T(\beta_0) \approx \beta_0$ only approximately.
@@ -1089,9 +1125,7 @@ mystnb:
     caption: Non-REE starts
     name: fig-non-ree-starts
 ---
-# Illustration: starting near a non-fixed-point of T still sends β to β_f
-# (Bray model, stable case b=0.6)
-β_false_rest = 3.0   # T(3.0) = 1 + 0.6*3 = 2.8 ≠ 3
+β_false_rest = 3.0
 paths_from_false = simulate_rls_scalar(
     T_bray, σ_bray, β0=β_false_rest,
     T_periods=300, N_paths=60, seed=7)
@@ -1131,8 +1165,7 @@ Because the agent's beliefs shift
 the equilibrium price, the data the agent uses to update beliefs are themselves
 generated by a non-stationary process.
 
-As {cite:t}`MarcetSargent1989jet` note (p.
-338, footnote 2):
+As {cite:t}`MarcetSargent1989jet` put it,
 
 > *"The models do not incorporate fully optimal behavior or rational expectations,
 > because agents operate under the continually falsified assumption that the law of
@@ -1180,7 +1213,7 @@ Key takeaways:
    points of this ODE (REE) are possible limit points of RLS.
 
 4. **E-stability**: the REE is the almost-sure limit of RLS if and only if
-   it is a **locally stable** fixed point of the small ODE — that is, if all
+   it is a locally stable fixed point of the small ODE, that is, if all
    eigenvalues of the Jacobian $\mathcal{M} = dT/d\beta - I$ at $\beta_f$ have
    strictly negative real parts.
 
@@ -1264,7 +1297,7 @@ print("return to the fixed point.  Convergence still occurs but takes longer.")
 
 Necessary condition: non-REE limit points
 
-Proposition 2(i) of {cite:t}`MarcetSargent1989jet` states that $P(\beta_t \to \hat\beta) = 0$
+{prf:ref}`prop-ms-necessity` states that $P(\beta_t \to \hat\beta) = 0$
 for any $\hat\beta \neq \beta_f$ in the interior.
 
 (a) Using the Bray model with $a=1$, $b=0.6$, simulate 100 paths of length
@@ -1293,7 +1326,6 @@ the paths diverge.
 ```{code-cell} ipython3
 fig, axes = plt.subplots(1, 2, figsize=(14, 5))
 
-# (a) far start, stable case
 T_st = lambda β: 1.0 + 0.6*β
 paths_far = simulate_rls_scalar(T_st, 1.0, β0=6.0,
                                 T_periods=600, N_paths=100, seed=1)
@@ -1305,9 +1337,8 @@ ax.axhline(2.5, color='red', ls='--', lw=2, label='$\\beta_f = 2.5$')
 ax.set_title('Stable ($b=0.6$): far start still converges')
 ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$'); ax.legend()
 
-# (b) unstable case, start near REE
 T_un = lambda β: 1.0 + 1.5*β
-β_f_un = 1.0 / (1 - 1.5)   # = -2
+β_f_un = 1.0 / (1 - 1.5)
 paths_un = simulate_rls_scalar(T_un, 1.0, β0=0.1,
                                T_periods=200, N_paths=50, seed=2)
 ax = axes[1]
@@ -1412,7 +1443,6 @@ for ax, lv, col in zip(axes.flat, λ_values, colors_λ):
     ax.plot(np.mean(paths_λ, axis=0), color=col, lw=2, label='RLS average')
 
     if bf is not None:
-        # ODE solution
         t_o, sol_o = solve_ode(ode_fn, 0.0, t_span=(0, 400), n_points=400)
         ax.plot(t_o, sol_o, color='black', ls='--', lw=2, label='ODE')
         ax.axhline(bf, color='red', ls=':', lw=2,
diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index cffd1eef6..12108b98b 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -28,13 +28,13 @@ kernelspec:
 
 ## Overview
 
-This lecture explores a classic question in economic theory: can agents **learn** their way to a rational expectations equilibrium?
+This lecture explores a classic question in economic theory: can agents *learn* their way to a rational expectations equilibrium?
 
 {cite:t}`BrayKreps1987` examine this question in a rigorously specified model.
 
 In a rational expectations equilibrium, agents use market prices to make inferences about other agents' private information.
 
-Each agent knows the **statistical relationship** between prices and the underlying payoff-relevant variables and that relationship is **correct** given the equilibrium.
+Each agent knows the *statistical relationship* between prices and the underlying payoff-relevant variables and that relationship is *correct* given the equilibrium.
 
 But this raises a question: where does that knowledge come from?
 
@@ -51,9 +51,7 @@ This lecture presents the Bray–Kreps framework, explains their benchmark examp
 
 We focus on {cite:t}`BrayKreps1987`, published in *Arrow and the Ascent of Modern Economic Theory*, which synthesizes earlier work by {cite:t}`Bray1982`, {cite:t}`BraySavin1984`, and the rational expectations literature of {cite:t}`Radner1979`, {cite:t}`grossman1976`, and {cite:t}`Jordan1982`.
 
-The local PDF version is the June 1981 Stanford Research Paper version of the same work.
-
-Let's start with the necessary imports.
+Let's start with the following imports.
 
 ```{code-cell} ipython3
 import numpy as np
@@ -126,7 +124,7 @@ $$
 x^n_t = \frac{\theta^n}{\sigma^2}(s_t - p_t).
 $$
 
-With $N$ agents and total risky-asset supply $N$, market clearing gives the **full communication price**
+With $N$ agents and total risky-asset supply $N$, market clearing gives the **full communication equilibrium price**
 
 $$
 p_t
@@ -182,13 +180,13 @@ The point is to illustrate how Bayesian posteriors concentrate when the likeliho
 
 ### Setup
 
-Agent $U$ **does not know** the equilibrium price function.
+Agent $U$ *does not know* the equilibrium price function.
 
 Specifically, $U$ does not know $b^*$.
 
 However, $U$ does know:
 * The distribution of $r_t$: $r_t \sim \mathcal{N}(0, \sigma^2)$ IID.
-* That the price function is **linear**: $p_t = a + b r_t$ for some unknown $b$.
+* That the price function is *linear*: $p_t = a + b r_t$ for some unknown $b$.
 * The value of $a = 0$.
 
 So $U$'s task is to learn the single parameter $b$ from observations of prices and (eventually) returns.
@@ -245,17 +243,23 @@ Each observation $(r_s, p_s)$ with $p_s = b r_s + 0$ is treated as a noisy signa
 
 For the simplified Gaussian model, standard Bayesian linear regression implies the following result.
 
-**Proposition:** *For any prior $(\mu_0, v_0)$ with $v_0 < \infty$, as $t \to \infty$:*
+```{prf:proposition}
+:label: prop-rle-gaussian-convergence
+
+For any prior $(\mu_0, v_0)$ with $v_0 < \infty$, as $t \to \infty$,
 
 $$
-\mu_t \xrightarrow{a.s.} b^*, \qquad v_t \xrightarrow{a.s.} 0
+\mu_t \xrightarrow{a.s.} b^*,
+\qquad
+v_t \xrightarrow{a.s.} 0.
 $$
 
-*That is, agent $U$'s posterior distribution on $b$ converges almost surely to a point mass at the true equilibrium value $b^*$.*
+That is, agent $U$'s posterior distribution on $b$ converges almost surely to a point mass at the true equilibrium value $b^*$.
+```
 
 This statement is included to make the simulation transparent.
 
-The formal propositions in {cite:t}`BrayKreps1987` are more general martingale convergence results for posterior assessments, and they are discussed below.
+The more general martingale convergence results for posterior assessments due to {cite:t}`BrayKreps1987` are discussed below.
 
 The intuition is straightforward:
 
@@ -273,19 +277,12 @@ We now implement the Bayesian learning dynamics and verify convergence numerical
 ### Parameters
 
 ```{code-cell} ipython3
-# True equilibrium parameters
-b_true = 2.0        # true b* in the REE
-
-# Distribution of fundamentals
-σ2 = 1.0            # variance of r_t
-
-# Prior on b
-μ_0 = 0.5           # prior mean (misspecified, true is 2.0)
-v_0 = 2.0           # prior variance (diffuse)
-
-# Simulation settings
-T = 300             # time periods
-N = 200             # number of Monte Carlo paths
+b_true = 2.0
+σ2 = 1.0
+μ_0 = 0.5
+v_0 = 2.0
+T = 300
+N = 200
 
 np.random.seed(42)
 ```
@@ -294,42 +291,18 @@ np.random.seed(42)
 
 ```{code-cell} ipython3
 def simulate_bayesian_learning(b_true, σ2, μ_0, v_0, T, N):
-    """
-    Simulate Bayesian learning of the REE slope parameter b*.
-
-    Parameters
-    ----------
-    b_true : true equilibrium slope
-    σ2     : variance of fundamentals r_t
-    μ_0    : prior mean on b
-    v_0    : prior variance on b
-    T      : number of time periods
-    N      : number of Monte Carlo paths
-
-    Returns
-    -------
-    μ_paths : array (N, T) of posterior means over time
-    v_paths : array (N, T) of posterior variances over time
-    """
-    # Draw fundamentals r_t for all paths
+    """Simulate Bayesian learning of the REE slope parameter b*."""
     r = np.random.normal(0, np.sqrt(σ2), size=(N, T))
-
-    # Equilibrium prices: p_t = b_true * r_t
     p = b_true * r
 
-    # Arrays to store posterior parameters
     μ_paths = np.empty((N, T))
     v_paths = np.empty((N, T))
 
     for i in range(N):
-        # Initialize prior
         precision = 1.0 / v_0
         weighted_sum = μ_0 / v_0
 
         for t in range(T):
-            # Each observation: p_s = b * r_s  =>  b = p_s / r_s (when r_s != 0)
-            # Likelihood contribution: precision += r_s^2 / σ2
-            #                          weighted_sum += r_s * p_s / σ2
             precision += r[i, t]**2 / σ2
             weighted_sum += r[i, t] * p[i, t] / σ2
 
@@ -365,7 +338,6 @@ fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
 t_range = np.arange(1, T + 1)
 
-# --- Left panel: posterior means ---
 ax = axes[0]
 for i in range(min(30, N)):
     ax.plot(t_range, μ_paths[i, :], color='steelblue', alpha=0.2, lw=2)
@@ -378,7 +350,6 @@ ax.set_xlabel('$t$')
 ax.set_ylabel('posterior mean $\\mu_t$')
 ax.legend()
 
-# --- Right panel: posterior variances ---
 ax = axes[1]
 for i in range(min(30, N)):
     ax.plot(t_range, v_paths[i, :], color='darkorange', alpha=0.2, lw=2)
@@ -386,7 +357,6 @@ for i in range(min(30, N)):
 ax.plot(t_range, np.mean(v_paths, axis=0), color='saddlebrown', lw=2,
         label='cross-path average')
 
-# Theoretical rate: v_t ≈ σ2 / (t * σ2) = 1/t for large t
 ax.plot(t_range, 1.0 / t_range, color='black', ls='--', lw=2,
         label='$1/t$ (theory)')
 ax.set_xlabel('$t$')
@@ -417,16 +387,10 @@ The following code computes the demand trajectories.
 
 ```{code-cell} ipython3
 def compute_demand(μ_t, p_t, σ2=1.0, θ_U=0.5):
-    """
-    Compute agent U's demand for the risky asset given beliefs μ_t.
-
-    x^U = (θ_U / σ2) * (r_hat - p_t)
-    where r_hat = p_t / μ_t is U's signal extraction.
-    """
+    """Agent U's demand x^U = (θ_U / σ2) * (p_t / μ_t - p_t)."""
     r_hat = p_t / μ_t
     return (θ_U / σ2) * (r_hat - p_t)
 
-# Single representative path
 i_rep = 0
 r_rep = np.random.normal(0, np.sqrt(σ2), T)
 p_rep = b_true * r_rep
@@ -436,7 +400,6 @@ demand_path = np.array([
     for t in range(T)
 ])
 
-# REE demand (what U would demand knowing b*)
 demand_ree = np.array([
     compute_demand(b_true, p_rep[t])
     for t in range(T)
@@ -481,10 +444,7 @@ The following code illustrates this point with a mixture prior.
 
 ```{code-cell} ipython3
 def simulate_two_parameters(b_values, σ2, T, N, seed=0):
-    """
-    Simulate learning when the prior is spread over two possible parameter values.
-    Nature draws the true value from b_values.
-    """
+    """Simulate learning when nature draws b* from b_values."""
     rng = np.random.default_rng(seed)
     b_true_draw = rng.choice(b_values, size=N)
 
@@ -495,7 +455,6 @@ def simulate_two_parameters(b_values, σ2, T, N, seed=0):
         r = rng.normal(0, np.sqrt(σ2), T)
         p = b_i * r
 
-        # Diffuse prior centered between the two equilibria
         μ_prior = np.mean(b_values)
         prec_prior = 1.0 / 4.0
         w_sum = μ_prior * prec_prior
@@ -538,7 +497,7 @@ plt.tight_layout()
 plt.show()
 ```
 
-As expected, agent $U$ learns the **correct** equilibrium as long as the model is correctly specified and the true equilibrium generates the data.
+As expected, agent $U$ learns the *correct* equilibrium as long as the model is correctly specified and the true equilibrium generates the data.
 
 The paper's non-identification example is different: with two informed agents, prices can reveal only the sum of their risk tolerances.
 
@@ -558,13 +517,8 @@ The formal Bray--Kreps model handles this by making the whole price process part
 def simulate_self_referential(b_true, σ2, μ_0, v_0, T, N,
                               α_demand=0.5):
     """
-    Simulate the self-referential learning model where prices depend on
-    current beliefs μ_t.
-
-    p_t = b_true * r_t + α_demand * (μ_t - b_true) * r_t
-
-    This captures the idea that as U's beliefs deviate from b*, the
-    equilibrium price is distorted.
+    Simulate the self-referential price rule
+    p_t = b_true * r_t + α_demand * (μ_t - b_true) * r_t.
     """
     rng = np.random.default_rng(10)
     r_all = rng.normal(0, np.sqrt(σ2), (N, T))
@@ -579,10 +533,8 @@ def simulate_self_referential(b_true, σ2, μ_0, v_0, T, N,
 
         for t in range(T):
             r_t = r_all[i, t]
-            # Price is partly driven by current beliefs
             p_t = b_true * r_t + α_demand * (μ_t - b_true) * r_t
 
-            # Update beliefs with this price
             prec += r_t**2 / σ2
             w_sum += r_t * p_t / σ2
             μ_t = w_sum / prec
@@ -631,21 +583,20 @@ plt.tight_layout()
 plt.show()
 ```
 
-## Convergence of beliefs
+## Convergence of posterior assessments
 
-Section 3 of {cite:t}`BrayKreps1987` proves two general convergence results.
+{cite:t}`BrayKreps1987` prove two general convergence results.
 
 Let $\Omega$ be the underlying state space, and let $H_t^n(p)$ be the information generated for agent $n$ by private information and observed equilibrium prices up to date $t$.
 
-For any event $A$, the posterior assessment
+For any event $A$, the posterior assessment $P^n(A \mid H_t^n(p))$ is a bounded martingale in $t$.
 
-$$
-P^n(A \mid H_t^n(p))
-$$
+The first convergence result is therefore an application of the martingale convergence theorem.
 
-is a bounded martingale in $t$.
+```{prf:proposition}
+:label: prop-bk-event-convergence
 
-Their Proposition 1 is therefore
+For any event $A$,
 
 $$
 P^n(A \mid H_t^n(p))
@@ -654,22 +605,30 @@ P^n(A \mid H_\infty^n(p)),
 \qquad
 H_\infty^n(p)=\bigvee_{t \geq 0} H_t^n(p).
 $$
+```
 
-This is convergence of posterior assessments, not yet convergence to the truth.
+This is convergence of posterior assessments, not yet convergence to "correct beliefs".
 
 If two agents' priors are mutually singular, the almost-sure statements need not hold on a common objective-probability set.
 
-If their priors have the same null sets, simultaneous convergence is obtained outside a common null set.
+If the priors have identical null sets, simultaneous convergence holds outside a common null set.
 
-Their Proposition 2 strengthens the result from events to whole posterior distributions.
+The second result sharpens the convergence from events to entire posterior distributions.
 
-When the parameter space is a complete separable metric space with its Borel sigma-field, regular posterior measures over that parameter space converge weakly almost surely.
+```{prf:proposition}
+:label: prop-bk-measure-convergence
+
+When the parameter space $\Theta$ is a complete separable metric space whose
+Borel $\sigma$-field makes it a Borel space, fixed regular versions of the
+conditional probabilities $P_t^n$ converge weakly $P^n$-a.s. to a regular
+version $P_\infty^n$.
+```
 
-Thus rational Bayesian learning always produces a limiting posterior, but additional identification assumptions are needed to say that the limiting posterior is correct.
+Thus rational Bayesian learning always produces a limiting posterior, but additional regularity is needed to ensure the limiting posterior assesses the truth correctly.
 
-## Identification in the Section 2 example
+## Sharpening the convergence result
 
-Section 4 returns to the two-agent example in which agent $U$ is uncertain about $\theta^I$.
+Now return to the two-agent example in which agent $U$ is uncertain about $\theta^I$.
 
 Let $F_t$ be agent $U$'s posterior distribution over $\theta^I$ after observing the previous price, allocation, and return data.
 
@@ -691,13 +650,13 @@ Third, in this example that limiting price distribution is stochastically decrea
 
 Therefore the long-run distribution of prices identifies the true value of $\theta^I$.
 
-This is the paper's concrete route from convergence of beliefs to convergence to correct beliefs.
+This is the paper's concrete route from convergence of posterior assessments to convergence to "correct beliefs".
 
 It relies on smoothness, ergodicity, and identification, rather than on martingale convergence alone.
 
 ## Obstacles to convergence
 
-While the positive convergence results are elegant, {cite:t}`BrayKreps1987` are careful to document when learning **fails** to produce convergence to REE.
+While the positive convergence results are elegant, {cite:t}`BrayKreps1987` are careful to document when learning *fails* to produce convergence to REE.
 
 ### Obstacle 1: price maps might not settle down
 
@@ -721,7 +680,7 @@ For decisions in that example, learning the sum is enough, but it is not learnin
 
 ### Obstacle 3: the truth might be outside the model
 
-Section 5 compares the paper's rational-learning model with an example of {cite:t}`BlumeEasley1982`.
+Bray and Kreps compare their rational-learning model with an example of {cite:t}`BlumeEasley1982`.
 
 In that example, agents can converge to an incorrect model because the true stable price relation has zero prior probability under the models they entertain.
 
@@ -755,7 +714,7 @@ Instead, they try to infer the price-state relation from data generated while be
 
 This is the original problem mentioned at the start of the paper: learning changes behavior, and behavior changes the price-state relation being learned.
 
-### Why rational learning has limited reach
+### Why rational learning has limited value
 
 Bray and Kreps call the expanded-state-space formulation natural but also identify its main flaw.
 
@@ -783,7 +742,7 @@ In those models, agents estimate perceived laws of motion from observed data and
 
 Such rules are computationally tractable and can converge in important examples.
 
-But they are **"irrational"** in Bray and Kreps' specific sense.
+But they are *"irrational"* in Bray and Kreps' specific sense.
 
 An agent who already understood the full equilibrium model would not generally use those rules as the Bayesian optimum.
 
@@ -797,12 +756,12 @@ Their proposed discipline is that a stationary limiting equilibrium should not l
 
 In the long run, they argue, equilibrium expectations must either keep changing or become rational.
 
-There is a fundamental **epistemic tension** at the heart of learning about rational expectations equilibria:
+There is a fundamental tension at the heart of learning about rational expectations equilibria:
 
-* A fully rational (Bayesian, correctly specified) learner can only apply Bayes' rule to a model whose structure is *already known* but the structure of the REE is exactly what the agent is trying to learn.
+* A fully rational (Bayesian, correctly specified) learner can only apply Bayes' rule to a model whose structure is *already known*, but the structure of the REE is exactly what the agent is trying to learn.
 * A learner who uses an adaptive algorithm (OLS, least-mean-squares, etc.) can potentially converge to the REE, but only by using a rule that cannot be derived from Bayesian rationality applied to a correctly specified model.
 
-The Bray--Kreps rational-learning model avoids this tension by assumption: agent $U$ knows how each possible risk tolerance would map histories into equilibrium prices and trades.
+The Bray--Kreps rational-learning formulation avoids this tension by assumption: agent $U$ knows how each possible risk tolerance would map histories into equilibrium prices and trades.
 
 The simplified Gaussian code example avoids it even more directly by replacing the equilibrium calculation with a fixed linear observation equation.
 
@@ -928,9 +887,9 @@ Suppose agent $U$ starts with a prior mean $\mu_0$ far from the true value $b^*
 
 (a) Simulate 100 paths of $T = 400$ periods for each of $\mu_0 \in \{-3, 0, 1, 3, 5\}$ and plot the average posterior mean across paths for each $\mu_0$.
 
-(b) Does the prior mean affect the **rate** at which the posterior mean converges to $b^*$?
+(b) Does the prior mean affect the *rate* at which the posterior mean converges to $b^*$?
 
-(c) Does the prior **variance** $v_0$ affect the rate? Verify by comparing $v_0 \in \{0.1, 1.0, 10.0\}$ with fixed $\mu_0 = 0$.
+(c) Does the prior *variance* $v_0$ affect the rate? Verify by comparing $v_0 \in \{0.1, 1.0, 10.0\}$ with fixed $\mu_0 = 0$.
 ```
 
 ```{solution-start} rle_ex2
@@ -944,7 +903,6 @@ T_ex = 400
 N_ex = 100
 t_range_ex = np.arange(1, T_ex + 1)
 
-# (a) and (b): different prior means
 fig, axes = plt.subplots(1, 2, figsize=(14, 5))
 
 ax = axes[0]
@@ -962,7 +920,6 @@ ax.set_ylabel('$E[\\mu_t]$')
 ax.set_title('Effect of Prior Mean on Convergence')
 ax.legend(fontsize=8)
 
-# (c): different prior variances
 ax = axes[1]
 for v0 in [0.1, 1.0, 10.0]:
     μ_p, _ = simulate_bayesian_learning(
@@ -1026,7 +983,7 @@ So $E[r_t^2] = \sigma^2 > 0$ and the strong law of large numbers guarantees $\su
 ```{code-cell} ipython3
 def simulate_learning_mixture(b_true, σ2, μ_0, v_0, T, N):
     """
-    Simulate Bayesian learning with mixture fundamentals:
+    Bayesian learning with mixture fundamentals:
     r_t = 0 with prob 0.5, else N(0, 2*σ2) with prob 0.5.
     """
     rng = np.random.default_rng(42)
@@ -1039,7 +996,6 @@ def simulate_learning_mixture(b_true, σ2, μ_0, v_0, T, N):
         w_sum = μ_0 / v_0
 
         for t in range(T):
-            # Draw from mixture
             if rng.random() < 0.5:
                 r_t = 0.0
             else:
@@ -1062,12 +1018,10 @@ def simulate_learning_mixture(b_true, σ2, μ_0, v_0, T, N):
 T_ex = 500
 N_ex = 50
 
-# Gaussian case
 μ_gauss, v_gauss = simulate_bayesian_learning(
     b_true=2.0, σ2=σ2_ex, μ_0=0.5, v_0=2.0, T=T_ex, N=N_ex
 )
 
-# Mixture case
 μ_mix, v_mix = simulate_learning_mixture(
     b_true=2.0, σ2=σ2_ex, μ_0=0.5, v_0=2.0, T=T_ex, N=N_ex
 )

From 6c144a4c1829a8b2486a86131ef2f3c7f9229c41 Mon Sep 17 00:00:00 2001
From: Chihiro Watanabe <chihiro.watanabe.econ@gmail.com>
Date: Fri, 29 May 2026 04:28:50 +0900
Subject: [PATCH 14/25] Update rng usage in lln_clt.md (#874)

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 lectures/lln_clt.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lectures/lln_clt.md b/lectures/lln_clt.md
index d2839bede..81dd1e2ab 100644
--- a/lectures/lln_clt.md
+++ b/lectures/lln_clt.md
@@ -444,7 +444,7 @@ $Y_5$
 ```{code-cell} python3
 beta_dist = beta(2, 2)
 
-def gen_x_draws(k):
+def gen_x_draws(k, rng):
     """
     Returns a flat array containing k independent draws from the
     distribution of X, the underlying random variable.  This distribution
@@ -456,7 +456,7 @@ def gen_x_draws(k):
     bdraws[1, :] += 0.6
     bdraws[2, :] -= 1.1
     # Set X[i] = bdraws[j, i], where j is a random draw from {0, 1, 2}
-    js = np.random.randint(0, 2, size=k)
+    js = rng.integers(0, 3, size=k)
     X = bdraws[js, np.arange(k)]
     # Rescale, so that the random variable is zero mean
     m, sigma = X.mean(), X.std()
@@ -465,11 +465,12 @@ def gen_x_draws(k):
 nmax = 5
 reps = 100000
 ns = list(range(1, nmax + 1))
+rng = np.random.default_rng()
 
 # Form a matrix Z such that each column is reps independent draws of X
 Z = np.empty((reps, nmax))
 for i in range(nmax):
-    Z[:, i] = gen_x_draws(reps)
+    Z[:, i] = gen_x_draws(reps, rng)
 # Take cumulative sum across columns
 S = Z.cumsum(axis=1)
 # Multiply j-th column by sqrt j

From 29d801f63e6ccb57717e28d4c42b21dbf772d78d Mon Sep 17 00:00:00 2001
From: John Stachurski <john.stachurski@gmail.com>
Date: Fri, 29 May 2026 13:26:25 +0900
Subject: [PATCH 15/25] Fix assorted issues in prob_meaning lecture (#878)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix typos, spelling, notation errors, and swapped bounds in prob_meaning

- Fix spelling: probabilties, probabililty (x6), statististian
- Fix doubled word: "to to help"
- Fix variance formula: remove erroneous factor of n (rho is Bernoulli, not binomial)
- Fix notation: P_{k,i} → rho_{k,i} to match definition
- Fix subject-verb agreement: "means converges" → "mean converges"
- Fix swapped upper/lower bounds in part (e) ppf calls
- Fix compare() to include k=0
- Fix LaTeX: replace * with \cdot for multiplication
- Fix log(I) range: text said 2 to 7, code has 2 to 6
- Fix imprecise wording: f_k^I approximates Prob(X=k|θ), not θ
- Clarify vague exercise pm_ex1 part 3

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Improve code quality: naming, PEP 8, line length, grid resolution

- Rename class frequentist → Frequentist (PEP 8)
- Rename Bay_stat → bayes (snake_case for instances)
- Rename ii → i/post, num → n_obs, num_list → n_obs_list,
  kk → k, K → head_counts, comp → table, step_num → n_obs,
  npt → n_thetas, nn → n_ns, nI → n_Is
- Replace (sample <= θ) * 1 with .astype(int) for consistency
- Shorten docstrings to fit within 80 characters
- Break long code lines (plot calls, list comprehensions, titles)
- Increase θ grid from 100 to 1000 points for smoother density plots
- Use f-strings with comma formatting for plot labels

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Restructure Bayesian section: derive posterior before exercise

- Add back-reference to prob_matrix lecture for Bayes' Law intro
- Derive the n-step posterior Beta(α+k, β+n-k) before the exercise,
  so the exercise solution code no longer precedes its own derivation
- Replace the duplicated derivation after the exercise with a concise
  summary referencing the formula above
- Remove duplicate "Now pretend..." sentence before part (c)
- Replace "this quantecon lecture" cross-references with actual titles
  for better PDF rendering

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Replace prettytable with pandas DataFrame in compare()

Drop the prettytable dependency — pandas is already imported and
renders nicely in Jupyter notebooks. The compare() method now returns
a DataFrame instead of printing a PrettyTable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Fix exercise pm_ex2 parts (a) and (b) to match their solutions

The question asked for the likelihood of "a sample of length n from
a binomial" but the solution gave the single-flip Bernoulli case.
Reword both the questions and solution headers so parts (a) and (b)
are explicitly about a single coin flip. The general n-step case is
already derived in the lecture text before the exercise.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Add reproducible random seeds using modern NumPy API

Both classes now accept an rng parameter and use rng.random()
instead of np.random.rand(). Each code cell passes a seeded
np.random.default_rng() for reproducible output across builds.

Also remove "typically" from the hump-shape sentence, since with
fixed seeds the behavior is deterministic.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Clean up posterior PDF plots

- Rename title from "P.D.F" to "PDF"
- Simplify legend labels: "n = 0 (prior)", "n = 1", etc.
- Remove n = 30, 70, 300, 500 from observation list to reduce
  clutter in the first PDF plot

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Remove repeated question text from exercise solutions

Solutions for parts (c)-(h) no longer duplicate the question text
as a header — they just use the part label. This follows the
QuantEcon convention.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Use p(θ) for density notation and break up solution derivation

- Change P(θ) to p(θ) throughout and note it is a density
- Replace the single aligned equation block in the solution for
  part (b) with three separate display equations, each introduced
  by explanatory text (Bayes' Law, substitution, collecting powers)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Final review polish: notation, plotting style, consistency

- Use **bold** instead of __bold__ for binomial distribution
- Use IID instead of i.i.d.
- Fix double "with" in exercise (c) wording
- Rename quantile variables from p_1/p_2 to q_1/q_2 to avoid
  clash with p(θ) density notation
- Fix "means and variances statistics" → "mean and standard deviation"
- Standardize N → n in post-exercise text to match pre-exercise
- Update "exceeds 500" → "exceeds 1000" to match revised n_obs_list
- Standardize frequentist plots to use ax. methods instead of plt.
- Remove dead self.k assignment in Frequentist.binomial()

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Adopt new style guide notation conventions

Replace \textrm{Prob}(...) with \mathbb{P}{...} and E[...] with
\mathbb{E}[...] following QuantEcon.manual#84.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 lectures/prob_meaning.md | 470 ++++++++++++++++++---------------------
 1 file changed, 211 insertions(+), 259 deletions(-)

diff --git a/lectures/prob_meaning.md b/lectures/prob_meaning.md
index dfde21873..6b06cfb27 100644
--- a/lectures/prob_meaning.md
+++ b/lectures/prob_meaning.md
@@ -19,7 +19,7 @@ kernelspec:
 
 This lecture  illustrates two distinct interpretations of a  **probability distribution**
 
- * A frequentist interpretation as **relative frequencies** anticipated to occur in a large i.i.d. sample
+ * A frequentist interpretation as **relative frequencies** anticipated to occur in a large IID sample
 
  * A Bayesian interpretation as a **personal opinion** (about a parameter or list of parameters) after seeing a collection of observations
 
@@ -33,7 +33,7 @@ After you watch that video, please watch the following video on the Bayesian app
 ```{youtube} Pahyv9i_X2k
 ```
 
-After you are familiar with the material in these videos, this lecture uses the Socratic method to  to help consolidate your understanding of the different questions that are answered by
+After you are familiar with the material in these videos, this lecture uses the Socratic method to help consolidate your understanding of the different questions that are answered by
 
  * a frequentist confidence interval
 
@@ -49,19 +49,11 @@ We provide our own answers as the lecture unfolds, but you'll learn more if you
 **Code for answering questions:**
 
 
-In addition to what’s in Anaconda, this lecture will deploy the following library:
-
-```{code-cell} ipython3
-:tags: [hide-output]
-pip install prettytable
-```
-
-To answer our coding questions, we'll start with some imports
+To answer our coding questions, we'll start with some imports
 
 ```{code-cell} ipython3
 import numpy as np
 import pandas as pd
-import prettytable as pt
 import matplotlib.pyplot as plt
 from scipy.stats import binom
 import scipy.stats as st
@@ -73,16 +65,16 @@ Empowered with these Python tools, we'll now  explore the two meanings described
 
 Consider the following classic example.
 
-The random variable  $X $ takes on possible values $k = 0, 1, 2, \ldots, n$  with probabilties
+The random variable $X$ takes on possible values $k = 0, 1, 2, \ldots, n$ with probabilities
 
 $$
-\textrm{Prob}(X =  k | \theta) =
+\mathbb{P}\{X = k \mid \theta\} =
 \left(\frac{n!}{k! (n-k)!} \right) \theta^k (1-\theta)^{n-k}
 $$
 
 where the fixed parameter $\theta \in (0,1)$.
 
-This is called   the __binomial distribution__.
+This is called the **binomial distribution**.
 
 Here
 
@@ -114,7 +106,7 @@ f_k^I = \frac{\textrm{number of samples of length n for which } \sum_{h=1}^n y_h
     I}
 $$
 
-The probability  $\textrm{Prob}(X =  k | \theta)$ answers the following question:
+The probability  $\mathbb{P}\{X = k \mid \theta\}$ answers the following question:
 
 * As $I$ becomes large, in what   fraction of  $I$ independent  draws of  $n$ coin flips should we anticipate  $k$ heads to occur?
 
@@ -126,9 +118,9 @@ As usual, a law of large numbers justifies this answer.
 1. Please write a Python class to compute $f_k^I$
 
 2. Please use your code to compute $f_k^I, k = 0, \ldots , n$ and compare them to
-  $\textrm{Prob}(X =  k | \theta)$ for various values of $\theta, n$ and $I$
+  $\mathbb{P}\{X = k \mid \theta\}$ for various values of $\theta, n$ and $I$
 
-3. With the Law of Large numbers in mind, use your code to say something
+3. With the Law of Large Numbers in mind, use your code to describe the relationship between $f_k^I$ and $\mathbb{P}\{X = k \mid \theta\}$ as $I$ grows
 ```
 
 ```{solution-start} pm_ex1
@@ -138,68 +130,45 @@ As usual, a law of large numbers justifies this answer.
 Here is one solution:
 
 ```{code-cell} ipython3
-class frequentist:
-
-    def __init__(self, θ, n, I):
-
-        '''
-        initialization
-        -----------------
-        parameters:
-        θ : probability that one toss of a coin will be a head with Y = 1
-        n : number of independent flips in each independent sequence of draws
-        I : number of independent sequence of draws
-
-        '''
+class Frequentist:
 
+    def __init__(self, θ, n, I, rng=None):
         self.θ, self.n, self.I = θ, n, I
+        self.rng = rng or np.random.default_rng()
 
     def binomial(self, k):
-
-        '''compute the theoretical probability for specific input k'''
-
-        θ, n = self.θ, self.n
-        self.k = k
-        self.P = binom.pmf(k, n, θ)
+        '''Compute the theoretical probability.'''
+        self.P = binom.pmf(k, self.n, self.θ)
 
     def draw(self):
-
-        '''draw n independent flips for I independent sequences'''
-
+        '''Draw n independent flips for I sequences.'''
         θ, n, I = self.θ, self.n, self.I
-        sample = np.random.rand(I, n)
-        Y = (sample <= θ) * 1
-        self.Y = Y
-
-    def compute_fk(self, kk):
+        sample = self.rng.random((I, n))
+        self.Y = (sample <= θ).astype(int)
 
-        '''compute f_{k}^I for specific input k'''
-
-        Y, I = self.Y, self.I
-        K = np.sum(Y, 1)
-        f_kI = np.sum(K == kk) / I
-        self.f_kI = f_kI
-        self.kk = kk
+    def compute_fk(self, k):
+        '''Compute f_k^I for a given k.'''
+        head_counts = np.sum(self.Y, axis=1)
+        self.f_kI = np.sum(head_counts == k) / self.I
 
     def compare(self):
-
-        '''compute and print the comparison'''
-
-        n = self.n
-        comp = pt.PrettyTable()
-        comp.field_names = ['k', 'Theoretical', 'Frequentist']
+        '''Draw a sample and return the comparison table.'''
         self.draw()
-        for i in range(n):
-            self.binomial(i+1)
-            self.compute_fk(i+1)
-            comp.add_row([i+1, self.P, self.f_kI])
-        print(comp)
+        rows = []
+        for k in range(self.n + 1):
+            self.binomial(k)
+            self.compute_fk(k)
+            rows.append([k, self.P, self.f_kI])
+        return pd.DataFrame(
+            rows, columns=['k', 'Theoretical', 'Frequentist']
+        ).set_index('k')
 ```
 
 ```{code-cell} ipython3
+rng = np.random.default_rng(123)
 θ, n, k, I = 0.7, 20, 10, 1_000_000
 
-freq = frequentist(θ, n, I)
+freq = Frequentist(θ, n, I, rng=rng)
 
 freq.compare()
 ```
@@ -222,12 +191,13 @@ $$
 We'll vary $\theta$ from $0.01$ to $0.99$ and plot outcomes against $\theta$.
 
 ```{code-cell} ipython3
-θ_low, θ_high, npt = 0.01, 0.99, 50
-thetas = np.linspace(θ_low, θ_high, npt)
+rng = np.random.default_rng(234)
+θ_low, θ_high, n_thetas = 0.01, 0.99, 50
+thetas = np.linspace(θ_low, θ_high, n_thetas)
 P = []
 f_kI = []
-for i in range(npt):
-    freq = frequentist(thetas[i], n, I)
+for i in range(n_thetas):
+    freq = Frequentist(thetas[i], n, I, rng=rng)
     freq.binomial(k)
     freq.draw()
     freq.compute_fk(k)
@@ -240,11 +210,12 @@ fig, ax = plt.subplots(figsize=(8, 6))
 ax.grid()
 ax.plot(thetas, P, 'k-.', label='Theoretical')
 ax.plot(thetas, f_kI, 'r--', label='Fraction')
-plt.title(r'Comparison with different $\theta$', fontsize=16)
-plt.xlabel(r'$\theta$', fontsize=15)
-plt.ylabel('Fraction', fontsize=15)
-plt.tick_params(labelsize=13)
-plt.legend()
+ax.set_title(r'Comparison with different $\theta$',
+             fontsize=16)
+ax.set_xlabel(r'$\theta$', fontsize=15)
+ax.set_ylabel('Fraction', fontsize=15)
+ax.tick_params(labelsize=13)
+ax.legend()
 plt.show()
 ```
 
@@ -255,12 +226,13 @@ Now we fix $\theta=0.7, k=10, I=1,000,000$ and vary $n$ from $1$ to $100$.
 Then we'll plot outcomes.
 
 ```{code-cell} ipython3
-n_low, n_high, nn = 1, 100, 50
-ns = np.linspace(n_low, n_high, nn, dtype='int')
+rng = np.random.default_rng(345)
+n_low, n_high, n_ns = 1, 100, 50
+ns = np.linspace(n_low, n_high, n_ns, dtype='int')
 P = []
 f_kI = []
-for i in range(nn):
-    freq = frequentist(θ, ns[i], I)
+for i in range(n_ns):
+    freq = Frequentist(θ, ns[i], I, rng=rng)
     freq.binomial(k)
     freq.draw()
     freq.compute_fk(k)
@@ -273,26 +245,28 @@ fig, ax = plt.subplots(figsize=(8, 6))
 ax.grid()
 ax.plot(ns, P, 'k-.', label='Theoretical')
 ax.plot(ns, f_kI, 'r--', label='Frequentist')
-plt.title(r'Comparison with different $n$', fontsize=16)
-plt.xlabel(r'$n$', fontsize=15)
-plt.ylabel('Fraction', fontsize=15)
-plt.tick_params(labelsize=13)
-plt.legend()
+ax.set_title(r'Comparison with different $n$',
+             fontsize=16)
+ax.set_xlabel(r'$n$', fontsize=15)
+ax.set_ylabel('Fraction', fontsize=15)
+ax.tick_params(labelsize=13)
+ax.legend()
 plt.show()
 ```
 
 **Comparison with different $I$**
 
-Now we fix $\theta=0.7, n=20, k=10$ and vary $\log(I)$ from $2$ to $7$.
+Now we fix $\theta=0.7, n=20, k=10$ and vary $\log(I)$ from $2$ to $6$.
 
 ```{code-cell} ipython3
-I_log_low, I_log_high, nI = 2, 6, 200
-log_Is = np.linspace(I_log_low, I_log_high, nI)
+rng = np.random.default_rng(456)
+I_log_low, I_log_high, n_Is = 2, 6, 200
+log_Is = np.linspace(I_log_low, I_log_high, n_Is)
 Is = np.power(10, log_Is).astype(int)
 P = []
 f_kI = []
-for i in range(nI):
-    freq = frequentist(θ, n, Is[i])
+for i in range(n_Is):
+    freq = Frequentist(θ, n, Is[i], rng=rng)
     freq.binomial(k)
     freq.draw()
     freq.compute_fk(k)
@@ -305,11 +279,12 @@ fig, ax = plt.subplots(figsize=(8, 6))
 ax.grid()
 ax.plot(Is, P, 'k-.', label='Theoretical')
 ax.plot(Is, f_kI, 'r--', label='Fraction')
-plt.title(r'Comparison with different $I$', fontsize=16)
-plt.xlabel(r'$I$', fontsize=15)
-plt.ylabel('Fraction', fontsize=15)
-plt.tick_params(labelsize=13)
-plt.legend()
+ax.set_title(r'Comparison with different $I$',
+             fontsize=16)
+ax.set_xlabel(r'$I$', fontsize=15)
+ax.set_ylabel('Fraction', fontsize=15)
+ax.tick_params(labelsize=13)
+ax.legend()
 plt.show()
 ```
 
@@ -318,20 +293,20 @@ From the above graphs, we can see that **$I$, the number of independent sequence
 When $I$ becomes larger, the difference between theoretical probability and frequentist estimate becomes smaller.
 
 Also, as long as $I$ is large enough, changing $\theta$ or $n$ does not substantially change the accuracy of the observed fraction
-as an approximation of $\theta$.
+as an approximation of $\mathbb{P}\{X = k \mid \theta\}$.
 
 The Law of Large Numbers is at work here.
 
-For each draw of an independent sequence, $\textrm{Prob}(X_i =  k | \theta)$  is the same, so aggregating all draws forms an i.i.d sequence of a binary random variable $\rho_{k,i},i=1,2,...I$, with a mean of $\textrm{Prob}(X =  k | \theta)$ and a variance of
+For each draw of an independent sequence, $\mathbb{P}\{X_i = k \mid \theta\}$ is the same, so aggregating all draws forms an IID sequence of binary random variables $\rho_{k,i}, i=1,2,\ldots,I$, with a mean of $\mathbb{P}\{X = k \mid \theta\}$ and a variance of
 
 $$
-n \cdot \textrm{Prob}(X =  k | \theta) \cdot (1-\textrm{Prob}(X =  k | \theta)).
+\mathbb{P}\{X = k \mid \theta\} \cdot (1-\mathbb{P}\{X = k \mid \theta\}).
 $$
 
-So, by the LLN, the average of $P_{k,i}$ converges to:
+So, by the LLN, the average of $\rho_{k,i}$ converges to:
 
 $$
-E[\rho_{k,i}] = \textrm{Prob}(X =  k | \theta) = \left(\frac{n!}{k! (n-k)!} \right) \theta^k (1-\theta)^{n-k}
+\mathbb{E}[\rho_{k,i}] = \mathbb{P}\{X = k \mid \theta\} = \left(\frac{n!}{k! (n-k)!} \right) \theta^k (1-\theta)^{n-k}
 $$
 
 as $I$ goes to infinity.
@@ -347,30 +322,46 @@ Instead, we think of it as a **random variable**.
 
 $\theta$ is described by a probability distribution.
 
-But now this probability distribution means something different than a relative frequency that we can anticipate to occur in a large i.i.d. sample.
+But now this probability distribution means something different than a relative frequency that we can anticipate to occur in a large IID sample.
 
 Instead, the probability distribution of $\theta$ is now a summary of our views about  likely values of $\theta$ either
 
   * **before** we have seen **any** data at all, or
   * **before** we have seen **more** data, after we have seen **some** data
 
-Thus, suppose that, before seeing any data, you have a personal prior probability distribution saying that
+Thus, suppose that, before seeing any data, you have a personal prior probability distribution with density
 
 $$
-P(\theta) = \frac{\theta^{\alpha-1}(1-\theta)^{\beta -1}}{B(\alpha, \beta)}
+p(\theta) = \frac{\theta^{\alpha-1}(1-\theta)^{\beta -1}}{B(\alpha, \beta)}
 $$
 
-where $B(\alpha, \beta)$ is a  **beta function** , so that $P(\theta)$ is
-a **beta distribution** with parameters $\alpha, \beta$.
+where $B(\alpha, \beta)$ is the **beta function**, so that $p(\theta)$ is
+the density of a **beta distribution** with parameters $\alpha, \beta$.
+
+We can update this prior after observing data using Bayes' Law (see {doc}`Probability with Matrices <prob_matrix>` for an introduction).
+
+For a sample of $n$ coin flips that yields $k$ heads, the **likelihood function** is the binomial probability
+
+$$
+L(k \mid \theta) = {n \choose k} \theta^k (1-\theta)^{n-k}
+$$
+
+Applying Bayes' Law with our beta prior, the **posterior density** is
+
+$$
+p(\theta \mid k) = \frac{L(k \mid \theta) \cdot p(\theta)}{\int_0^1 L(k \mid \theta) \cdot p(\theta) \, d\theta} = \textrm{Beta}(\alpha + k, \, \beta + n - k)
+$$
+
+So the posterior is also a beta distribution — a consequence of the beta prior being **conjugate** to the binomial likelihood.
 
 ```{exercise}
 :label: pm_ex2
 
-**a)**  Please write down the **likelihood function** for a sample of length $n$ from a binomial distribution with parameter $\theta$.
+**a)**  Please write down the **likelihood function** for a single coin flip with outcome $Y \in \{0, 1\}$.
 
-**b)** Please write down the **posterior** distribution for $\theta$ after observing  one flip of the coin.
+**b)** Please write down the **posterior** distribution for $\theta$ after observing that single flip.
 
-**c)** Now pretend that the true value of $\theta = .4$ and that someone who doesn't know this has a beta prior distribution with parameters  with $\beta = \alpha = .5$. Please write a Python class to simulate this person's personal posterior distribution for $\theta$  for a _single_ sequence of $n$ draws.
+**c)** Now pretend that the true value of $\theta = .4$ and that someone who doesn't know this has a beta prior distribution with parameters $\beta = \alpha = .5$. Please write a Python class to simulate this person's personal posterior distribution for $\theta$  for a _single_ sequence of $n$ draws.
 
 **d)** Please plot the posterior distribution for $\theta$ as a function of $\theta$ as $n$ grows as $1, 2, \ldots$.
 
@@ -378,7 +369,7 @@ a **beta distribution** with parameters $\alpha, \beta$.
 
 **f)** Please tell what question a Bayesian coverage interval answers.
 
-**g)** Please compute the Posterior probabililty that $\theta \in [.45, .55]$ for various values of sample size $n$.
+**g)** Please compute the posterior probability that $\theta \in [.45, .55]$ for various values of sample size $n$.
 
 **h)** Please use your Python class to study what happens to the posterior distribution as $n \rightarrow + \infty$, again assuming that the true value of $\theta = .4$, though it is unknown to the person doing the updating via Bayes' Law.
 ```
@@ -388,208 +379,187 @@ a **beta distribution** with parameters $\alpha, \beta$.
 :class: dropdown
 ```
 
-**a)** Please write down the **likelihood function** and the **posterior** distribution for $\theta$ after observing  one flip of our coin.
+**a)** The **likelihood function** for a single coin flip with outcome $Y \in \{0, 1\}$ is
 
-Suppose the outcome is __Y__.
+$$
+L(Y|\theta) = \theta^Y (1-\theta)^{1-Y}
+$$
 
-The likelihood function is:
+**b)** By Bayes' Law, the posterior density for $\theta$ after observing a single flip $Y$ is
 
 $$
-L(Y|\theta)= \textrm{Prob}(X =  Y | \theta) =
-\theta^Y (1-\theta)^{1-Y}
+p(\theta \mid Y) = \frac{L(Y \mid \theta) \cdot p(\theta)}{\int_{0}^{1} L(Y \mid \theta) \cdot p(\theta) \, d\theta}
 $$
 
-**b)** Please write the **posterior** distribution for $\theta$ after observing  one flip of our coin.
-
-The prior distribution is
+Substituting the likelihood from (a) and the beta prior density, this becomes
 
 $$
-\textrm{Prob}(\theta) = \frac{\theta^{\alpha - 1} (1 - \theta)^{\beta - 1}}{B(\alpha, \beta)}
+p(\theta \mid Y) = \frac{\theta^Y (1-\theta)^{1-Y} \cdot \theta^{\alpha - 1} (1 - \theta)^{\beta - 1} / B(\alpha, \beta)}{\int_{0}^{1} \theta^Y (1-\theta)^{1-Y} \cdot \theta^{\alpha - 1} (1 - \theta)^{\beta - 1} / B(\alpha, \beta) \, d\theta}
 $$
 
-We can derive the posterior distribution for $\theta$ via
+Collecting powers of $\theta$ and $(1-\theta)$, we recognize the kernel of a beta density:
 
-\begin{align*}
-  \textrm{Prob}(\theta | Y) &= \frac{\textrm{Prob}(Y | \theta) \textrm{Prob}(\theta)}{\textrm{Prob}(Y)} \\
-  &=\frac{\textrm{Prob}(Y | \theta) \textrm{Prob}(\theta)}{\int_{0}^{1} \textrm{Prob}(Y | \theta) \textrm{Prob}(\theta) d \theta }\\
-  &= \frac{\theta^Y (1-\theta)^{1-Y}\frac{\theta^{\alpha - 1} (1 - \theta)^{\beta - 1}}{B(\alpha, \beta)}}{\int_{0}^{1}\theta^Y (1-\theta)^{1-Y}\frac{\theta^{\alpha - 1} (1 - \theta)^{\beta - 1}}{B(\alpha, \beta)} d \theta } \\
-  &= \frac{ \theta^{Y+\alpha - 1} (1 - \theta)^{1-Y+\beta - 1}}{\int_{0}^{1}\theta^{Y+\alpha - 1} (1 - \theta)^{1-Y+\beta - 1} d \theta}
-\end{align*}
+$$
+p(\theta \mid Y) = \frac{\theta^{Y+\alpha - 1} (1 - \theta)^{1-Y+\beta - 1}}{\int_{0}^{1} \theta^{Y+\alpha - 1} (1 - \theta)^{1-Y+\beta - 1} \, d\theta}
+$$
 
 which means that
 
 $$
-\textrm{Prob}(\theta | Y) \sim \textrm{Beta}(\alpha + Y, \beta + (1-Y))
+\theta \mid Y \sim \textrm{Beta}(\alpha + Y, \, \beta + (1-Y))
 $$
 
-Now please pretend that the true value of $\theta = .4$ and that someone who doesn't know this has a beta prior with $\beta = \alpha = .5$.
-
-**c)** Now pretend that the true value of $\theta = .4$ and that someone who doesn't know this has a beta prior distribution with parameters  with $\beta = \alpha = .5$. Please write a Python class to simulate this person's personal posterior distribution for $\theta$  for a _single_ sequence of $n$ draws.
+**c)**
 
 ```{code-cell} ipython3
 class Bayesian:
 
-    def __init__(self, θ=0.4, n=1_000_000, α=0.5, β=0.5):
-        """
-        Parameters:
+    def __init__(self, θ=0.4, n=1_000_000, α=0.5, β=0.5,
+                 rng=None):
+        '''
+        Parameters
         ----------
-        θ : float, ranging from [0,1].
-           probability that one toss of a coin will be a head with Y = 1
-
-        n : int.
-           number of independent flips in an independent sequence of draws
-
-        α&β : int or float.
-             parameters of the prior distribution on θ
-
-        """
+        θ : Probability of heads on each flip.
+        n : Number of flips in the sequence.
+        α, β : Parameters of the beta prior on θ.
+        rng : NumPy random generator.
+        '''
         self.θ, self.n, self.α, self.β = θ, n, α, β
+        self.rng = rng or np.random.default_rng()
         self.prior = st.beta(α, β)
 
     def draw(self):
-        """
-        simulate a single sequence of draws of length n, given probability θ
-
-        """
-        array = np.random.rand(self.n)
+        '''Simulate a sequence of n coin flips.'''
+        array = self.rng.random(self.n)
         self.draws = (array < self.θ).astype(int)
 
-    def form_single_posterior(self, step_num):
-        """
-        form a posterior distribution after observing the first step_num elements of the draws
-
-        Parameters
-        ----------
-        step_num: int.
-               number of steps observed to form a posterior distribution
-
-        Returns
-        ------
-        the posterior distribution for sake of plotting in the subsequent steps
+    def form_single_posterior(self, n_obs):
+        '''Return the posterior after the first n_obs flips.'''
+        heads = self.draws[:n_obs].sum()
+        tails = n_obs - heads
+        return st.beta(self.α + heads, self.β + tails)
 
-        """
-        heads_num = self.draws[:step_num].sum()
-        tails_num = step_num - heads_num
-
-        return st.beta(self.α+heads_num, self.β+tails_num)
-
-    def form_posterior_series(self,num_obs_list):
-        """
-        form a series of posterior distributions that form after observing different number of draws.
-
-        Parameters
-        ----------
-        num_obs_list: a list of int.
-               a list of the number of observations used to form a series of posterior distributions.
-
-        """
+    def form_posterior_series(self, n_obs_list):
+        '''Form posteriors for each sample size in n_obs_list.'''
         self.posterior_list = []
-        for num in num_obs_list:
-            self.posterior_list.append(self.form_single_posterior(num))
+        for n_obs in n_obs_list:
+            self.posterior_list.append(
+                self.form_single_posterior(n_obs)
+            )
 ```
 
-**d)** Please plot the posterior distribution for $\theta$ as a function of $\theta$ as $n$ grows from $1, 2, \ldots$.
+**d)**
 
 ```{code-cell} ipython3
-Bay_stat = Bayesian()
-Bay_stat.draw()
+rng = np.random.default_rng(567)
+bayes = Bayesian(rng=rng)
+bayes.draw()
 
-num_list = [1, 2, 3, 4, 5, 10, 20, 30, 50, 70, 100, 300, 500, 1000, # this line for finite n
-            5000, 10_000, 50_000, 100_000, 200_000, 300_000]  # this line for approximately infinite n
+n_obs_list = [1, 2, 3, 4, 5, 10, 20, 50,
+              100, 1000,
+              5000, 10_000, 50_000, 100_000,
+              200_000, 300_000]
 
-Bay_stat.form_posterior_series(num_list)
+bayes.form_posterior_series(n_obs_list)
 
-θ_values = np.linspace(0.01, 1, 100)
+θ_values = np.linspace(0.01, 1, 1000)
 
 fig, ax = plt.subplots(figsize=(10, 6))
 
-ax.plot(θ_values, Bay_stat.prior.pdf(θ_values), label='Prior Distribution', color='k', linestyle='--')
+ax.plot(θ_values, bayes.prior.pdf(θ_values),
+        label='n = 0 (prior)', color='k',
+        linestyle='--')
 
-for ii, num in enumerate(num_list[:14]):
-    ax.plot(θ_values, Bay_stat.posterior_list[ii].pdf(θ_values), label='Posterior with n = %d' % num)
+for i, n_obs in enumerate(n_obs_list[:10]):
+    posterior = bayes.posterior_list[i]
+    ax.plot(θ_values, posterior.pdf(θ_values),
+            label=f'n = {n_obs}')
 
-ax.set_title('P.D.F of Posterior Distributions', fontsize=15)
+ax.set_title('PDF of Posterior Distributions',
+             fontsize=15)
 ax.set_xlabel(r"$\theta$", fontsize=15)
 
 ax.legend(fontsize=11)
 plt.show()
 ```
 
-**e)** For various $n$'s, please describe and compute  $.05$ and $.95$ quantiles for  posterior probabilities.
+**e)**
 
 ```{code-cell} ipython3
-upper_bound = [ii.ppf(0.05) for ii in Bay_stat.posterior_list[:14]]
-lower_bound = [ii.ppf(0.95) for ii in Bay_stat.posterior_list[:14]]
+lower_bound = [post.ppf(0.05) for post in bayes.posterior_list[:10]]
+upper_bound = [post.ppf(0.95) for post in bayes.posterior_list[:10]]
 
 interval_df = pd.DataFrame()
 interval_df['upper'] = upper_bound
 interval_df['lower'] = lower_bound
-interval_df.index = num_list[:14]
+interval_df.index = n_obs_list[:10]
 interval_df = interval_df.T
 interval_df
 ```
 
 As $n$ increases, we can see that Bayesian coverage intervals narrow and move toward $0.4$.
 
-**f)** Please tell what question a Bayesian coverage interval answers.
-
-The Bayesian coverage interval tells the range of $\theta$ that corresponds to the [$p_1$, $p_2$] quantiles of the cumulative probability distribution (CDF)  of the posterior distribution.
+**f)** The Bayesian coverage interval tells the range of $\theta$ that corresponds to the [$q_1$, $q_2$] quantiles of the cumulative distribution function (CDF) of the posterior distribution.
 
 To construct the coverage interval we first compute a posterior distribution of the unknown parameter $\theta$.
 
-If the CDF is $F(\theta)$, then the Bayesian coverage interval $[a,b]$ for the interval $[p_1,p_2]$ is described by
+If the CDF is $F(\theta)$, then the Bayesian coverage interval $[a,b]$ for the interval $[q_1,q_2]$ is described by
 
 $$
-F(a)=p_1,F(b)=p_2
+F(a)=q_1,F(b)=q_2
 $$
 
-**g)** Please compute the Posterior probabililty that $\theta \in [.45, .55]$ for various values of sample size $n$.
+**g)**
 
 ```{code-cell} ipython3
 left_value, right_value = 0.45, 0.55
 
-posterior_prob_list=[ii.cdf(right_value)-ii.cdf(left_value) for ii in Bay_stat.posterior_list]
+posterior_prob_list = [
+    post.cdf(right_value) - post.cdf(left_value)
+    for post in bayes.posterior_list
+]
 
 fig, ax = plt.subplots(figsize=(8, 5))
 ax.plot(posterior_prob_list)
-ax.set_title('Posterior Probabililty that '+ r"$\theta$" +' Ranges from %.2f to %.2f'%(left_value, right_value),
-             fontsize=13)
+ax.set_title(
+    r'Posterior Probability that $\theta$'
+    f' Ranges from {left_value:.2f}'
+    f' to {right_value:.2f}',
+    fontsize=13)
 ax.set_xticks(np.arange(0, len(posterior_prob_list), 3))
-ax.set_xticklabels(num_list[::3])
+ax.set_xticklabels(n_obs_list[::3])
 ax.set_xlabel('Number of Observations', fontsize=11)
 
 plt.show()
 ```
 
-Notice that in the graph above the posterior probabililty that $\theta \in [.45, .55]$ typically exhibits a hump shape as $n$ increases.
+Notice that in the graph above the posterior probability that $\theta \in [.45, .55]$ exhibits a hump shape as $n$ increases.
 
 Two opposing forces are at work.
 
-The first force is that the individual  adjusts his belief as he observes new outcomes, so his posterior probability distribution  becomes more and more realistic, which explains the rise of the posterior probabililty.
+The first force is that the individual  adjusts his belief as he observes new outcomes, so his posterior probability distribution  becomes more and more realistic, which explains the rise of the posterior probability.
 
 However, $[.45, .55]$ actually excludes the true $\theta =.4 $ that generates the data.
 
-As a result, the posterior probabililty drops as larger and larger samples refine his  posterior probability distribution of $\theta$.
+As a result, the posterior probability drops as larger and larger samples refine his  posterior probability distribution of $\theta$.
 
 The descent seems precipitous only because of the scale of the graph  that has the number of observations increasing disproportionately.
 
 When the number of observations becomes large enough, our Bayesian becomes so confident about $\theta$ that he considers $\theta \in [.45, .55]$ very unlikely.
 
-That is why we see a nearly horizontal line when the number of observations exceeds 500.
+That is why we see a nearly horizontal line when the number of observations exceeds 1000.
 
-**h)** Please use your Python class to study what happens to the posterior distribution as $n \rightarrow + \infty$, again assuming that the true value of $\theta = .4$, though it is unknown to the person doing the updating via Bayes' Law.
-
-Using the Python class we made above, we can see the evolution of posterior distributions as $n$ approaches infinity.
+**h)** Using the Python class we made above, we can see the evolution of posterior distributions as $n$ approaches infinity.
 
 ```{code-cell} ipython3
 fig, ax = plt.subplots(figsize=(10, 6))
 
-for ii, num in enumerate(num_list[14:]):
-    ii += 14
-    ax.plot(θ_values, Bay_stat.posterior_list[ii].pdf(θ_values),
-            label='Posterior with n=%d thousand' % (num/1000))
+for i, n_obs in enumerate(n_obs_list[10:]):
+    posterior = bayes.posterior_list[i + 10]
+    ax.plot(θ_values, posterior.pdf(θ_values),
+            label=f'n = {n_obs:,}')
 
-ax.set_title('P.D.F of Posterior Distributions', fontsize=15)
+ax.set_title('PDF of Posterior Distributions', fontsize=15)
 ax.set_xlabel(r"$\theta$", fontsize=15)
 ax.set_xlim(0.3, 0.5)
 
@@ -599,26 +569,28 @@ plt.show()
 
 As $n$ increases, we can see that the probability density functions _concentrate_ on $0.4$, the true value of $\theta$.
 
-Here the  posterior means  converges to $0.4$ while the posterior standard deviations converges to $0$ from above.
+Here the  posterior mean  converges to $0.4$ while the posterior standard deviation converges to $0$ from above.
 
-To show this, we compute the means and variances statistics of the posterior distributions.
+To show this, we compute the mean and standard deviation of the posterior distributions.
 
 ```{code-cell} ipython3
-mean_list = [ii.mean() for ii in Bay_stat.posterior_list]
-std_list = [ii.std() for ii in Bay_stat.posterior_list]
+mean_list = [post.mean() for post in bayes.posterior_list]
+std_list = [post.std() for post in bayes.posterior_list]
 
 fig, ax = plt.subplots(1, 2, figsize=(14, 5))
 
 ax[0].plot(mean_list)
-ax[0].set_title('Mean Values of Posterior Distribution', fontsize=13)
+ax[0].set_title('Mean of Posterior Distribution',
+                fontsize=13)
 ax[0].set_xticks(np.arange(0, len(mean_list), 3))
-ax[0].set_xticklabels(num_list[::3])
+ax[0].set_xticklabels(n_obs_list[::3])
 ax[0].set_xlabel('Number of Observations', fontsize=11)
 
 ax[1].plot(std_list)
-ax[1].set_title('Standard Deviations of Posterior Distribution', fontsize=13)
+ax[1].set_title('Std Dev of Posterior Distribution',
+                fontsize=13)
 ax[1].set_xticks(np.arange(0, len(std_list), 3))
-ax[1].set_xticklabels(num_list[::3])
+ax[1].set_xticklabels(n_obs_list[::3])
 ax[1].set_xlabel('Number of Observations', fontsize=11)
 
 plt.show()
@@ -629,57 +601,37 @@ plt.show()
 
 How shall we interpret the patterns above?
 
-The answer is encoded in the  Bayesian updating formulas.
-
-It is natural to extend the one-step Bayesian update to an $n$-step Bayesian update.
-
-
-$$
-\textrm{Prob}(\theta|k) = \frac{\textrm{Prob}(\theta,k)}{\textrm{Prob}(k)}=\frac{\textrm{Prob}(k|\theta)*\textrm{Prob}(\theta)}{\textrm{Prob}(k)}=\frac{\textrm{Prob}(k|\theta)*\textrm{Prob}(\theta)}{\int_0^1 \textrm{Prob}(k|\theta)*\textrm{Prob}(\theta) d\theta}
-$$
-
-$$
-=\frac{{N \choose k} (1 - \theta)^{N-k} \theta^k*\frac{\theta^{\alpha - 1} (1 - \theta)^{\beta - 1}}{B(\alpha, \beta)}}{\int_0^1 {N \choose k} (1 - \theta)^{N-k} \theta^k*\frac{\theta^{\alpha - 1} (1 - \theta)^{\beta - 1}}{B(\alpha, \beta)} d\theta}
-$$
-
-$$
-=\frac{(1 -\theta)^{\beta+N-k-1}* \theta^{\alpha+k-1}}{\int_0^1 (1 - \theta)^{\beta+N-k-1}* \theta^{\alpha+k-1} d\theta}
-$$
-
-$$
-={Beta}(\alpha + k, \beta+N-k)
-$$
-
-A beta distribution with $\alpha$ and $\beta$ has the following mean and variance.
-
-The mean is $\frac{\alpha}{\alpha + \beta}$
+The answer is encoded in the Bayesian updating formula derived above.
 
-The variance is $\frac{\alpha \beta}{(\alpha + \beta)^2 (\alpha + \beta + 1)}$
+Recall that after observing $k$ heads in $n$ flips, the posterior is $\textrm{Beta}(\alpha + k, \, \beta + n - k)$.
 
-* $\alpha$ can be viewed as the number of successes
+A beta distribution with parameters $\alpha$ and $\beta$ has
 
-* $\beta$ can be viewed as the number of failures
+* mean $\frac{\alpha}{\alpha + \beta}$
 
-The random variables $k$ and $N-k$ are governed by Binomial Distribution with $\theta=0.4$.
+* variance $\frac{\alpha \beta}{(\alpha + \beta)^2 (\alpha + \beta + 1)}$
 
-Call this the true data generating process.
+Here $\alpha + k$ can be viewed as the number of successes (prior pseudo-count plus observed heads) and $\beta + n - k$ as the number of failures.
 
-According to the Law of Large Numbers, for a large number of observations, observed frequencies of $k$ and $N-k$ will be described by the true data generating process, i.e., the population probability distribution that we assumed when generating the observations on the computer. (See {ref}`pm_ex1`).
+Since the data are generated with $\theta = 0.4$, the Law of Large Numbers tells us that, as $n$ grows, $k/n \to 0.4$ (see {ref}`pm_ex1`).
 
-Consequently, the  mean of the posterior distribution converges to $0.4$ and the variance withers to zero.
+Consequently, the posterior mean converges to $0.4$ and the posterior variance shrinks to zero.
 
 ```{code-cell} ipython3
-upper_bound = [ii.ppf(0.95) for ii in Bay_stat.posterior_list]
-lower_bound = [ii.ppf(0.05) for ii in Bay_stat.posterior_list]
+upper_bound = [post.ppf(0.95) for post in bayes.posterior_list]
+lower_bound = [post.ppf(0.05) for post in bayes.posterior_list]
 
 fig, ax = plt.subplots(figsize=(10, 6))
-ax.scatter(np.arange(len(upper_bound)), upper_bound, label='95 th Quantile')
-ax.scatter(np.arange(len(lower_bound)), lower_bound, label='05 th Quantile')
+ax.scatter(np.arange(len(upper_bound)),
+           upper_bound, label='95th Quantile')
+ax.scatter(np.arange(len(lower_bound)),
+           lower_bound, label='5th Quantile')
 
 ax.set_xticks(np.arange(0, len(upper_bound), 2))
-ax.set_xticklabels(num_list[::2])
+ax.set_xticklabels(n_obs_list[::2])
 ax.set_xlabel('Number of Observations', fontsize=12)
-ax.set_title('Bayesian Coverage Intervals of Posterior Distributions', fontsize=15)
+ax.set_title('Bayesian Coverage Intervals of '
+             'Posterior Distributions', fontsize=15)
 
 ax.legend(fontsize=11)
 plt.show()
@@ -687,7 +639,7 @@ plt.show()
 
 After observing a large number of outcomes, the  posterior distribution collapses around $0.4$.
 
-Thus, the Bayesian statististian  comes to believe that $\theta$ is near $.4$.
+Thus, the Bayesian statistician  comes to believe that $\theta$ is near $.4$.
 
 As shown in the figure above, as the number of observations grows, the Bayesian coverage intervals (BCIs) become narrower and narrower   around  $0.4$.
 
@@ -704,7 +656,7 @@ So posterior and prior are both beta distributions, albeit ones with different p
 
 When a likelihood function and prior fit together like hand and glove in this way, we can  say that the  prior and posterior are **conjugate distributions**.
 
-In this situation, we also sometimes  say that we have **conjugate prior** for the likelihood function $\textrm{Prob}(X | \theta)$.
+In this situation, we also sometimes say that we have a **conjugate prior** for the likelihood function $\mathbb{P}\{X \mid \theta\}$.
 
 Typically, the functional form of the likelihood function determines the functional form of a **conjugate prior**.
 
@@ -716,7 +668,7 @@ To be argumentative, one could ask, why should the form of the likelihood functi
 
 A dignified response to that question is, well, it shouldn't, but if you want to compute a posterior easily you'll just be happier if your prior is conjugate to your likelihood.
 
-Otherwise, your posterior won't have a convenient analytical form and you'll be in the situation of wanting to apply the Markov chain Monte Carlo techniques deployed in {doc}`this quantecon lecture <bayes_nonconj>`.
+Otherwise, your posterior won't have a convenient analytical form and you'll be in the situation of wanting to apply the Markov chain Monte Carlo techniques deployed in {doc}`Non-Conjugate Priors <bayes_nonconj>`.
 
 We also apply these powerful methods to approximating Bayesian posteriors for non-conjugate priors in
-{doc}`this quantecon lecture <ar1_bayes>` and {doc}`this quantecon lecture <ar1_turningpts>`
+{doc}`Posterior Distributions for AR(1) Parameters <ar1_bayes>` and {doc}`Forecasting an AR(1) Process <ar1_turningpts>`.

From cd4d979a4cbe74f3c80ef964c4513c571c8e9f13 Mon Sep 17 00:00:00 2001
From: Chihiro Watanabe <chihiro.watanabe.econ@gmail.com>
Date: Fri, 29 May 2026 13:27:37 +0900
Subject: [PATCH 16/25] Update rng usage in stats_examples.md (#873)

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 lectures/stats_examples.md | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/lectures/stats_examples.md b/lectures/stats_examples.md
index 8e3f6bce4..8a1e5cccc 100644
--- a/lectures/stats_examples.md
+++ b/lectures/stats_examples.md
@@ -74,11 +74,13 @@ $$
 Let's use Python  draw observations from the distribution and compare the sample mean and variance with the theoretical results.
 
 ```{code-cell} ipython3
+rng = np.random.default_rng()
+
 # specify parameters
 p, n = 0.3, 1_000_000
 
 # draw observations from the distribution
-x = np.random.geometric(p, n)
+x = rng.geometric(p, n)
 
 # compute sample mean and variance
 μ_hat = np.mean(x)
@@ -126,7 +128,7 @@ $$
 r, p, n = 10, 0.3, 1_000_000
 
 # draw observations from the distribution
-x = np.random.negative_binomial(r, p, n)
+x = rng.negative_binomial(r, p, n)
 
 # compute sample mean and variance
 μ_hat = np.mean(x)
@@ -217,7 +219,7 @@ In the below example, we set $\mu = 0, \sigma = 0.1$.
 n = 1_000_000
 
 # draw observations from the distribution
-x = np.random.normal(μ, σ, n)
+x = rng.normal(μ, σ, n)
 
 # compute sample mean and variance
 μ_hat = np.mean(x)
@@ -259,7 +261,7 @@ a, b = 10, 20
 n = 1_000_000
 
 # draw observations from the distribution
-x = a + (b-a)*np.random.rand(n)
+x = a + (b-a)*rng.random(n)
 
 # compute sample mean and variance
 μ_hat = np.mean(x)
@@ -296,9 +298,9 @@ $$
 Let's start by generating a random sample and computing sample moments.
 
 ```{code-cell} ipython3
-x = np.random.rand(1_000_000)
+x = rng.random(1_000_000)
 # x[x > 0.95] = 100*x[x > 0.95]+300
-x[x > 0.95] = 100*np.random.rand(len(x[x > 0.95]))+300
+x[x > 0.95] = 100*rng.random(len(x[x > 0.95]))+300
 x[x <= 0.95] = 0
 
 μ_hat = np.mean(x)
@@ -441,13 +443,13 @@ Let's check with `numpy`.
 n, λ = 1_000_000, 0.3
 
 # draw uniform numbers
-u = np.random.rand(n)
+u = rng.random(n)
 
 # transform
 x = -np.log(1-u)/λ
 
 # draw geometric distributions
-x_g = np.random.exponential(1 / λ, n)
+x_g = rng.exponential(1 / λ, n)
 
 # plot and compare
 plt.hist(x, bins=100, density=True)
@@ -517,13 +519,13 @@ The exponential distribution is the continuous analog of geometric distribution.
 n, λ = 1_000_000, 0.8
 
 # draw uniform numbers
-u = np.random.rand(n)
+u = rng.random(n)
 
 # transform
 x = np.ceil(np.log(1-u)/np.log(λ) - 1)
 
 # draw geometric distributions
-x_g = np.random.geometric(1-λ, n)
+x_g = rng.geometric(1-λ, n)
 
 # plot and compare
 plt.hist(x, bins=150, density=True)
@@ -531,7 +533,7 @@ plt.show()
 ```
 
 ```{code-cell} ipython3
-np.random.geometric(1-λ, n).max()
+rng.geometric(1-λ, n).max()
 ```
 
 ```{code-cell} ipython3

From e2e97a4cf85fefeaa7184b814e71173f7473eceb Mon Sep 17 00:00:00 2001
From: thomassargent30 <ts43@nyu.edu>
Date: Sun, 31 May 2026 20:18:12 +0900
Subject: [PATCH 17/25] Tom's May 31 edits of long-run risk lecture

---
 lectures/long_run_risk_operator.md | 94 +++++++++++++++---------------
 1 file changed, 48 insertions(+), 46 deletions(-)

diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index 917d8efd5..03f962837 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -28,33 +28,30 @@ kernelspec:
 
 ## Overview
 
-How should we value a cash flow that pays off thirty years from now?
-
 Standard short-horizon asset pricing tells us how investors are compensated
 for tiny, instantaneous exposures to shocks, the *short end* of the term
 structure of risk prices.
 
-But many of the most interesting asset-pricing questions, the equity
+But many of the most interesting asset-pricing questions -- e.g., the equity
 premium puzzle, the slope of the yield curve, the prices of long-dated
-options, depend on what happens at the *long end* instead.
+options -- are about the  *long end* of the term structure of risk prices.
 
 This lecture studies the long end using the operator approach of
 {cite:t}`HansenScheinkman2009`.
 
-At the center of the play is a stochastic discount factor or a return is
-multiplicative across time, so its expectation defines a *semigroup* of
+At  center stage will be  a stochastic discount factor process and a  return process that are
+multiplicative across time in the sense that conditional expectations define a *semigroup* of
 valuation operators indexed by horizon $t$.
 
 Long-horizon behaviour of the semigroup is controlled by a single eigenvalue
 problem on the state space.
 
-When we solve that eigenvalue problem and pick the right eigenfunction, the
-multiplicative functional $M_t$ factors into three economically meaningful
-pieces: a deterministic exponential trend, a martingale that changes
+By solving that eigenvalue problem and selecting an appropriate eigenfunction, we can factor the
+multiplicative functional $M_t$  into three economically meaningful
+pieces: a deterministic exponential trend, a non-negative martingale that changes
 probability measure, and a transient state-dependent component.
 
-We will derive this factorization carefully, but here is the headline result
-to keep in mind:
+In particular, the factorization takes the form
 
 $$
     M_t
@@ -75,7 +72,7 @@ where
   twisted process settles into its stationary distribution.
 
 In finite-state problems this is exactly the Perron-Frobenius decomposition of
-a positive matrix; in general state spaces it is a continuous-state analogue.
+a positive matrix; in general state spaces it is a continuous-state counterpart.
 
 We will refer to {eq}`eq:hs-factorization` as the **multiplicative
 factorization** associated with $(\rho,\phi,\hat M)$.
@@ -92,19 +89,19 @@ This lecture is closely related to the advanced lecture
 permanent-transitory decomposition for additive and multiplicative
 functionals in a discrete-time linear-Gaussian setting.
 
-Reading the two together is a good way to see the same long-run risk ideas
+Reading these two lectures together is a good way to learn about representations of long-run risks
 in both continuous and discrete time.
 ```
 
 We will build up to {eq}`eq:hs-factorization` and use it to compute
 long-run risk prices in concrete models.
 
-The plan is:
+The plan of this lecture is to:
 
 1. Set up positive multiplicative functionals $M$ (discount factors, returns,
    stochastic growth) and the valuation semigroups they generate.
 
-2. Introduce the **generator** of a semigroup, the local operator whose
+2. Introduce the **generator** of a semigroup, a local operator whose
    eigenvalue problem controls long-run behaviour.
 
 3. Find the principal eigenfunction $\phi$ and derive the factorization.
@@ -114,14 +111,13 @@ The plan is:
    the eigenfunction is exponential-affine and we get closed-form formulas).
 
 5. Use the factorization to compute long-run risk prices and compare them to
-   the local risk prices that would be reported by short-horizon asset
+   the local risk prices appropriate for short-horizon asset
    pricing.
 
-A recurring theme will be that local and long-run risk prices can differ
-sharply when shocks move persistent state variables.
+A recurring theme will be that when shocks move persistent state variables,  local and long-run risk prices can differ markedly.
 
-That is the key
-mechanism that makes long-run risk models like {cite:t}`Bansal_Yaron_2004`
+That difference underlies the
+mechanism that lets long-run risk models like {cite:t}`Bansal_Yaron_2004`
 generate large equity premia.
 
 We start with the following imports
@@ -143,8 +139,8 @@ history.
 We will work with a strong Markov process whose sample paths are càdlàg
 (defined below).
 
-For the explicit formulas later we will specialize to a semimartingale,
-which decomposes into a continuous component $X^c$ and a pure-jump
+For the explicit formulas later we will specialize to a semimartingale that
+decomposes into a continuous component $X^c$ and a pure-jump
 component $X^j$:
 
 $$
@@ -176,18 +172,18 @@ the form $\phi(y) - \phi(x)$ that appears in the generator below.
 We also impose two simplifying assumptions:
 
 * **Finite jumps** on finite time intervals: only finitely many jumps
-  occur on any bounded interval, which keeps integrals against the jump
+  occur on any bounded interval; this keeps integrals against the jump
   measure well-defined and finite.
 * **Sufficient rank in $\Gamma$** so that the Brownian shocks relevant for
-  pricing can be recovered from the state history, which is what makes the
-  Markov state $X$ "rich enough" to be a sufficient statistic for valuation.
+  pricing can be recovered from the state history; this  makes the
+  Markov state $X$ "rich enough" to describe valuation.
 
-They let us write the generator
+These assumptions let us write the generator
 in closed form and use martingale-based changes of measure freely.
 
 ### Functionals and càdlàg paths
 
-We need a name for "any process that records something about the history of
+We need a name for "a process that records something about the history of
 $X$".
 
 This includes, for example, a stochastic discount factor or a cumulated return.
@@ -213,7 +209,7 @@ $$
 and the left limit $M_{t-}(\omega) := \lim_{s \uparrow t} M_s(\omega)$ exists
 and is finite for all $t > 0$.
 
-In words: paths may jump, but each jump $\Delta M_t := M_t - M_{t-}$ resolves
+Thus,  paths can jump, but each jump $\Delta M_t := M_t - M_{t-}$ occurs
 instantaneously.
 
 At the jump time $t$, the value is the post-jump value,
@@ -248,7 +244,7 @@ $$ (eq:multiplicative)
 where $\theta_t$ shifts the underlying Markov path forward by $t$ units.
 ```
 
-Why is this the natural condition?
+Why is this a useful  condition to require?
 
 Think of $M_t = S_t$, a stochastic discount factor.
 
@@ -263,7 +259,9 @@ $$
 
 For the price to depend only on the current Markov state $X_\tau$ (and not on
 the entire history up to $\tau$), the ratio $S_t/S_\tau$ must be a function
-only of the Markov path *after* $\tau$. That is,
+only of the Markov path *after* $\tau$. 
+
+Thus,
 $S_{\tau+u}/S_\tau = S_u(\theta_\tau)$, which is exactly
 {eq}`eq:multiplicative`.
 
@@ -384,7 +382,7 @@ The semigroup identity says these two procedures give the same answer.
 This is the operator-level version of the intertemporal consistency that
 rules out arbitrage across horizons.
 
-Four positive multiplicative functionals will appear throughout.
+Four positive multiplicative functionals will appear often below.
 
 | Symbol | Object | Semigroup |
 |:---|:---|:---:|
@@ -463,7 +461,7 @@ $$
 so a transient state-dependent factor can be shuffled between $G$ and $\psi$
 without changing $D_t$.
 
-We resolve this by normalizing the growth component so its permanent part is
+We resolve this indeterminacy by normalizing the growth component so that its permanent part is
 a martingale: $G_t = \exp(\delta t)\hat G_t$, with $\hat G$ a martingale and
 $\delta$ a constant trend.
 
@@ -478,7 +476,7 @@ like at the *short* end.
 
 That is the standard instantaneous risk-return relation.
 
-This will give us a benchmark to compare long-run risk prices against later.
+This will give us a benchmark against which to compare long-run risk prices.
 
 For a textbook discrete-time treatment of the same SDF-based asset-pricing
 ideas, see {doc}`advanced:asset_pricing_lph`. 
@@ -569,7 +567,7 @@ $$
 \end{aligned}
 $$ (eq:local-risk-return)
 
-Thus the Brownian local risk-price vector is $-\gamma^s(x)$, expressed in
+Thus, the Brownian local risk-price vector is $-\gamma^s(x)$, expressed in
 the same exposure units as $\gamma^v(x)$.
 
 Jump risk is priced through the function $\kappa^s$.
@@ -583,8 +581,10 @@ The eigenvalue calculations below describe the other end.
 So far we have a family of operators $\{\mathbb M_t\}_{t \geq 0}$, one for each
 horizon $t$.
 
-That is more information than we can analyze directly and what we really
-want is the behaviour of $\mathbb M_t \psi$ as $t \to \infty$.
+That is more information than we can analyze directly.
+
+Actually, what we really
+care about is the behaviour of $\mathbb M_t \psi$ as $t \to \infty$.
 
 The **generator** $\mathbb A$ compresses the entire semigroup into one
 time-independent operator on the state space.
@@ -592,7 +592,7 @@ time-independent operator on the state space.
 It records the *instantaneous* rate of change of $M_t \psi(X_t)$, and its
 eigenvalues drive the long-run growth rate of $\mathbb M_t$.
 
-That is what lets us turn an asymptotic question about a family of operators
+This lets us turn an asymptotic question about a family of operators
 into a single eigenvalue problem.
 
 ### Discrete-time intuition
@@ -656,7 +656,7 @@ $M_n \psi(X_n)$, and through $K^n$ it also controls long-run growth.
 
 Continuous time keeps the same logic.
 
-The natural replacement for $K-I$ is the **infinitesimal generator** of the
+The natural counterpart to $K-I$ is the **infinitesimal generator** of the
 semigroup $\{\mathbb M_t\}$, the time derivative at zero:
 
 $$
@@ -716,7 +716,7 @@ martingale property of $\hat M$.
 
 ### Extended generator
 
-There is a catch with the limit definition above.
+There is a qualification to the limit definition above.
 
 To make the limit $h \downarrow 0$ rigorous, the textbook definition
 requires $(\mathbb M_h\psi - \psi)/h$ to converge to $\mathbb A\psi$ in a
@@ -735,7 +735,7 @@ limit need not converge for them.
 through a *Doob-Meyer style* semimartingale decomposition of
 $M_t \psi(X_t)$, a pathwise condition that does not require any norm.
 
-This is the continuous-time analogue of writing $K - I$ as the predictable
+This is the continuous-time counterpart to writing $K - I$ as the predictable
 rate of change of $M_n\psi(X_n)$ in discrete time.
 
 The resulting **extended generator** admits unbounded $\psi$, has a
@@ -749,7 +749,7 @@ Concretely:
 
 Fix a Borel function $\psi$, and look for a second Borel function $\chi$ that
 will play the role of "the instantaneous rate of change of $M_t \psi(X_t)$
-at the current state". Precisely, we ask whether there exists $\chi$ such
+at the current state". We ask whether there exists $\chi$ such
 that
 
 $$
@@ -962,7 +962,7 @@ $$
 $$
 ```
 
-The verification only showed that $\hat M$ is a *local* martingale, but
+The verification establishes only that $\hat M$ is a *local* martingale, but
 the definition above (and the change-of-measure interpretation of
 $\hat M$) both require it to be a martingale.
 
@@ -989,7 +989,7 @@ so $\rho$ is at least an upper bound on the long-run growth rate of
 $\mathbb M_t \phi$.
 
 When $\hat M$ is in fact a martingale, $E\hat M_t = 1$, the inequality
-becomes equality, and the local condition $\mathbb A\phi = \rho\phi$ lifts
+becomes an equality, and the local condition $\mathbb A\phi = \rho\phi$ lifts
 to the semigroup eigenvalue equation
 
 $$
@@ -1002,7 +1002,7 @@ $$ (eq:semigroup-eigen)
 We now have a factorization {eq}`eq:hs-factorization` for *any* principal
 eigenfunction.
 
-But for $(\rho,\phi)$ to actually describe **long-run** behaviour of
+But for $(\rho,\phi)$ to describe **long-run** behaviour of
 $\mathbb M_t$  the twisted
 process must settle into a stationary regime as $t \to \infty$.
 
@@ -1076,12 +1076,14 @@ $$
 $$
 ```
 
-Reachability (Condition 2) is not enough. A region
+Reachability (Condition 2) is not enough.
+
+A region
 might be reachable but visited only with small probability, so time averages
 fail to converge to $\hat\varsigma$-averages. 
 
 Harris recurrence is the
-continuous-state replacement for "recurrent state" in a finite chain.
+continuous-state counterpart to a "recurrent state" in a finite chain.
 
 Bundling these together:
 

From c319130fde80d903bfa352f9bdb1de17407cc1fd Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Mon, 1 Jun 2026 20:35:50 +1000
Subject: [PATCH 18/25] updates

---
 lectures/ls_learning.md | 237 +++++++++++++++++++++++++++++-----------
 1 file changed, 175 insertions(+), 62 deletions(-)

diff --git a/lectures/ls_learning.md b/lectures/ls_learning.md
index c00813af8..8ad46147c 100644
--- a/lectures/ls_learning.md
+++ b/lectures/ls_learning.md
@@ -54,8 +54,8 @@ will they converge to the REE?
 technique from systems-control engineering: the differential equation
 approach of {cite:t}`Ljung1977`.
 
-The key insight is that the stochastic
-difference equation describing how beliefs evolve can be approximated, in the
+They used the fact that the stochastic
+difference equation describing how beliefs evolve can be approximated, in the
 limit, by a deterministic ordinary differential equation (ODE).
 
 Almost-sure
@@ -71,13 +71,15 @@ whose data-generating process shifts with beliefs) discussed in
 
 
 
-Let's begin with the imports we'll use throughout.
+Let's begin with the imports we'll use throughout
 
 ```{code-cell} ipython3
 import numpy as np
 import matplotlib.pyplot as plt
-from scipy.integrate import solve_ivp
 from matplotlib.gridspec import GridSpec
+from numpy import linalg as la
+from scipy.integrate import solve_ivp
+from scipy.optimize import fsolve
 
 np.random.seed(42)
 ```
@@ -575,7 +577,12 @@ def simulate_rls_with_projection(T_map, σ_u, β0, K_proj,
 
 
 a_bray_pf, b_bray_pf, σ_pf = 1.0, 0.6, 1.5
-T_bray_pf = lambda β: a_bray_pf + b_bray_pf * β
+
+
+def T_bray_pf(β):
+    return a_bray_pf + b_bray_pf * β
+
+
 β_f_pf = a_bray_pf / (1 - b_bray_pf)
 β0_far = 8.0
 K_pf = 5.0
@@ -599,17 +606,27 @@ for i in range(min(30, N_pf_sim)):
 ax1.plot(np.mean(paths_pf, axis=0), color='navy', lw=2, label='average')
 ax1.axhline(β_f_pf, color='red', ls='--', lw=2,
             label=f'$\\beta_f={β_f_pf:.1f}$')
-ax1.axhline(K_pf, color='gray', ls=':', lw=2, label=f'$D_1$ boundary ($K={K_pf}$)')
+ax1.axhline(
+    K_pf, color='gray', ls=':', lw=2,
+    label=f'$D_1$ boundary ($K={K_pf}$)'
+)
 ax1.axhline(-K_pf, color='gray', ls=':', lw=2)
-ax1.set_xlabel('$t$'); ax1.set_ylabel('$\\beta_t$'); ax1.legend(fontsize=8)
+ax1.set_xlabel('$t$')
+ax1.set_ylabel('$\\beta_t$')
+ax1.legend(fontsize=8)
 
 ax2 = fig.add_subplot(gs[0, 1])
 for i in range(min(30, N_pf_sim)):
     ax2.plot(paths_no_pf[i], color='darkorange', alpha=0.25, lw=2)
-ax2.plot(np.mean(paths_no_pf, axis=0), color='saddlebrown', lw=2, label='average')
+ax2.plot(
+    np.mean(paths_no_pf, axis=0), color='saddlebrown', lw=2,
+    label='average'
+)
 ax2.axhline(β_f_pf, color='red', ls='--', lw=2,
             label=f'$\\beta_f={β_f_pf:.1f}$')
-ax2.set_xlabel('$t$'); ax2.set_ylabel('$\\beta_t$'); ax2.legend(fontsize=8)
+ax2.set_xlabel('$t$')
+ax2.set_ylabel('$\\beta_t$')
+ax2.legend(fontsize=8)
 
 ax3 = fig.add_subplot(gs[1, 0])
 ax3.hist(n_proj, bins=range(0, int(n_proj.max()) + 2),
@@ -629,7 +646,10 @@ plt.show()
 print(f"Paths with at least one projection: {(n_proj > 0).sum()} / {N_pf_sim}")
 print(f"Mean number of projections per path: {n_proj.mean():.2f}")
 print(f"Max number of projections:           {n_proj.max()}")
-print(f"Mean last-projection period:         {first_free[n_proj>0].mean():.1f}")
+print(
+    "Mean last-projection period:         "
+    f"{first_free[n_proj > 0].mean():.1f}"
+)
 ```
 
 The simulation illustrates the key theoretical point from
@@ -789,7 +809,12 @@ mystnb:
     name: fig-bray-learning-dynamics
 ---
 a_bray, b_bray, σ_bray = 1.0, 0.6, 1.0
-T_bray = lambda β: a_bray + b_bray * β
+
+
+def T_bray(β):
+    return a_bray + b_bray * β
+
+
 β_f_bray = a_bray / (1 - b_bray)
 
 β0_bray = 0.0
@@ -799,7 +824,10 @@ N_sim = 80
 β_paths_bray = simulate_rls_scalar(T_bray, σ_bray, β0_bray,
                                       T_periods=T_sim, N_paths=N_sim)
 
-ode_bray = lambda β: a_bray + b_bray * β - β
+def ode_bray(β):
+    return a_bray + b_bray * β - β
+
+
 t_ode, sol_low = solve_ode(ode_bray, 0.0)
 _, sol_high = solve_ode(ode_bray, 4.5)
 
@@ -817,8 +845,14 @@ ax.set_ylabel('$\\beta_t$')
 ax.legend()
 
 ax = axes[1]
-ax.plot(t_ode, sol_low,  color='steelblue', lw=2, label='ODE from $\\beta_0=0$')
-ax.plot(t_ode, sol_high, color='darkorange', lw=2, label='ODE from $\\beta_0=4.5$')
+ax.plot(
+    t_ode, sol_low, color='steelblue', lw=2,
+    label='ODE from $\\beta_0=0$'
+)
+ax.plot(
+    t_ode, sol_high, color='darkorange', lw=2,
+    label='ODE from $\\beta_0=4.5$'
+)
 ax.axhline(β_f_bray, color='red', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_bray:.2f}$')
 ax.set_xlabel('$t$')
@@ -841,13 +875,21 @@ mystnb:
     name: fig-bray-savin-learning-dynamics
 ---
 m_bs, a_bs, σ_bs = 0.5, 0.7, 1.0
-T_bs = lambda β: m_bs + a_bs * β
+
+
+def T_bs(β):
+    return m_bs + a_bs * β
+
+
 β_f_bs = m_bs / (1 - a_bs)
 
 β_paths_bs = simulate_rls_scalar(T_bs, σ_bs, 0.0,
                                     T_periods=T_sim, N_paths=N_sim)
 
-ode_bs = lambda β: T_bs(β) - β
+def ode_bs(β):
+    return T_bs(β) - β
+
+
 t_ode_bs, sol_bs_low = solve_ode(ode_bs, 0.0)
 _, sol_bs_high = solve_ode(ode_bs, 4.0)
 
@@ -860,15 +902,23 @@ ax.plot(np.mean(β_paths_bs, axis=0), color='saddlebrown', lw=2,
         label='cross-path average')
 ax.axhline(β_f_bs, color='red', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_bs:.2f}$')
-ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta_t$')
 ax.legend()
 
 ax = axes[1]
-ax.plot(t_ode_bs, sol_bs_low,  color='darkorange', lw=2, label='ODE from $\\beta_0=0$')
-ax.plot(t_ode_bs, sol_bs_high, color='steelblue',  lw=2, label='ODE from $\\beta_0=4$')
+ax.plot(
+    t_ode_bs, sol_bs_low, color='darkorange', lw=2,
+    label='ODE from $\\beta_0=0$'
+)
+ax.plot(
+    t_ode_bs, sol_bs_high, color='steelblue', lw=2,
+    label='ODE from $\\beta_0=4$'
+)
 ax.axhline(β_f_bs, color='red', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_bs:.2f}$')
-ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta(t)$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta(t)$')
 ax.legend()
 
 plt.tight_layout()
@@ -887,13 +937,21 @@ mystnb:
     name: fig-present-value-learning-dynamics
 ---
 λ, ρ_pv, σ_pv = 0.8, 0.9, 1.0
-T_pv = lambda β: (λ * β + 1) * ρ_pv
+
+
+def T_pv(β):
+    return (λ * β + 1) * ρ_pv
+
+
 β_f_pv = ρ_pv / (1 - λ * ρ_pv)
 
 β_paths_pv = simulate_rls_scalar(T_pv, σ_pv, 0.0,
                                     T_periods=T_sim, N_paths=N_sim)
 
-ode_pv = lambda β: T_pv(β) - β
+def ode_pv(β):
+    return T_pv(β) - β
+
+
 t_ode_pv, sol_pv_low = solve_ode(ode_pv, 0.0, t_span=(0, 50))
 _, sol_pv_high = solve_ode(ode_pv, 10.0, t_span=(0, 50))
 
@@ -906,15 +964,23 @@ ax.plot(np.mean(β_paths_pv, axis=0), color='darkgreen', lw=2,
         label='cross-path average')
 ax.axhline(β_f_pv, color='red', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_pv:.2f}$')
-ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta_t$')
 ax.legend()
 
 ax = axes[1]
-ax.plot(t_ode_pv, sol_pv_low,  color='seagreen',  lw=2, label='ODE from $\\beta_0=0$')
-ax.plot(t_ode_pv, sol_pv_high, color='steelblue', lw=2, label='ODE from $\\beta_0=10$')
+ax.plot(
+    t_ode_pv, sol_pv_low, color='seagreen', lw=2,
+    label='ODE from $\\beta_0=0$'
+)
+ax.plot(
+    t_ode_pv, sol_pv_high, color='steelblue', lw=2,
+    label='ODE from $\\beta_0=10$'
+)
 ax.axhline(β_f_pv, color='red', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_pv:.2f}$')
-ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta(t)$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta(t)$')
 ax.legend()
 
 plt.tight_layout()
@@ -935,14 +1001,22 @@ mystnb:
     name: fig-unstable-bray-dynamics
 ---
 b_unstable = 1.4
-T_unstable = lambda β: a_bray + b_unstable * β
+
+
+def T_unstable(β):
+    return a_bray + b_unstable * β
+
+
 β_f_unstable = a_bray / (1 - b_unstable)
 
 β_paths_unstable = simulate_rls_scalar(
     T_unstable, σ_bray, β0=0.0,
     T_periods=200, N_paths=50)
 
-ode_unstable = lambda β: T_unstable(β) - β
+def ode_unstable(β):
+    return T_unstable(β) - β
+
+
 
 β_grid = np.linspace(-5, 5, 300)
 drift = np.array([ode_unstable(b) for b in β_grid])
@@ -954,7 +1028,8 @@ for i in range(min(30, 50)):
     ax.plot(β_paths_unstable[i], color='crimson', alpha=0.3, lw=2)
 ax.axhline(β_f_unstable, color='black', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_unstable:.2f}$ (unstable)')
-ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta_t$')
 ax.legend()
 
 ax = axes[1]
@@ -966,7 +1041,8 @@ ax.fill_between(β_grid, drift, 0,
                 where=(drift > 0), color='crimson', alpha=0.15)
 ax.fill_between(β_grid, drift, 0,
                 where=(drift < 0), color='steelblue', alpha=0.15)
-ax.set_xlabel('$\\beta$'); ax.set_ylabel('$T(\\beta) - \\beta$')
+ax.set_xlabel('$\\beta$')
+ax.set_ylabel('$T(\\beta) - \\beta$')
 ax.legend()
 
 plt.tight_layout()
@@ -997,9 +1073,24 @@ mystnb:
 β_vec = np.linspace(-1.0, 5.5, 400)
 
 models = [
-    ("Bray ($b=0.6$)",       lambda b: a_bray + 0.6*b - b,   a_bray/(1-0.6),   'steelblue'),
-    ("Bray–Savin ($a=0.7$)", lambda b: m_bs + 0.7*b - b,     m_bs/(1-0.7),     'darkorange'),
-    ("Present-value",        lambda b: T_pv(b) - b,           β_f_pv,        'seagreen'),
+    (
+        "Bray ($b=0.6$)",
+        lambda b: a_bray + 0.6 * b - b,
+        a_bray / (1 - 0.6),
+        'steelblue'
+    ),
+    (
+        "Bray–Savin ($a=0.7$)",
+        lambda b: m_bs + 0.7 * b - b,
+        m_bs / (1 - 0.7),
+        'darkorange'
+    ),
+    (
+        "Present-value",
+        lambda b: T_pv(b) - b,
+        β_f_pv,
+        'seagreen'
+    ),
 ]
 
 fig, axes = plt.subplots(1, 3, figsize=(15, 5))
@@ -1015,9 +1106,11 @@ for ax, (name, ode_fn, bf, color) in zip(axes, models):
                     color=color, alpha=0.12)
     for bv in np.linspace(β_vec[20], β_vec[-20], 7):
         d = ode_fn(bv)
-        ax.annotate('', xy=(bv + 0.3*np.sign(d), 0),
-                    xytext=(bv, 0),
-                    arrowprops=dict(arrowstyle='->', color=color, lw=2))
+        ax.annotate(
+            '', xy=(bv + 0.3 * np.sign(d), 0),
+            xytext=(bv, 0),
+            arrowprops=dict(arrowstyle='->', color=color, lw=2)
+        )
     ax.set_xlabel('$\\beta$')
     ax.set_ylabel('$T(\\beta) - \\beta$')
     ax.legend(fontsize=9)
@@ -1042,10 +1135,10 @@ mystnb:
 def T_invest(β, b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, ρ_w=0.5):
     """Mapping T for the investment model with β = [β1, β2]."""
     b1, b2 = β
-    denom1 = 1 - b1*b + (1/d)*f**2*A1*N
-    T1 = (1 - b1*b) / denom1
-    numer2 = (1 - b1*b + f**2*A1*b2*b*ρ_w)
-    T2 = (-N / (d*(1 - ρ_w*b))) * (numer2 / denom1) * ρ_w
+    denom1 = 1 - b1 * b + (1 / d) * f**2 * A1 * N
+    T1 = (1 - b1 * b) / denom1
+    numer2 = 1 - b1 * b + f**2 * A1 * b2 * b * ρ_w
+    T2 = (-N / (d * (1 - ρ_w * b))) * (numer2 / denom1) * ρ_w
     return np.array([T1, T2])
 
 
@@ -1054,20 +1147,19 @@ def ode_invest(t, β, **kwargs):
     return Tb - β
 
 
-from scipy.optimize import fsolve
-
 params = dict(b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, ρ_w=0.5)
 β_f_inv = fsolve(lambda b: T_invest(b, **params) - b, [0.5, 0.1])
 print(f"REE: β_f = {β_f_inv}")
 
-from numpy import linalg as la
-
 eps = 1e-6
 J = np.zeros((2, 2))
 for j in range(2):
-    e = np.zeros(2); e[j] = eps
-    J[:, j] = (T_invest(β_f_inv + e, **params) -
-               T_invest(β_f_inv - e, **params)) / (2*eps)
+    e = np.zeros(2)
+    e[j] = eps
+    J[:, j] = (
+        T_invest(β_f_inv + e, **params)
+        - T_invest(β_f_inv - e, **params)
+    ) / (2 * eps)
 M = J - np.eye(2)
 eigs = la.eigvals(M)
 print(f"Jacobian M eigenvalues: {eigs}")
@@ -1078,12 +1170,14 @@ fig, ax = plt.subplots(figsize=(8, 6))
 b1_grid = np.linspace(-0.1, 1.2, 20)
 b2_grid = np.linspace(-0.8, 0.5, 20)
 B1, B2 = np.meshgrid(b1_grid, b2_grid)
-U = np.zeros_like(B1); V_field = np.zeros_like(B2)
+U = np.zeros_like(B1)
+V_field = np.zeros_like(B2)
 for i in range(B1.shape[0]):
     for j in range(B1.shape[1]):
-        β_ij = np.array([B1[i,j], B2[i,j]])
+        β_ij = np.array([B1[i, j], B2[i, j]])
         drift = T_invest(β_ij, **params) - β_ij
-        U[i,j] = drift[0]; V_field[i,j] = drift[1]
+        U[i, j] = drift[0]
+        V_field[i, j] = drift[1]
 
 speed = np.sqrt(U**2 + V_field**2)
 speed[speed == 0] = 1e-8
@@ -1100,7 +1194,7 @@ for (b10, b20), col in zip(starts, colors_traj):
     ax.plot(sol.y[0], sol.y[1], color=col, lw=2)
     ax.plot(b10, b20, 'o', color=col, ms=7)
 
-ax.plot(*β_f_inv, 'k*', ms=14, label=f'REE $\\beta_f$')
+ax.plot(*β_f_inv, 'k*', ms=14, label='REE $\\beta_f$')
 ax.set_xlabel('$\\beta_1$', fontsize=12)
 ax.set_ylabel('$\\beta_2$', fontsize=12)
 ax.legend()
@@ -1139,7 +1233,8 @@ ax.axhline(β_f_bray, color='red', ls='--', lw=2,
            label=f'REE $\\beta_f = {β_f_bray:.2f}$')
 ax.axhline(β_false_rest, color='gray', ls=':', lw=2,
            label=f'False start $\\beta_0 = {β_false_rest}$')
-ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta_t$')
 ax.legend()
 plt.tight_layout()
 plt.show()
@@ -1271,7 +1366,9 @@ colors_ex = ['steelblue', 'darkorange', 'seagreen', 'purple']
 
 fig, ax = plt.subplots(figsize=(11, 5))
 for b_val, col in zip(b_values, colors_ex):
-    T_fn = lambda β, bv=b_val: a_ex + bv * β
+    def T_fn(β, b_val=b_val):
+        return a_ex + b_val * β
+
     paths = simulate_rls_scalar(T_fn, σ_u=1.0, β0=0.0,
                                 T_periods=T_ex, N_paths=N_ex, seed=0)
     bf = a_ex / (1 - b_val)
@@ -1326,7 +1423,10 @@ the paths diverge.
 ```{code-cell} ipython3
 fig, axes = plt.subplots(1, 2, figsize=(14, 5))
 
-T_st = lambda β: 1.0 + 0.6*β
+def T_st(β):
+    return 1.0 + 0.6 * β
+
+
 paths_far = simulate_rls_scalar(T_st, 1.0, β0=6.0,
                                 T_periods=600, N_paths=100, seed=1)
 ax = axes[0]
@@ -1335,9 +1435,14 @@ for i in range(40):
 ax.plot(np.mean(paths_far, axis=0), color='navy', lw=2, label='average')
 ax.axhline(2.5, color='red', ls='--', lw=2, label='$\\beta_f = 2.5$')
 ax.set_title('Stable ($b=0.6$): far start still converges')
-ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$'); ax.legend()
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta_t$')
+ax.legend()
+
+def T_un(β):
+    return 1.0 + 1.5 * β
+
 
-T_un = lambda β: 1.0 + 1.5*β
 β_f_un = 1.0 / (1 - 1.5)
 paths_un = simulate_rls_scalar(T_un, 1.0, β0=0.1,
                                T_periods=200, N_paths=50, seed=2)
@@ -1347,7 +1452,9 @@ for i in range(50):
 ax.axhline(β_f_un, color='black', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_un}$ (unstable)')
 ax.set_title('Unstable ($b=1.5$): diverges even near REE')
-ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$'); ax.legend()
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta_t$')
+ax.legend()
 
 plt.tight_layout()
 plt.show()
@@ -1357,7 +1464,7 @@ plt.show()
 
 ```{code-cell} ipython3
 β_g = np.linspace(-8, 6, 400)
-drift_un = np.array([1.0 + 1.5*b - b for b in β_g])
+drift_un = np.array([1.0 + 1.5 * b - b for b in β_g])
 
 fig, ax = plt.subplots(figsize=(8, 4))
 ax.plot(β_g, drift_un, color='crimson', lw=2)
@@ -1368,7 +1475,8 @@ ax.fill_between(β_g, drift_un, 0, where=(drift_un > 0),
                 color='crimson', alpha=0.15)
 ax.fill_between(β_g, drift_un, 0, where=(drift_un < 0),
                 color='steelblue', alpha=0.15)
-ax.set_xlabel('$\\beta$'); ax.set_ylabel('$T(\\beta) - \\beta$')
+ax.set_xlabel('$\\beta$')
+ax.set_ylabel('$T(\\beta) - \\beta$')
 ax.set_title('Phase Diagram: Unstable REE ($b=1.5$)\n'
              'Drift points away from $\\beta_f$ everywhere')
 ax.legend()
@@ -1432,8 +1540,12 @@ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
 colors_λ = ['steelblue', 'darkorange', 'seagreen', 'purple']
 
 for ax, lv, col in zip(axes.flat, λ_values, colors_λ):
-    T_fn = lambda β, l=lv: (l * β + 1) * ρ_ex
-    ode_fn = lambda β, l=lv: T_fn(β, l) - β
+    def T_fn(β, λ_val=lv):
+        return (λ_val * β + 1) * ρ_ex
+
+    def ode_fn(β, λ_val=lv):
+        return (λ_val * β + 1) * ρ_ex - β
+
     bf = ρ_ex / (1 - lv * ρ_ex) if abs(lv * ρ_ex) < 1 else None
 
     paths_λ = simulate_rls_scalar(T_fn, 1.0, β0=0.0,
@@ -1450,7 +1562,8 @@ for ax, lv, col in zip(axes.flat, λ_values, colors_λ):
 
     M_jac = lv * ρ_ex - 1
     ax.set_title(f'$\\lambda={lv}$,  $\\mathcal{{M}}={M_jac:.3f}$')
-    ax.set_xlabel('$t$'); ax.set_ylabel('$\\beta_t$')
+    ax.set_xlabel('$t$')
+    ax.set_ylabel('$\\beta_t$')
     ax.legend(fontsize=8)
 
 plt.tight_layout()

From 008dfab960cf63c8f1699fea181d5f8b67d9b474 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Mon, 1 Jun 2026 20:39:27 +1000
Subject: [PATCH 19/25] updates

---
 lectures/rational_learning_re.md | 881 ++++++++++++-------------------
 1 file changed, 342 insertions(+), 539 deletions(-)

diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index 12108b98b..b96a45dbd 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -30,7 +30,7 @@ kernelspec:
 
 This lecture explores a classic question in economic theory: can agents *learn* their way to a rational expectations equilibrium?
 
-{cite:t}`BrayKreps1987` examine this question in a rigorously specified model.
+The starting point is {cite:t}`BrayKreps1987`, which gives a rigorous model of Bayesian learning inside a rational expectations equilibrium.
 
 In a rational expectations equilibrium, agents use market prices to make inferences about other agents' private information.
 
@@ -38,18 +38,18 @@ Each agent knows the *statistical relationship* between prices and the underlyin
 
 But this raises a question: where does that knowledge come from?
 
-The **rational learning** approach studied by Bray and Kreps asks whether agents who start with uncertainty about the equilibrium price function can, over time, learn it from observations of past prices.
+The **rational learning** approach asks whether agents who start with uncertainty about the equilibrium price function can, over time, learn it from observations of past prices.
 
 The key findings are:
 
 * In every rational learning model, posterior assessments converge because they are bounded martingales.
-* In the paper's benchmark example, the uninformed agent learns the informed agent's risk tolerance.
+* In the benchmark example, the uninformed agent learns the informed agent's risk tolerance.
 * Correct learning requires identification, smooth equilibrium price maps, and positive prior probability for the true model.
 
-This lecture presents the Bray–Kreps framework, explains their benchmark example, and provides Python code for a simplified Bayesian learning illustration.
+This lecture presents the framework, explains the benchmark example, and provides Python code that solves the full equilibrium with rational learning.
 
 
-We focus on {cite:t}`BrayKreps1987`, published in *Arrow and the Ascent of Modern Economic Theory*, which synthesizes earlier work by {cite:t}`Bray1982`, {cite:t}`BraySavin1984`, and the rational expectations literature of {cite:t}`Radner1979`, {cite:t}`grossman1976`, and {cite:t}`Jordan1982`.
+The discussion also connects to earlier work by {cite:t}`Bray1982`, {cite:t}`BraySavin1984`, and the rational expectations literature of {cite:t}`Radner1979`, {cite:t}`grossman1976`, and {cite:t}`Jordan1982`.
 
 Let's start with the following imports
 
@@ -62,7 +62,7 @@ import matplotlib.pyplot as plt
 
 ### Agents and assets
 
-The paper's example is an infinitely repeated version of the information model in {cite:t}`GrossmanStiglitz1980`.
+The benchmark example is an infinitely repeated version of the information model in {cite:t}`GrossmanStiglitz1980`.
 
 Each date is economically disconnected from the others, so agents start each period afresh.
 
@@ -136,9 +136,9 @@ $$ (eq:bk-full-communication-price)
 
 Thus if $\sum_n \theta^n$ is known, the price fully reveals $s_t$.
 
-Following {cite:t}`Radner1979`, Bray and Kreps call this a full communication rational expectations equilibrium.
+Following {cite:t}`Radner1979`, this is called a full communication rational expectations equilibrium.
 
-The paper's learning problem starts when $\theta^I$ is unknown to agent $U$.
+Suppose that $\theta^I$ is unknown to agent $U$.
 
 Agent $U$ knows $\sigma^2$ and $\theta^U$, and starts with a prior density over $\theta^I$ on an interval $[a,b] \subset (0,\infty)$.
 
@@ -158,423 +158,349 @@ After trading, agent $U$ observes $r_t$.
 
 Bayes' rule then updates the posterior over $\theta^I$ using the normal density of the signal implied by {eq}`eq:bk-signal-implied` conditional on the realized return.
 
-This is the main object learned in Bray and Kreps' benchmark example.
+This is the main object learned in the benchmark example.
 
-They emphasize that the equilibrium can be defined recursively, but closed-form prices are "out of the question" even in this simple case.
+Even in this simple case, the equilibrium can be defined recursively but closed-form prices are unavailable.
 
-## A simplified Gaussian illustration
+## The rational learning equilibrium
 
-The code below is a pedagogical simplification of the Bayesian consistency logic.
+The model has two pieces that interact at each date.
 
-Instead of solving the full Bray--Kreps equilibrium with a posterior over risk tolerance, it studies a linear observation model
+The first is the within-period equilibrium given the uninformed agent's current posterior on $\theta^I$.
 
-$$
-p_t = b r_t,
-$$
-
-where the single unknown coefficient $b$ plays the role of an identifiable structural parameter.
-
-The point is to illustrate how Bayesian posteriors concentrate when the likelihood is correctly specified and the true parameter is identified by observations.
-
-## The simplified learning model
-
-### Setup
-
-Agent $U$ *does not know* the equilibrium price function.
+The second is the Bayesian update of that posterior after the period closes.
 
-Specifically, $U$ does not know $b^*$.
+### Uninformed demand given beliefs
 
-However, $U$ does know:
-* The distribution of $r_t$: $r_t \sim \mathcal{N}(0, \sigma^2)$ IID.
-* That the price function is *linear*: $p_t = a + b r_t$ for some unknown $b$.
-* The value of $a = 0$.
+Suppose at date $t$ agent $U$ has posterior density $f_t$ on $\theta^I$ supported on $[a, b]$.
 
-So $U$'s task is to learn the single parameter $b$ from observations of prices and (eventually) returns.
+Suppose the equilibrium informed trade and price are $X^I$ and $p$.
 
-### Observing the signal
+From {eq}`eq:bk-signal-implied`, conditional on $\theta^I$, agent $U$ infers $s_t = \sigma^2 X^I/\theta^I + p$.
 
-At date $t$, agent $U$ observes $p_t$.
+Marginalising over $\theta^I \sim f_t$ and combining with $r_t = s_t + \epsilon_t$ where $\epsilon_t \sim \mathcal N(0,\sigma^2)$ gives the implied conditional distribution of $r_t$.
 
-The signal $U$ extracts is the return implied by the price:
+Because CARA preferences have no wealth effects, agent $U$'s problem reduces to
 
 $$
-\hat{r}_t = \frac{p_t}{b_{t-1}}
+\max_{x^U}\,
+E\!\left[-\exp\!\left(-\tfrac{x^U(r_t - p)}{\theta^U}\right)\right],
 $$
 
-where $b_{t-1}$ is $U$'s current estimate of $b^*$.
+where the expectation integrates over $\theta^I \sim f_t$ and $\epsilon_t$.
 
-After date $t$ trading and before date $t+1$, $U$ observes $r_t$ (the actual return is revealed, say through dividend payments).
-
-### Bayesian updating
-
-Agent $U$ begins with a **prior** distribution on $b$:
+Integrating out $\epsilon_t$ first and then $\theta^I$ yields
 
 $$
-b \sim \mathcal{N}(\mu_0, v_0)
+E[u^U]
+=
+-\exp\!\left(\frac{(x^U)^2 \sigma^2}{2(\theta^U)^2}\right)
+\int_a^b f_t(\theta)\,
+\exp\!\left(-\frac{x^U \sigma^2 X^I}{\theta\,\theta^U}\right)
+d\theta.
 $$
 
-Given past data $(r_1, p_1), \ldots, (r_{t-1}, p_{t-1})$, agent $U$'s posterior on $b$ at date $t$ is
+The first-order condition rearranges to
 
 $$
-b \mid \text{data} \sim \mathcal{N}(\mu_t, v_t)
-$$
+\frac{x^U}{\theta^U}
+=
+X^I \;
+\frac{\int_a^b \theta^{-1} f_t(\theta)\,\exp\!\big(-x^U \sigma^2 X^I/(\theta\theta^U)\big)\,d\theta}
+     {\int_a^b f_t(\theta)\,\exp\!\big(-x^U \sigma^2 X^I/(\theta\theta^U)\big)\,d\theta}.
+$$ (eq:bk-foc)
 
-The posterior is updated using Bayes' rule.
+The right-hand side is $X^I$ multiplied by a tilted expectation of $1/\theta^I$ under a weighting that depends on $x^U$ itself.
 
-Since $p_t = b \cdot r_t$ (with $a = 0$), each pair $(r_s, p_s)$ provides the observation $p_s = b \cdot r_s$, i.e., a noisy linear measurement of $b$.
+Equation {eq}`eq:bk-foc` implicitly defines $x^U(X^I; f_t)$, the uninformed agent's optimal demand at conjectured informed trade $X^I$ and posterior $f_t$.
 
-For a Gaussian prior and Gaussian likelihood, the posterior updates as:
+The optimum does not depend separately on $p$, because the distribution of $r_t - p$ implied by the posterior depends only on $X^I$.
 
-$$
-v_t^{-1} = v_0^{-1} + \frac{1}{\sigma^2} \sum_{s=1}^{t} r_s^2
-$$ (eq:posterior_precision)
+### Market clearing
 
-$$
-\mu_t = v_t \left( v_0^{-1} \mu_0 + \frac{1}{\sigma^2} \sum_{s=1}^{t} r_s p_s \right)
-$$ (eq:posterior_mean)
+Market clearing $X^I + x^U(X^I; f_t) = 2$ pins down the equilibrium informed trade $X^I_t$ as a function of beliefs alone.
 
+Plugging $X^I_t$ into {eq}`eq:bk-informed-demand` recovers the equilibrium price
 
-Equations {eq}`eq:posterior_precision` and {eq}`eq:posterior_mean` follow from the standard Gaussian linear regression posterior.
+$$
+p_t = s_t - \frac{\sigma^2 X^I_t}{\theta^I}.
+$$ (eq:bk-price)
 
-Each observation $(r_s, p_s)$ with $p_s = b r_s + 0$ is treated as a noisy signal of $b$ with signal-to-noise ratio $r_s^2 / \sigma^2$.
+When $f_t$ collapses to a point mass at the true $\theta^I$, equation {eq}`eq:bk-foc` simplifies to $x^U/\theta^U = X^I/\theta^I$, and market clearing gives the full-communication allocation
 
+$$
+X^I_t = \frac{2\theta^I}{\theta^I + \theta^U},
+\qquad
+x^U_t = \frac{2\theta^U}{\theta^I + \theta^U}.
+$$ (eq:bk-full-info-trade)
 
-### The simplified convergence result
+This is the CARA-Normal benchmark we will use to check the simulation.
 
-For the simplified Gaussian model, standard Bayesian linear regression implies the following result.
+### Bayesian update
 
-```{prf:proposition}
-:label: prop-rle-gaussian-convergence
+After trading, agent $U$ observes $(p_t, x^U_t, r_t)$.
+
+Market clearing gives $X^I_t = 2 - x^U_t$, and equation {eq}`eq:bk-signal-implied` assigns a candidate $s_t(\theta) = \sigma^2 X^I_t/\theta + p_t$ to each $\theta$.
 
-For any prior $(\mu_0, v_0)$ with $v_0 < \infty$, as $t \to \infty$,
+Since $s_t \sim \mathcal N(\mu_s, \tau^2)$ independently of $\epsilon_t \sim \mathcal N(0,\sigma^2)$, the conditional density of $s_t$ given $r_t$ is Gaussian:
 
 $$
-\mu_t \xrightarrow{a.s.} b^*,
-\qquad
-v_t \xrightarrow{a.s.} 0.
+g(s\mid r)
+=
+\phi\!\left(s;\, \frac{\sigma^2 \mu_s + \tau^2 r}{\sigma^2 + \tau^2},\,
+                  \frac{\sigma^2 \tau^2}{\sigma^2 + \tau^2}\right),
 $$
 
-That is, agent $U$'s posterior distribution on $b$ converges almost surely to a point mass at the true equilibrium value $b^*$.
-```
-
-This statement is included to make the simulation transparent.
+where $\phi(\cdot; m, v)$ denotes the Normal density with mean $m$ and variance $v$.
 
-The more general martingale convergence results for posterior assessments due to {cite:t}`BrayKreps1987` are discussed below.
+Bayes' rule then produces the posterior update
 
-The intuition is straightforward:
+$$
+f_{t+1}(\theta)
+\propto
+f_t(\theta)\;
+g\!\left(\frac{\sigma^2 X^I_t}{\theta} + p_t \,\Big|\, r_t\right).
+$$ (eq:bk-bayes)
 
-* Each period adds a new observation $(r_t, p_t)$ with information content proportional to $r_t^2$.
-* Since $r_t$ is IID with $E[r_t^2] = \sigma^2 > 0$, the cumulative information $\sum_{s=1}^t r_s^2 \to \infty$ by the law of large numbers.
-* Therefore the posterior precision $v_t^{-1} \to \infty$, which means $v_t \to 0$.
-* Since the observations are generated by the true $b^*$, the posterior mean $\mu_t$ converges to $b^*$.
+This is the rule we simulate below.
 
-The proof follows from standard results on Bayesian consistency for correctly specified Gaussian linear models.
+## Computing the equilibrium
 
-## Simulating Bayesian learning
+We discretise the support $[a,b]$ of $\theta^I$ on a fine grid and represent $f_t$ as a vector of density values.
 
-We now implement the Bayesian learning dynamics and verify convergence numerically.
+There are three computational primitives.
 
-### Parameters
+* `uninformed_demand` solves the FOC in {eq}`eq:bk-foc` for $x^U(X^I; f)$ by root-finding.
+* `equilibrium_XI` solves market clearing $X^I + x^U(X^I; f) = 2$ for $X^I_t$.
+* `bayes_update` applies {eq}`eq:bk-bayes` and renormalises.
 
 ```{code-cell} ipython3
-b_true = 2.0
-σ2 = 1.0
-μ_0 = 0.5
-v_0 = 2.0
-T = 300
-N = 200
-
-np.random.seed(42)
+from scipy.optimize import brentq
 ```
 
-### Bayesian updating function
-
 ```{code-cell} ipython3
-def simulate_bayesian_learning(b_true, σ2, μ_0, v_0, T, N):
-    """Simulate Bayesian learning of the REE slope parameter b*."""
-    r = np.random.normal(0, np.sqrt(σ2), size=(N, T))
-    p = b_true * r
-
-    μ_paths = np.empty((N, T))
-    v_paths = np.empty((N, T))
-
-    for i in range(N):
-        precision = 1.0 / v_0
-        weighted_sum = μ_0 / v_0
-
-        for t in range(T):
-            precision += r[i, t]**2 / σ2
-            weighted_sum += r[i, t] * p[i, t] / σ2
-
-            v_t = 1.0 / precision
-            μ_t = v_t * weighted_sum
+def uninformed_demand(XI, f, θ_grid, θ_U, σ2):
+    """
+    Uninformed demand x^U: the root of agent U's first-order condition
+    for a given informed trade XI and belief density f on θ_grid.
+    """
+    with np.errstate(divide='ignore'):
+        log_belief = np.log(f)       # zero-density nodes map to -inf
+
+    def residual(x):
+        # Tilt the belief by exp(-x σ2 XI / (θ θ_U)), working in logs
+        log_tilt = log_belief - x * σ2 * XI / (θ_grid * θ_U)
+        # Subtracting the max keeps every weight inside [0, 1]
+        tilt = np.exp(log_tilt - log_tilt.max())
+        inv_θ_sum, mass = np.sum(tilt / θ_grid), np.sum(tilt)
+        return x / θ_U - XI * inv_θ_sum / mass
+
+    # Brent's method on the fixed bracket [-20, 20]
+    return brentq(residual, -20.0, 20.0, xtol=1e-10)
+```
 
-            μ_paths[i, t] = μ_t
-            v_paths[i, t] = v_t
+```{code-cell} ipython3
+def equilibrium_XI(f, θ_grid, θ_U, σ2):
+    """
+    Informed trade XI in [1e-4, 4] that clears the market, i.e. the
+    root of XI + uninformed_demand(XI, f, θ_grid, θ_U, σ2) - 2.
+    """
+    def mc(XI):
+        return XI + uninformed_demand(XI, f, θ_grid, θ_U, σ2) - 2.0
 
-    return μ_paths, v_paths
+    return brentq(mc, 1e-4, 4.0, xtol=1e-10)
 ```
 
-### Running the simulation
-
 ```{code-cell} ipython3
-μ_paths, v_paths = simulate_bayesian_learning(
-    b_true, σ2, μ_0, v_0, T, N
-)
+def bayes_update(f, θ_grid, p_t, xU_t, r_t, σ2, τ2, μ_s):
+    """
+    Reweight the gridded density f on θ^I by the date-t likelihood.
+    """
+    # Moments of s given r, and the signal each θ implies from (p, x^U)
+    cond_mean = (σ2 * μ_s + τ2 * r_t) / (σ2 + τ2)
+    cond_var = σ2 * τ2 / (σ2 + τ2)
+    signal = σ2 * (2.0 - xU_t) / θ_grid + p_t
+
+    log_kernel = -0.5 * (signal - cond_mean)**2 / cond_var
+    # Shifting by the max leaves the normalised result unchanged
+    weights = np.exp(log_kernel - log_kernel.max())
+    step = θ_grid[1] - θ_grid[0]
+    posterior = f * weights
+    return posterior / (np.sum(posterior) * step)
 ```
 
-### Plotting results
+The simulation loop chains $(s_t, \epsilon_t)$ shocks through these three functions.
 
 ```{code-cell} ipython3
----
-mystnb:
-  figure:
-    caption: posterior learning paths
-    name: fig-rle-posterior-learning
-  image:
-    alt: Posterior mean and posterior variance paths over time
----
-fig, axes = plt.subplots(1, 2, figsize=(13, 5))
-
-t_range = np.arange(1, T + 1)
-
-ax = axes[0]
-for i in range(min(30, N)):
-    ax.plot(t_range, μ_paths[i, :], color='steelblue', alpha=0.2, lw=2)
-
-ax.plot(t_range, np.mean(μ_paths, axis=0), color='navy', lw=2,
-        label='cross-path average')
-ax.axhline(b_true, color='red', ls='--', lw=2, label=f'$b^* = {b_true}$')
-ax.axhline(μ_0, color='gray', ls=':', lw=2, label=f'prior mean $= {μ_0}$')
-ax.set_xlabel('$t$')
-ax.set_ylabel('posterior mean $\\mu_t$')
-ax.legend()
-
-ax = axes[1]
-for i in range(min(30, N)):
-    ax.plot(t_range, v_paths[i, :], color='darkorange', alpha=0.2, lw=2)
-
-ax.plot(t_range, np.mean(v_paths, axis=0), color='saddlebrown', lw=2,
-        label='cross-path average')
-
-ax.plot(t_range, 1.0 / t_range, color='black', ls='--', lw=2,
-        label='$1/t$ (theory)')
-ax.set_xlabel('$t$')
-ax.set_ylabel('posterior variance $v_t$')
-ax.legend()
-
-plt.tight_layout()
-plt.show()
+def simulate(θ_I_true, θ_U, σ2, μ_s, τ2,
+             a, b, n_grid, T, prior=None, seed=42):
+    """
+    Simulate T periods of the Bray-Kreps rational-learning equilibrium.
+
+    `prior` maps θ_grid to unnormalised density values; None means a
+    uniform prior on [a, b].  Returns a dict holding θ_grid, the paths
+    XI_path, p_path, r_path (length T), post_mean and post_var (length
+    T + 1, entry 0 being the prior) and `snapshots` of the density.
+    """
+    rng = np.random.default_rng(seed)
+    θ_grid = np.linspace(a, b, n_grid)
+    dθ = θ_grid[1] - θ_grid[0]
+
+    if prior is None:
+        f = np.ones(n_grid)
+    else:
+        # Float copy: integer output is allowed, caller data is not mutated
+        f = np.array(prior(θ_grid), dtype=float)
+    # One normalisation rule for every prior: the Riemann sum equals one
+    f /= np.sum(f) * dθ
+
+    s_seq   = rng.normal(μ_s, np.sqrt(τ2), T)
+    eps_seq = rng.normal(0.0, np.sqrt(σ2), T)
+
+    XI_path   = np.empty(T)
+    p_path    = np.empty(T)
+    r_path    = np.empty(T)
+    post_mean = np.empty(T + 1)
+    post_var  = np.empty(T + 1)
+    post_mean[0] = np.sum(θ_grid * f) * dθ
+    post_var[0]  = np.sum((θ_grid - post_mean[0])**2 * f) * dθ
+
+    snap_times = {0, 5, 20, 50, 100, T}
+    snapshots = {0: f.copy()}
+
+    for t in range(T):
+        # Prices are generated with the true θ^I; U only sees (p, x^U, r)
+        XI  = equilibrium_XI(f, θ_grid, θ_U, σ2)
+        xU  = 2.0 - XI
+        p_t = s_seq[t] - σ2 * XI / θ_I_true
+        r_t = s_seq[t] + eps_seq[t]
+        f   = bayes_update(f, θ_grid, p_t, xU, r_t, σ2, τ2, μ_s)
+
+        XI_path[t]   = XI
+        p_path[t]    = p_t
+        r_path[t]    = r_t
+        post_mean[t + 1] = np.sum(θ_grid * f) * dθ
+        post_var[t + 1]  = np.sum((θ_grid - post_mean[t + 1])**2 * f) * dθ
+        if (t + 1) in snap_times:
+            snapshots[t + 1] = f.copy()
+
+    return dict(θ_grid=θ_grid, snapshots=snapshots, XI_path=XI_path,
+                p_path=p_path, r_path=r_path,
+                post_mean=post_mean, post_var=post_var)
 ```
 
-The left panel shows that regardless of the (misspecified) prior mean, agent $U$'s posterior mean converges to the true equilibrium value $b^* = 2$.
+## Posterior concentration
 
-The right panel confirms that the posterior variance vanishes at rate $1/t$, consistent with the formula in {eq}`eq:posterior_precision`.
-
-## Demand and equilibrium
-
-To connect the learning story to market equilibrium, we can track how agent $U$'s **equilibrium demand** for the risky asset evolves.
-
-Given $U$'s current beliefs about $b$ (summarized by $\mu_t$), $U$ estimates $r_t \approx p_t / \mu_t$ and formulates demand:
-
-$$
-x^U_t(\mu_t) = \frac{\theta^U}{\sigma^2} \cdot \left(\frac{p_t}{\mu_t} - p_t\right)
-$$
-
-As $\mu_t \to b^*$, this demand function converges to the demand implied by the rational expectations equilibrium.
-
-The following code computes the demand trajectories.
+We run the simulation with a uniform prior on $[0.5, 4]$ and true $\theta^I = 2$.
 
 ```{code-cell} ipython3
-def compute_demand(μ_t, p_t, σ2=1.0, θ_U=0.5):
-    """Agent U's demand x^U = (θ_U / σ2) * (p_t / μ_t - p_t)."""
-    r_hat = p_t / μ_t
-    return (θ_U / σ2) * (r_hat - p_t)
-
-i_rep = 0
-r_rep = np.random.normal(0, np.sqrt(σ2), T)
-p_rep = b_true * r_rep
-
-demand_path = np.array([
-    compute_demand(μ_paths[i_rep, t], p_rep[t])
-    for t in range(T)
-])
-
-demand_ree = np.array([
-    compute_demand(b_true, p_rep[t])
-    for t in range(T)
-])
+params = dict(
+    θ_I_true=2.0,
+    θ_U=1.0,
+    σ2=1.0,
+    μ_s=1.0,
+    τ2=1.0,
+    a=0.5,
+    b=4.0,
+    n_grid=300,
+    T=200,
+    seed=42,
+)
 
+res = simulate(**params)
 ```
 
+The first picture shows snapshots of the posterior density at selected dates.
+
 ```{code-cell} ipython3
 ---
 mystnb:
   figure:
-    caption: demand convergence
-    name: fig-rle-demand-convergence
+    caption: posterior density over $\theta^I$ at selected dates
+    name: fig-rle-posterior-density
   image:
-    alt: Learning demand and rational expectations demand over time
+    alt: Posterior density on theta^I concentrating around the true value
 ---
 fig, ax = plt.subplots(figsize=(10, 5))
-ax.plot(t_range, demand_path, color='steelblue', alpha=0.7,
-        lw=2, label="$x^U_t$ (learning)")
-ax.plot(t_range, demand_ree, color='red', ls='--', lw=2,
-        label="$x^U_t$ (REE)")
-ax.set_xlabel('$t$')
-ax.set_ylabel("agent $U$'s demand $x^U_t$")
+for t, ft in sorted(res['snapshots'].items()):
+    ax.plot(res['θ_grid'], ft, lw=2, label=f't = {t}')
+ax.axvline(params['θ_I_true'], color='black', ls='--', lw=1.5,
+           label=r'$\theta^I_{\rm true}$')
+ax.set_xlabel(r'$\theta^I$')
+ax.set_ylabel('posterior density')
 ax.legend()
 plt.tight_layout()
 plt.show()
 ```
 
-## Two toy extensions
-
-The next two simulations are not in Bray and Kreps.
-
-They are included as small numerical illustrations of themes that appear in the paper: identification and feedback from beliefs to prices.
-
-### 1. Two possible parameters
-
-First suppose the simplified linear model can be generated by one of two possible values of $b^*$.
-
-If the data identify which value is operating, Bayesian learning separates the two cases.
+The posterior tightens around $\theta^I_{\rm true} = 2$ as price and return data accumulate.
 
-The following code illustrates this point with a mixture prior.
-
-```{code-cell} ipython3
-def simulate_two_parameters(b_values, σ2, T, N, seed=0):
-    """Simulate learning when nature draws b* from b_values."""
-    rng = np.random.default_rng(seed)
-    b_true_draw = rng.choice(b_values, size=N)
-
-    μ_paths_all = np.empty((N, T))
-
-    for i in range(N):
-        b_i = b_true_draw[i]
-        r = rng.normal(0, np.sqrt(σ2), T)
-        p = b_i * r
-
-        μ_prior = np.mean(b_values)
-        prec_prior = 1.0 / 4.0
-        w_sum = μ_prior * prec_prior
-        prec = prec_prior
-
-        for t in range(T):
-            prec += r[t]**2 / σ2
-            w_sum += r[t] * p[t] / σ2
-            μ_paths_all[i, t] = w_sum / prec
-
-    return μ_paths_all, b_true_draw
-
-b_values = [1.0, 3.0]
-μ_two, b_drawn = simulate_two_parameters(b_values, σ2=1.0, T=200, N=300)
-```
+The next picture tracks the posterior mean and variance.
 
 ```{code-cell} ipython3
 ---
 mystnb:
   figure:
-    caption: two-parameter learning
-    name: fig-rle-two-parameters
+    caption: posterior mean and variance over time
+    name: fig-rle-posterior-moments
   image:
-    alt: Posterior mean paths converging to two possible parameter values
+    alt: Posterior mean of theta^I converging to the true value and posterior variance vanishing
 ---
-fig, ax = plt.subplots(figsize=(10, 5))
-
-colors = {b_values[0]: 'steelblue', b_values[1]: 'darkorange'}
-for i in range(len(b_drawn)):
-    c = colors[b_drawn[i]]
-    ax.plot(np.arange(1, 201), μ_two[i, :], color=c, alpha=0.1, lw=2)
-
-for bv, c in colors.items():
-    ax.axhline(bv, color=c, ls='--', lw=2, label=f'$b^* = {bv}$')
+fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
+ax = axes[0]
+ax.plot(np.arange(params['T'] + 1), res['post_mean'], lw=2)
+ax.axhline(params['θ_I_true'], color='red', ls='--', lw=2,
+           label=r'$\theta^I_{\rm true}$')
 ax.set_xlabel('$t$')
-ax.set_ylabel('posterior mean $\\mu_t$')
+ax.set_ylabel(r'$E_t[\theta^I]$')
 ax.legend()
+
+ax = axes[1]
+ax.plot(np.arange(params['T'] + 1), res['post_var'], lw=2)
+ax.set_xlabel('$t$')
+ax.set_ylabel(r'${\rm Var}_t[\theta^I]$')
+
 plt.tight_layout()
 plt.show()
 ```
 
-As expected, agent $U$ learns the *correct* equilibrium as long as the model is correctly specified and the true equilibrium generates the data.
-
-The paper's non-identification example is different: with two informed agents, prices can reveal only the sum of their risk tolerances.
-
-### 2. A self-referential price rule
-
-The next toy model lets the price at date $t$ depend directly on agent $U$'s current belief $\mu_t$.
-
-But $\mu_t$ is updated based on past prices.
-
-This creates a **self-referential** system: beliefs drive prices, and prices update beliefs.
-
-This is a deliberately simple stand-in for the paper's warning that learning changes behavior, which changes the data that agents observe.
-
-The formal Bray--Kreps model handles this by making the whole price process part of a grand rational expectations equilibrium over an expanded state space.
-
-```{code-cell} ipython3
-def simulate_self_referential(b_true, σ2, μ_0, v_0, T, N,
-                              α_demand=0.5):
-    """
-    Simulate the self-referential price rule
-    p_t = b_true * r_t + α_demand * (μ_t - b_true) * r_t.
-    """
-    rng = np.random.default_rng(10)
-    r_all = rng.normal(0, np.sqrt(σ2), (N, T))
-
-    μ_paths_sr = np.empty((N, T))
-    p_paths_sr = np.empty((N, T))
+The posterior mean converges to the truth and the posterior variance vanishes.
 
-    for i in range(N):
-        prec = 1.0 / v_0
-        w_sum = μ_0 / v_0
-        μ_t = μ_0
+This is the concrete manifestation of weak convergence of posteriors to a point mass at $\theta^I_{\rm true}$, which we describe in general terms below.
 
-        for t in range(T):
-            r_t = r_all[i, t]
-            p_t = b_true * r_t + α_demand * (μ_t - b_true) * r_t
+## Equilibrium trades and prices
 
-            prec += r_t**2 / σ2
-            w_sum += r_t * p_t / σ2
-            μ_t = w_sum / prec
+The equilibrium informed trade $X^I_t$ depends only on $f_t$, not directly on $s_t$ or $\theta^I$.
 
-            μ_paths_sr[i, t] = μ_t
-            p_paths_sr[i, t] = p_t
-
-    return μ_paths_sr, p_paths_sr
-
-μ_sr, p_sr = simulate_self_referential(
-    b_true, σ2, μ_0, v_0, T=200, N=100, α_demand=0.3
-)
-```
+As $f_t$ tightens around $\theta^I_{\rm true}$, $X^I_t$ approaches the full-information benchmark in {eq}`eq:bk-full-info-trade`.
 
 ```{code-cell} ipython3
 ---
 mystnb:
   figure:
-    caption: self-referential learning
-    name: fig-rle-self-referential
+    caption: equilibrium trade and prices over time
+    name: fig-rle-trade-price
   image:
-    alt: Self-referential posterior means and price paths over time
+    alt: Equilibrium informed trade X^I_t and price p_t over time
 ---
+XI_full = 2 * params['θ_I_true'] / (params['θ_I_true'] + params['θ_U'])
+p_mean_full = params['μ_s'] - params['σ2'] * XI_full / params['θ_I_true']
+
 fig, axes = plt.subplots(1, 2, figsize=(13, 5))
 
 ax = axes[0]
-for i in range(30):
-    ax.plot(np.arange(1, 201), μ_sr[i, :], color='steelblue', alpha=0.2, lw=2)
-ax.plot(np.arange(1, 201), np.mean(μ_sr, axis=0), color='navy', lw=2,
-        label='average $\\mu_t$')
-ax.axhline(b_true, color='red', ls='--', lw=2, label=f'$b^* = {b_true}$')
+ax.plot(res['XI_path'], lw=2, label='$X^I_t$ (learning)')
+ax.axhline(XI_full, color='red', ls='--', lw=2,
+           label='$X^I$ (full info)')
 ax.set_xlabel('$t$')
-ax.set_ylabel('$\\mu_t$')
+ax.set_ylabel('$X^I_t$')
 ax.legend()
 
 ax = axes[1]
-for i in range(30):
-    ax.plot(np.arange(1, 201), p_sr[i, :], color='darkorange', alpha=0.15, lw=2)
-ax.plot(np.arange(1, 201), np.mean(np.abs(p_sr), axis=0), color='saddlebrown', lw=2,
-        label='average $|p_t|$')
+ax.plot(res['p_path'], lw=1.5, alpha=0.7, label='$p_t$ (learning)')
+ax.axhline(p_mean_full, color='red', ls='--', lw=2,
+           label='$E[p_t]$ (full info)')
 ax.set_xlabel('$t$')
 ax.set_ylabel('$p_t$')
 ax.legend()
@@ -583,9 +509,14 @@ plt.tight_layout()
 plt.show()
 ```
 
+The left panel shows $X^I_t$ approaching the full-information allocation as beliefs concentrate.
+
+The right panel shows the price path, which fluctuates because $p_t$ inherits the variation in $s_t$.
+
+
 ## Convergence of posterior assessments
 
-{cite:t}`BrayKreps1987` prove two general convergence results.
+The general theory in {cite:t}`BrayKreps1987` gives two convergence results.
 
 Let $\Omega$ be the underlying state space, and let $H_t^n(p)$ be the information generated for agent $n$ by private information and observed equilibrium prices up to date $t$.
 
@@ -634,7 +565,7 @@ Let $F_t$ be agent $U$'s posterior distribution over $\theta^I$ after observing
 
 By weak convergence of posteriors, $F_t$ converges almost surely to a limiting distribution $F_\infty$.
 
-Bray and Kreps then show why this limiting posterior must be a point mass at the true $\theta^I$ in their example.
+In the benchmark two-agent example, this limiting posterior must be a point mass at the true $\theta^I$.
 
 The argument has three parts.
 
@@ -650,19 +581,19 @@ Third, in this example that limiting price distribution is stochastically decrea
 
 Therefore the long-run distribution of prices identifies the true value of $\theta^I$.
 
-This is the paper's concrete route from convergence of posterior assessments to convergence to the "correct beliefs".
+This is the concrete route from convergence of posterior assessments to convergence to the "correct beliefs".
 
 It relies on smoothness, ergodicity, and identification, rather than on martingale convergence alone.
 
 ## Obstacles to convergence
 
-While the positive convergence results are elegant, {cite:t}`BrayKreps1987` are careful to document when learning *fails* to produce convergence to REE.
+While the positive convergence results are elegant, the same framework also shows when learning can *fail* to produce convergence to REE.
 
 ### Obstacle 1: price maps might not settle down
 
 The step from weak convergence of posteriors to convergence of prices requires smoothness of the equilibrium price functional.
 
-Bray and Kreps stress that this can be hard, because small changes in a price function can produce large changes in the information communicated by prices.
+This can be hard, because small changes in a price function can produce large changes in the information communicated by prices.
 
 Thus martingale convergence of beliefs does not by itself guarantee that the economy settles into a stationary price relation.
 
@@ -670,7 +601,7 @@ Thus martingale convergence of beliefs does not by itself guarantee that the eco
 
 Even if prices settle down, the long-run distribution of prices need not identify every structural parameter.
 
-The paper gives a simple variant with two informed agents whose risk tolerances $\theta^{I1}$ and $\theta^{I2}$ are both unknown to the uninformed agent.
+A simple variant has two informed agents whose risk tolerances $\theta^{I1}$ and $\theta^{I2}$ are both unknown to the uninformed agent.
 
 In that case, prices reveal only the sum $\theta^{I1}+\theta^{I2}$.
 
@@ -680,17 +611,17 @@ For decisions in that example, learning the sum is enough, but it is not learnin
 
 ### Obstacle 3: the truth might be outside the model
 
-Bray and Kreps compare their rational-learning model with an example of {cite:t}`BlumeEasley1982`.
+An example of {cite:t}`BlumeEasley1982` illustrates a related misspecification problem.
 
 In that example, agents can converge to an incorrect model because the true stable price relation has zero prior probability under the models they entertain.
 
-Bray and Kreps argue that this cannot occur in their rational-learning formulation except on a prior-null event.
+In the rational-learning formulation, this kind of failure can occur only on a prior-null event.
 
 The reason is that rational learning puts the possible price relations generated by the expanded state space inside the Bayesian model from the start.
 
 ## Learning *within* versus learning *about* a rational expectations equilibrium
 
-One of the deepest conceptual contributions of {cite:t}`BrayKreps1987` is a distinction they draw in their concluding section between two fundamentally different notions of learning in a rational expectations context.
+One of the deepest conceptual points in {cite:t}`BrayKreps1987` is a distinction between two fundamentally different notions of learning in a rational expectations context.
 
 ### The distinction
 
@@ -698,11 +629,11 @@ One of the deepest conceptual contributions of {cite:t}`BrayKreps1987` is a dist
 
 The phrase refers to Bayesian inference that takes place *inside* a correctly specified model of the economy.
 
-In Bray and Kreps' rational-learning formulation, agents are uncertain about parameters such as other agents' risk tolerances.
+In the rational-learning formulation, agents are uncertain about parameters such as other agents' risk tolerances.
 
 But for every possible parameter realization, they are assumed to know the equilibrium price and allocation maps.
 
-Their Bayesian learning model is therefore a large rational expectations equilibrium over an expanded state space.
+The Bayesian learning model is therefore a large rational expectations equilibrium over an expanded state space.
 
 This is why the martingale convergence theorem can be applied so cleanly.
 
@@ -712,21 +643,21 @@ Here agents do not begin with the equilibrium map already embedded in their mode
 
 Instead, they try to infer the price-state relation from data generated while beliefs and behavior are changing.
 
-This is the original problem mentioned at the start of the paper: learning changes behavior, and behavior changes the price-state relation being learned.
+This is the original problem that motivated the analysis: learning changes behavior, and behavior changes the price-state relation being learned.
 
 ### Why rational learning has limited value
 
-Bray and Kreps call the expanded-state-space formulation natural but also identify its main flaw.
+The expanded-state-space formulation is natural, but it has one major flaw.
 
 It avoids the question of how agents learn the relation between prices and states by assuming that agents already know the equilibrium for every possible economy in the state space.
 
-In their conclusion, they say that their results do not satisfactorily answer the question "How does a rational expectations equilibrium come about?"
+It does not satisfactorily answer the question "How does a rational expectations equilibrium come about?"
 
 The reason is not that Bayesian convergence is false.
 
 The reason is that the Bayesian agents must have extraordinary insight into the structure of the economy and the implied probabilities of events.
 
-This is why the paper is useful both as a benchmark and as a warning.
+This is why the framework is useful both as a benchmark and as a warning.
 
 It gives sharp restrictions on what rational learning can imply, but it does not provide a plausible behavioral story for attaining rational expectations.
 
@@ -742,7 +673,7 @@ In those models, agents estimate perceived laws of motion from observed data and
 
 Such rules are computationally tractable and can converge in important examples.
 
-But they are *"irrational"* in Bray and Kreps' specific sense.
+But they are *"irrational"* in the specific sense used here.
 
 An agent who already understood the full equilibrium model would not generally use those rules as the Bayesian optimum.
 
@@ -750,27 +681,27 @@ The attraction of these rules is precisely that they ask a different question.
 
 They ask whether agents using standard statistical procedures on the data generated by the model could eventually learn to form rational expectations.
 
-Bray and Kreps are skeptical that rational Bayesian learning is behaviorally plausible, but they also use it to discipline adaptive learning stories.
+Rational Bayesian learning is demanding as a behavioral assumption, but it also disciplines adaptive learning stories.
 
-Their proposed discipline is that a stationary limiting equilibrium should not leave agents' beliefs systematically contradicted by observations.
+The proposed discipline is that a stationary limiting equilibrium should not leave agents' beliefs systematically contradicted by observations.
 
-In the long run, they argue, equilibrium expectations must either keep changing or become rational.
+In the long run, equilibrium expectations must either keep changing or become rational.
 
 There is a fundamental tension at the heart of learning about rational expectations equilibria:
 
 * A fully rational (Bayesian, correctly specified) learner can only apply Bayes' rule to a model whose structure is *already known*, but the structure of the REE is exactly what the agent is trying to learn.
 * A learner who uses an adaptive algorithm (OLS, least-mean-squares, etc.) can potentially converge to the REE, but only by using a rule that cannot be derived from Bayesian rationality applied to a correctly specified model.
 
-The Bray–Kreps rational-learning formulation avoids this tension by assumption: agent $U$ knows how each possible risk tolerance would map histories into equilibrium prices and trades.
+The rational-learning formulation avoids this tension by assumption: agent $U$ knows how each possible risk tolerance would map histories into equilibrium prices and trades.
 
-The simplified Gaussian code example avoids it even more directly by replacing the equilibrium calculation with a fixed linear observation equation.
+The full equilibrium simulation above embeds exactly that knowledge, since `equilibrium_XI` is recomputed from $f_t$ at every date.
 
-Both devices make Bayesian consistency transparent, but both sidestep the deeper difficulty of learning *about* an REE from scratch.
+The device makes Bayesian consistency transparent, but it still sidesteps the deeper difficulty of learning *about* an REE from scratch.
 
 
 ## Summary
 
-This lecture has discussed ideas from {cite:t}`BrayKreps1987`:
+This lecture has discussed rational learning in the sense of {cite:t}`BrayKreps1987`:
 
 1. **Rational learning** is modeled by expanding the state space to include unknown structural parameters such as risk tolerances.
 
@@ -780,15 +711,15 @@ This lecture has discussed ideas from {cite:t}`BrayKreps1987`:
 
 4. **Correct learning** requires more than martingale convergence, because the limiting price distribution must identify the true parameter.
 
-5. **In the paper's two-agent example**, the uninformed agent learns the informed agent's risk tolerance because the limiting price distribution is monotone in that parameter.
+5. **In the two-agent example**, the uninformed agent learns the informed agent's risk tolerance because the limiting price distribution is monotone in that parameter.
 
 6. **Identification can fail** when prices reveal only a composite parameter, such as the sum of two informed agents' risk tolerances.
 
 7. **Misspecification matters** because a stable price relation outside the learner's prior support cannot be learned by Bayes' rule.
 
-8. **The simplified Gaussian simulation** illustrates posterior concentration in a fixed correctly specified model, not the full Bray--Kreps equilibrium calculation.
+8. **The full simulation** above solves the within-period equilibrium from the posterior at every date and shows the posterior on $\theta^I$ collapsing to a point mass at $\theta^I_{\rm true}$.
 
-The broader message of Bray and Kreps is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle and the conditions under which learning succeeds are more restrictive than they might appear.
+The broader message is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle and the conditions under which learning succeeds are more restrictive than they might appear.
 
 
 ## Exercises
@@ -796,100 +727,71 @@ The broader message of Bray and Kreps is that while the mathematics of Bayesian
 ```{exercise}
 :label: rle_ex1
 
-**Posterior Precision Growth**
+**Off-centre prior**
 
-In the Bayesian learning model above, the posterior precision is
+The baseline simulation uses a uniform prior on $\theta^I \in [0.5, 4]$.
 
-$$
-v_t^{-1} = v_0^{-1} + \frac{1}{\sigma^2} \sum_{s=1}^{t} r_s^2
-$$
+(a) Re-run the simulation with a prior whose mass sits *above* the true value, for example
+
+~~~python
+prior = lambda θ: (θ - 0.5)**3 * (4 - θ)
+~~~
 
-(a) Show that $v_t \to 0$ almost surely as $t \to \infty$, using the law of large numbers.
+which peaks near $\theta = 3.1$.
 
-(b) What is the approximate rate of decay of $v_t$? That is, what does $t \cdot v_t$ converge to?
+(b) Plot the posterior mean over time alongside the uniform-prior baseline.
 
-(c) Write Python code to verify your answer for $\sigma^2 = 1$ and a single simulated path of $T = 500$ periods.
+(c) Does the posterior eventually concentrate on $\theta^I_{\rm true}$, and how does the speed compare?
 ```
 
 ```{solution-start} rle_ex1
 :class: dropdown
 ```
 
-**(a)** By the strong law of large numbers, since $r_s \sim \mathcal{N}(0, \sigma^2)$ IID with $E[r_s^2] = \sigma^2$:
-
-$$
-\frac{1}{t} \sum_{s=1}^t r_s^2 \xrightarrow{a.s.} \sigma^2 > 0
-$$
-
-Therefore
-
-$$
-\frac{1}{t} v_t^{-1} = \frac{v_0^{-1}}{t} + \frac{1}{\sigma^2} \cdot \frac{1}{t} \sum_{s=1}^t r_s^2 \xrightarrow{a.s.} \sigma^2 / \sigma^2 = 1
-$$
-
-So $v_t^{-1} \sim t$ and $v_t \to 0$ almost surely.
-
-**(b)** From the above, $t \cdot v_t^{-1} \to 1$ implies $t \cdot v_t \to 1 / 1 = 1 / \sigma^2 \cdot \sigma^2 = 1$ when $\sigma^2 = 1$.
-
-More precisely, $t \cdot v_t \to \sigma^2 / \sigma^2 = 1$ (since $v_t \approx \sigma^2 / (t \sigma^2) = 1/t$ for large $t$ when $\sigma^2 = 1$).
-
-So $t \cdot v_t \to 1$ (when $\sigma^2 = 1$).
-
-**(c)**
-
 ```{code-cell} ipython3
-σ2_ex = 1.0
-T_ex = 500
-v0_ex = 2.0
-
-np.random.seed(7)
-r_ex = np.random.normal(0, np.sqrt(σ2_ex), T_ex)
-
-precisions = np.empty(T_ex)
-prec = 1.0 / v0_ex
-for t in range(T_ex):
-    prec += r_ex[t]**2 / σ2_ex
-    precisions[t] = prec
-
-v_t_ex = 1.0 / precisions
-
-fig, axes = plt.subplots(1, 2, figsize=(12, 4))
-
-axes[0].plot(np.arange(1, T_ex + 1), v_t_ex, lw=2, label='$v_t$')
-axes[0].plot(np.arange(1, T_ex + 1), 1.0 / np.arange(1, T_ex + 1),
-             '--', lw=2, label='$1/t$')
-axes[0].set_xlabel('$t$')
-axes[0].set_ylabel('$v_t$')
-axes[0].set_title('Posterior Variance Decay')
-axes[0].legend()
-
-axes[1].plot(np.arange(1, T_ex + 1),
-             np.arange(1, T_ex + 1) * v_t_ex, lw=2, label='$t \\cdot v_t$')
-axes[1].axhline(1.0, color='red', ls='--', lw=2, label='limit = 1')
-axes[1].set_xlabel('$t$')
-axes[1].set_ylabel('$t \\cdot v_t$')
-axes[1].set_title('Normalized Variance Converges to 1')
-axes[1].legend()
+res_uniform = simulate(**params)
 
+params_biased = dict(params)
+params_biased['prior'] = lambda θ: (θ - 0.5)**3 * (4 - θ)
+res_biased = simulate(**params_biased)
+
+fig, ax = plt.subplots(figsize=(10, 5))
+ax.plot(res_uniform['post_mean'], lw=2, label='uniform prior')
+ax.plot(res_biased['post_mean'], lw=2, label='off-centre prior')
+ax.axhline(params['θ_I_true'], color='black', ls='--',
+           label=r'$\theta^I_{\rm true}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel(r'$E_t[\theta^I]$')
+ax.legend()
 plt.tight_layout()
 plt.show()
 ```
 
+The off-centre prior starts the posterior mean well above $\theta^I_{\rm true} = 2$, but Bayesian updating drives it down to the truth.
+
+This is the rational-learning convergence result in action: starting from any prior that puts positive density on $\theta^I_{\rm true}$, the posterior eventually concentrates around it.
+
 ```{solution-end}
 ```
 
 ```{exercise}
 :label: rle_ex2
 
-**Effect of Prior Misspecification**
+**Speed of learning across $\theta^I$**
 
-Suppose agent $U$ starts with a prior mean $\mu_0$ far from the true value $b^* = 2$.
+Information from one period about $\theta^I$ comes through the implied signal
 
-(a) Simulate 100 paths of $T = 400$ periods for each of $\mu_0 \in \{-3, 0, 1, 3, 5\}$ and plot the average posterior mean across paths for each $\mu_0$.
+$$
+s_t(\theta) = \frac{\sigma^2 X^I_t}{\theta} + p_t.
+$$
+
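+Evaluated at $\theta = \theta^I_{\rm true}$, the sensitivity $|\partial s_t/\partial \theta| = \sigma^2 X^I_t/\theta^2$ depends on $\theta^I_{\rm true}$ directly through $\theta^{-2}$ and indirectly through $X^I_t$, which responds to $\theta^I_{\rm true}$ only via the posterior $f_t$.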
 
-(b) Does the prior mean affect the *rate* at which the posterior mean converges to $b^*$?
+(a) Run the simulation for $\theta^I_{\rm true} \in \{0.8, 2.0, 3.5\}$, holding everything else at the baseline.
 
-(c) Does the prior *variance* $v_0$ affect the rate? Verify by comparing $v_0 \in \{0.1, 1.0, 10.0\}$ with fixed $\mu_0 = 0$.
+(b) Plot the posterior variance on a log scale for each case.
+
+(c) Which value of $\theta^I_{\rm true}$ yields the fastest concentration, and does the result match the sensitivity formula above?
 ```
 
 ```{solution-start} rle_ex2
@@ -897,168 +799,69 @@ Suppose agent $U$ starts with a prior mean $\mu_0$ far from the true value $b^*
 ```
 
 ```{code-cell} ipython3
-b_true_ex = 2.0
-σ2_ex = 1.0
-T_ex = 400
-N_ex = 100
-t_range_ex = np.arange(1, T_ex + 1)
-
-fig, axes = plt.subplots(1, 2, figsize=(14, 5))
-
-ax = axes[0]
-for μ0 in [-3, 0, 1, 3, 5]:
-    μ_p, _ = simulate_bayesian_learning(
-        b_true_ex, σ2_ex, μ0, v_0=1.0, T=T_ex, N=N_ex
-    )
-    ax.plot(t_range_ex, np.mean(μ_p, axis=0), lw=2,
-            label=f'$\\mu_0 = {μ0}$')
-
-ax.axhline(b_true_ex, color='black', ls='--', lw=2,
-           label=f'$b^* = {b_true_ex}$')
-ax.set_xlabel('$t$')
-ax.set_ylabel('$E[\\mu_t]$')
-ax.set_title('Effect of Prior Mean on Convergence')
-ax.legend(fontsize=8)
-
-ax = axes[1]
-for v0 in [0.1, 1.0, 10.0]:
-    μ_p, _ = simulate_bayesian_learning(
-        b_true_ex, σ2_ex, μ_0=0.0, v_0=v0, T=T_ex, N=N_ex
-    )
-    ax.plot(t_range_ex, np.mean(μ_p, axis=0), lw=2,
-            label=f'$v_0 = {v0}$')
-
-ax.axhline(b_true_ex, color='black', ls='--', lw=2,
-           label=f'$b^* = {b_true_ex}$')
+fig, ax = plt.subplots(figsize=(10, 5))
+for θ_val in [0.8, 2.0, 3.5]:
+    params_θ = dict(params)
+    params_θ['θ_I_true'] = θ_val
+    res_θ = simulate(**params_θ)
+    ax.semilogy(res_θ['post_var'], lw=2,
+                label=fr'$\theta^I_{{\rm true}} = {θ_val}$')
 ax.set_xlabel('$t$')
-ax.set_ylabel('$E[\\mu_t]$')
-ax.set_title('Effect of Prior Variance on Convergence')
+ax.set_ylabel(r'${\rm Var}_t[\theta^I]$ (log scale)')
 ax.legend()
-
 plt.tight_layout()
 plt.show()
-
-print("Observations:")
-print("(b) Prior mean affects the initial level but not the long-run rate.")
-print("    All paths converge to b* = 2 at the same asymptotic rate.")
-print("(c) A tighter prior (small v_0) slows initial adaptation but all")
-print("    converge; a diffuse prior adapts quickly early on.")
 ```
 
+The smallest $\theta^I_{\rm true}$ produces the steepest decline in posterior variance.
+
+The reason is that the sensitivity $\sigma^2 X^I_t/\theta^2$ scales as $\theta^{-2}$ for fixed $X^I_t$, so the same noise level conveys much more information about $\theta^I$ when $\theta^I$ is small.
+
+The asymmetry is a feature of the geometry of the equilibrium map, not of the learning rule itself.
+
 ```{solution-end}
 ```
 
 ```{exercise}
 :label: rle_ex3
 
-**Convergence with Non-Standard Fundamentals**
+**Effect of return noise**
+
+Larger $\sigma^2$ widens the conditional density of $s_t$ given $r_t$, which one might guess slows learning.
 
-The convergence proof relies on $E[r_t^2] = \sigma^2 > 0$.
+But $\sigma^2$ also scales the price intercept in {eq}`eq:bk-price`, so price dispersion across candidate $\theta$ grows with $\sigma^2$.
 
-(a) Suppose $r_t$ follows a **mixture distribution**: with probability $0.5$ it equals $0$, and with probability $0.5$ it is drawn from $\mathcal{N}(0, 2\sigma^2)$.
-Show that $E[r_t^2] = \sigma^2 > 0$ still holds, so convergence is guaranteed.
+(a) Run the simulation with $\sigma^2 \in \{0.25, 1.0, 4.0\}$, keeping $\tau^2 = 1$ fixed.
 
-(b) Simulate $T = 500$ periods with $\sigma^2 = 1$ and $b^* = 2$ using this mixture distribution for $r_t$.
-Plot the posterior mean and variance over time for 50 paths.
+(b) Plot the posterior variance on a log scale for each $\sigma^2$.
 
-(c) Compare the speed of convergence to the Gaussian case.
-Why does the mixture distribution slow convergence even though $E[r_t^2]$ is the same?
+(c) Which effect dominates? Explain in terms of the signal-to-noise ratio for inferring $\theta^I$ from the price.
 ```
 
 ```{solution-start} rle_ex3
 :class: dropdown
 ```
 
-**(a)** Let $Z \sim \mathcal{N}(0, 2\sigma^2)$.
-Then
-
-$$
-E[r_t^2] = 0.5 \cdot 0^2 + 0.5 \cdot E[Z^2] = 0.5 \cdot 2\sigma^2 = \sigma^2
-$$
-
-So $E[r_t^2] = \sigma^2 > 0$ and the strong law of large numbers guarantees $\sum_{s=1}^t r_s^2 / t \to \sigma^2$, ensuring convergence.
-
-**(b) and (c)**
-
 ```{code-cell} ipython3
-def simulate_learning_mixture(b_true, σ2, μ_0, v_0, T, N):
-    """
-    Bayesian learning with mixture fundamentals:
-    r_t = 0 with prob 0.5, else N(0, 2*σ2) with prob 0.5.
-    """
-    rng = np.random.default_rng(42)
-
-    μ_paths = np.empty((N, T))
-    v_paths = np.empty((N, T))
-
-    for i in range(N):
-        prec = 1.0 / v_0
-        w_sum = μ_0 / v_0
-
-        for t in range(T):
-            if rng.random() < 0.5:
-                r_t = 0.0
-            else:
-                r_t = rng.normal(0, np.sqrt(2 * σ2))
-
-            p_t = b_true * r_t
-
-            prec += r_t**2 / σ2
-            w_sum += r_t * p_t / σ2
-
-            v_t = 1.0 / prec
-            μ_t = v_t * w_sum
-
-            μ_paths[i, t] = μ_t
-            v_paths[i, t] = v_t
-
-    return μ_paths, v_paths
-
-σ2_ex = 1.0
-T_ex = 500
-N_ex = 50
-
-μ_gauss, v_gauss = simulate_bayesian_learning(
-    b_true=2.0, σ2=σ2_ex, μ_0=0.5, v_0=2.0, T=T_ex, N=N_ex
-)
-
-μ_mix, v_mix = simulate_learning_mixture(
-    b_true=2.0, σ2=σ2_ex, μ_0=0.5, v_0=2.0, T=T_ex, N=N_ex
-)
-
-t_range_ex = np.arange(1, T_ex + 1)
-
-fig, axes = plt.subplots(1, 2, figsize=(14, 5))
-
-ax = axes[0]
-ax.plot(t_range_ex, np.mean(μ_gauss, axis=0), label='Gaussian $r_t$',
-        color='steelblue', lw=2)
-ax.plot(t_range_ex, np.mean(μ_mix, axis=0), label='Mixture $r_t$',
-        color='darkorange', lw=2)
-ax.axhline(2.0, color='red', ls='--', lw=2, label='$b^* = 2$')
-ax.set_xlabel('$t$')
-ax.set_ylabel('$E[\\mu_t]$')
-ax.set_title('Posterior Mean: Gaussian vs Mixture')
-ax.legend()
-
-ax = axes[1]
-ax.plot(t_range_ex, np.mean(v_gauss, axis=0), label='Gaussian $r_t$',
-        color='steelblue', lw=2)
-ax.plot(t_range_ex, np.mean(v_mix, axis=0), label='Mixture $r_t$',
-        color='darkorange', lw=2)
+fig, ax = plt.subplots(figsize=(10, 5))
+for σ2_val in [0.25, 1.0, 4.0]:
+    params_σ = dict(params)
+    params_σ['σ2'] = σ2_val
+    res_σ = simulate(**params_σ)
+    ax.semilogy(res_σ['post_var'], lw=2,
+                label=fr'$\sigma^2 = {σ2_val}$')
 ax.set_xlabel('$t$')
-ax.set_ylabel('$E[v_t]$')
-ax.set_title('Posterior Variance: Gaussian vs Mixture')
+ax.set_ylabel(r'${\rm Var}_t[\theta^I]$ (log scale)')
 ax.legend()
-
 plt.tight_layout()
 plt.show()
-
-print("The mixture distribution slows convergence because periods with r_t = 0")
-print("provide NO information about b* (the observation p_t = 0 is uninformative).")
-print("Even though E[r_t^2] = sigma^2, the variance of r_t^2 is larger under the")
-print("mixture, leading to noisier information accumulation.")
 ```
 
+The posterior variance falls *faster* for larger $\sigma^2$.
+
+The reason is visible in the price equation $p_t = s_t - \sigma^2 X^I_t/\theta^I$: the price gap between two candidate $\theta$ values grows linearly with $\sigma^2$, while the conditional variance of the implied signal $g(s\mid r)$ is bounded above by $\tau^2$.
+
+The Grossman-Stiglitz-style trade thus becomes more revealing about $\theta^I$ as the return shock $\epsilon_t$ becomes more volatile, even though each return is individually noisier.
+
 ```{solution-end}
 ```

From c293ebe2d932eb655a9a92a582e2a7f535eee743 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Tue, 2 Jun 2026 13:03:14 +1000
Subject: [PATCH 20/25] updates

---
 lectures/rational_learning_re.md | 440 ++++++++++++++++++++-----------
 1 file changed, 284 insertions(+), 156 deletions(-)

diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index b96a45dbd..b3a556d71 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -28,7 +28,9 @@ kernelspec:
 
 ## Overview
 
-This lecture explores a classic question in economic theory: can agents *learn* their way to a rational expectations equilibrium?
+This lecture explores an important question in economic theory: can agents *learn* their way to a rational expectations equilibrium?
+
+If they can, then the rational expectations equilibrium can be justified as a dynamic attractor for learning processes.
 
 The starting point is {cite:t}`BrayKreps1987`, which gives a rigorous model of Bayesian learning inside a rational expectations equilibrium.
 
@@ -40,14 +42,10 @@ But this raises a question: where does that knowledge come from?
 
 The **rational learning** approach asks whether agents who start with uncertainty about the equilibrium price function can, over time, learn it from observations of past prices.
 
-The key findings are:
-
-* In every rational learning model, posterior assessments converge because they are bounded martingales.
-* In the benchmark example, the uninformed agent learns the informed agent's risk tolerance.
-* Correct learning requires identification, smooth equilibrium price maps, and positive prior probability for the true model.
-
-This lecture presents the framework, explains the benchmark example, and provides Python code that solves the full equilibrium with rational learning.
+This lecture develops that idea through an asset-market model.
 
+The aim is to see what rational learning can explain, and where its limits
+appear, before turning to the computational illustration.
 
 The discussion also connects to earlier work by {cite:t}`Bray1982`, {cite:t}`BraySavin1984`, and the rational expectations literature of {cite:t}`Radner1979`, {cite:t}`grossman1976`, and {cite:t}`Jordan1982`.
 
@@ -56,13 +54,17 @@ Let's start with the following imports
 ```{code-cell} ipython3
 import numpy as np
 import matplotlib.pyplot as plt
+from scipy.optimize import brentq
 ```
 
 ## The economy
 
-### Agents and assets
+Let's start with a simple asset-market model that captures the key features of rational learning.
+
+The example is an infinitely repeated version of the information model in {cite:t}`GrossmanStiglitz1980`.
+
 
-The benchmark example is an infinitely repeated version of the information model in {cite:t}`GrossmanStiglitz1980`.
+### Agents and assets
 
 Each date is economically disconnected from the others, so agents start each period afresh.
 
@@ -138,9 +140,15 @@ Thus if $\sum_n \theta^n$ is known, the price fully reveals $s_t$.
 
 Following {cite:t}`Radner1979`, this is called a full communication rational expectations equilibrium.
 
-Suppose that $\theta^I$ is unknown to agent $U$.
+Suppose now that $\theta^I$ is unknown to agent $U$.
+
+Following {cite:t}`BrayKreps1987`, we treat this uncertainty by *expanding the state space*: we let the unknown parameter $\theta^I$ become a coordinate of the state, alongside the per-period shocks $(s_t, \epsilon_t)$.
+
+Formally, the state space is $\Omega = \Theta \times \Phi^\infty$, where $\Theta = [a,b]$ supports the unknown $\theta^I$ and $\Phi$ supports each $(s_t, \epsilon_t)$.
+
+Agent $U$ knows $\sigma^2$ and $\theta^U$, and starts with a prior density over $\theta^I$ on $[a,b]$.
 
-Agent $U$ knows $\sigma^2$ and $\theta^U$, and starts with a prior density over $\theta^I$ on an interval $[a,b] \subset (0,\infty)$.
+This expansion is what turns the learning problem into Bayesian inference inside a single rational expectations equilibrium on $\Omega$.
 
 At a date when agent $U$ has posterior density $f$ over $\theta^I$, his own trade reveals $x^I_t=2-x^U_t$ through market clearing.
 
@@ -158,7 +166,7 @@ After trading, agent $U$ observes $r_t$.
 
 Bayes' rule then updates the posterior over $\theta^I$ using the normal density of the signal implied by {eq}`eq:bk-signal-implied` conditional on the realized return.
 
-This is the main object learned in the benchmark example.
+This is the main object learned in the two-agent example.
 
 Even in this simple case, the equilibrium can be defined recursively but closed-form prices are unavailable.
 
@@ -180,16 +188,54 @@ From {eq}`eq:bk-signal-implied`, conditional on $\theta^I$, agent $U$ infers $s_
 
 Marginalising over $\theta^I \sim f_t$ and combining with $r_t = s_t + \epsilon_t$ where $\epsilon_t \sim \mathcal N(0,\sigma^2)$ gives the implied conditional distribution of $r_t$.
 
+Equivalently, conditional on a candidate value $\theta$, the excess payoff on
+one unit of the risky asset is
+
+$$
+r_t - p
+=
+\frac{\sigma^2 X^I}{\theta} + \epsilon_t.
+$$
+
 Because CARA preferences have no wealth effects, agent $U$'s problem reduces to
 
 $$
 \max_{x^U}\,
-E\!\left[-\exp\!\left(-\tfrac{x^U(r_t - p)}{\theta^U}\right)\right],
+E[u^U(x^U, r_t, p)],
+\qquad
+u^U(x^U, r_t, p)
+=
+-\exp\!\left(-\frac{x^U(r_t-p)}{\theta^U}\right),
 $$
 
 where the expectation integrates over $\theta^I \sim f_t$ and $\epsilon_t$.
 
-Integrating out $\epsilon_t$ first and then $\theta^I$ yields
+To derive the expected utility formula, substitute the conditional excess
+payoff above:
+
+$$
+E[u^U]
+=
+-\int_a^b f_t(\theta)
+E_\epsilon
+\left[
+\exp\!\left(
+-\frac{x^U}{\theta^U}
+\left(\frac{\sigma^2 X^I}{\theta}+\epsilon_t\right)
+\right)
+\right]
+d\theta.
+$$
+
+Using the normal moment-generating formula
+
+$$
+E_\epsilon\!\left[\exp(c\epsilon_t)\right]
+=
+\exp\!\left(\frac{c^2\sigma^2}{2}\right)
+$$
+
+with $c=-x^U/\theta^U$, this becomes
 
 $$
 E[u^U]
@@ -200,7 +246,28 @@ E[u^U]
 d\theta.
 $$
 
-The first-order condition rearranges to
+To get the first-order condition, define
+
+$$
+I(x^U)
+=
+\int_a^b f_t(\theta)\,
+\exp\!\left(-\frac{x^U \sigma^2 X^I}{\theta\,\theta^U}\right)
+d\theta.
+$$
+
+Hence the first-order condition is
+
+$$
+\frac{d}{dx^U}
+\left[
+\frac{(x^U)^2\sigma^2}{2(\theta^U)^2}
++ \log I(x^U)
+\right]
+=0.
+$$
+
+Rearranging gives
 
 $$
 \frac{x^U}{\theta^U}
@@ -234,7 +301,7 @@ X^I_t = \frac{2\theta^I}{\theta^I + \theta^U},
 x^U_t = \frac{2\theta^U}{\theta^I + \theta^U}.
 $$ (eq:bk-full-info-trade)
 
-This is the CARA-Normal benchmark we will use to check the simulation.
+This is the full-communication allocation we will use to check the simulation.
 
 ### Bayesian update
 
@@ -274,10 +341,6 @@ There are three computational primitives.
 * `equilibrium_XI` solves market clearing $X^I + x^U(X^I; f) = 2$ for $X^I_t$.
 * `bayes_update` applies {eq}`eq:bk-bayes` and renormalises.
 
-```{code-cell} ipython3
-from scipy.optimize import brentq
-```
-
 ```{code-cell} ipython3
 def uninformed_demand(XI, f, θ_grid, θ_U, σ2):
     """
@@ -285,13 +348,13 @@ def uninformed_demand(XI, f, θ_grid, θ_U, σ2):
     a conjectured informed trade XI and posterior density f.
     """
     with np.errstate(divide='ignore'):
-        log_f = np.log(f)            # -inf where f == 0 is fine
+        log_f = np.log(f)
 
     def foc(xU):
         z = xU * σ2 * XI / (θ_grid * θ_U)
         log_w = log_f - z
         M = log_w.max()
-        w = np.exp(log_w - M)        # bounded in [0, 1], max value = 1
+        w = np.exp(log_w - M)
         num = np.sum(w / θ_grid)
         den = np.sum(w)
         return xU / θ_U - XI * num / den
@@ -318,11 +381,11 @@ def bayes_update(f, θ_grid, p_t, xU_t, r_t, σ2, τ2, μ_s):
     """
     XI = 2.0 - xU_t
     s_mean = (σ2 * μ_s + τ2 * r_t) / (σ2 + τ2)
-    s_var  = σ2 * τ2 / (σ2 + τ2)
+    s_var = σ2 * τ2 / (σ2 + τ2)
     s_implied = σ2 * XI / θ_grid + p_t
 
     log_like = -0.5 * (s_implied - s_mean)**2 / s_var
-    log_like -= log_like.max()           # log-shift for stability
+    log_like -= log_like.max()  # log-shift for stability
     f_new = f * np.exp(log_like)
     dθ = θ_grid[1] - θ_grid[0]
     f_new /= np.sum(f_new) * dθ
@@ -347,32 +410,32 @@ def simulate(θ_I_true, θ_U, σ2, μ_s, τ2,
         f = prior(θ_grid)
         f /= np.sum(f) * dθ
 
-    s_seq   = rng.normal(μ_s, np.sqrt(τ2), T)
+    s_seq = rng.normal(μ_s, np.sqrt(τ2), T)
     eps_seq = rng.normal(0.0, np.sqrt(σ2), T)
 
-    XI_path   = np.empty(T)
-    p_path    = np.empty(T)
-    r_path    = np.empty(T)
+    XI_path = np.empty(T)
+    p_path = np.empty(T)
+    r_path = np.empty(T)
     post_mean = np.empty(T + 1)
-    post_var  = np.empty(T + 1)
+    post_var = np.empty(T + 1)
     post_mean[0] = np.sum(θ_grid * f) * dθ
-    post_var[0]  = np.sum((θ_grid - post_mean[0])**2 * f) * dθ
+    post_var[0] = np.sum((θ_grid - post_mean[0])**2 * f) * dθ
 
     snap_times = {0, 5, 20, 50, 100, T}
     snapshots = {0: f.copy()}
 
     for t in range(T):
-        XI  = equilibrium_XI(f, θ_grid, θ_U, σ2)
-        xU  = 2.0 - XI
+        XI = equilibrium_XI(f, θ_grid, θ_U, σ2)
+        xU = 2.0 - XI
         p_t = s_seq[t] - σ2 * XI / θ_I_true
         r_t = s_seq[t] + eps_seq[t]
-        f   = bayes_update(f, θ_grid, p_t, xU, r_t, σ2, τ2, μ_s)
+        f = bayes_update(f, θ_grid, p_t, xU, r_t, σ2, τ2, μ_s)
 
-        XI_path[t]   = XI
-        p_path[t]    = p_t
-        r_path[t]    = r_t
+        XI_path[t] = XI
+        p_path[t] = p_t
+        r_path[t] = r_t
         post_mean[t + 1] = np.sum(θ_grid * f) * dθ
-        post_var[t + 1]  = np.sum(
+        post_var[t + 1] = np.sum(
             (θ_grid - post_mean[t + 1])**2 * f
         ) * dθ
         if (t + 1) in snap_times:
@@ -473,7 +536,7 @@ This is the concrete manifestation of weak convergence of posteriors to a point
 
 The equilibrium informed trade $X^I_t$ depends only on $f_t$, not directly on $s_t$ or $\theta^I$.
 
-As $f_t$ tightens around $\theta^I_{\rm true}$, $X^I_t$ approaches the full-information benchmark in {eq}`eq:bk-full-info-trade`.
+As $f_t$ tightens around $\theta^I_{\rm true}$, $X^I_t$ approaches the full-communication allocation in {eq}`eq:bk-full-info-trade`.
 
 ```{code-cell} ipython3
 ---
@@ -513,236 +576,303 @@ The left panel shows $X^I_t$ approaching the full-information allocation as beli
 
 The right panel shows the price path, which fluctuates because $p_t$ inherits the variation in $s_t$.
 
+The simulation suggests three empirical facts about this equilibrium.
+
+The posterior density on $\theta^I$ concentrates around the true value, the posterior variance vanishes, and the equilibrium informed trade $X^I_t$ converges to its full-information benchmark.
+
+The next sections ask what general theorems guarantee these outcomes and which assumptions they rely on.
+
+The plan is to first state the two convergence theorems of {cite:t}`BrayKreps1987` for the abstract rational-learning model, then specialise to the two-agent example to identify the hypotheses that imply concentration on the true $\theta^I$, and finally explain when those hypotheses can fail.
 
 ## Convergence of posterior assessments
 
-The general theory in {cite:t}`BrayKreps1987` gives two convergence results.
+Let $(\Omega, \mathcal F)$ be a measurable space carrying the equilibrium.
+
+In the two-agent example, $\Omega = \Theta \times \Phi^{\infty}$, where $\Theta = [a,b]$ is the set of possible values of the unknown parameter $\theta^I$ (the other structural parameter $\theta^U$ is known), $\Phi$ collects the per-period shocks $(s_t, \epsilon_t)$, and $\mathcal F$ is the product Borel $\sigma$-field.
+
+Agent $n$ enters date $0$ with a prior probability measure $P^n$ on $(\Omega, \mathcal F)$.
+
+Let $G_t^n \subseteq \mathcal F$ denote the $\sigma$-field generated by agent $n$'s private information through date $t$, and let
+
+$$
+H_t^n(p)
+=
+G_t^n \vee \sigma(p_0, p_1, \dots, p_t)
+$$
+
+be the $\sigma$-field that adds observation of equilibrium prices through date $t$.
 
-Let $\Omega$ be the underlying state space, and let $H_t^n(p)$ be the information generated for agent $n$ by private information and observed equilibrium prices up to date $t$.
+The limiting $\sigma$-field, which collects everything agent $n$ ever observes, is
 
-For any event $A$, the posterior assessment $P^n(A \mid H_t^n(p))$ is a bounded martingale in $t$.
+$$
+H_\infty^n(p)
+=
+\bigvee_{t \ge 0} H_t^n(p).
+$$
 
-The first convergence result is therefore an application of the martingale convergence theorem.
+The first result, due to {cite:t}`BrayKreps1987`, states that the conditional probability of any event converges almost surely.
 
 ```{prf:proposition}
 :label: prop-bk-event-convergence
 
-For any event $A$,
+Fix an agent $n$ and an event $A \in \mathcal F$.
+
+The process $M_t = E^n[\mathbf 1_A \mid H_t^n(p)]$ is a bounded martingale under $P^n$ with respect to the filtration $(H_t^n(p))_{t \ge 0}$, and
 
 $$
-P^n(A \mid H_t^n(p))
-\xrightarrow{a.s.}
-P^n(A \mid H_\infty^n(p)),
-\qquad
-H_\infty^n(p)=\bigvee_{t \geq 0} H_t^n(p).
+\lim_{t\to\infty}
+E^n[\mathbf 1_A \mid H_t^n(p)]
+=
+E^n[\mathbf 1_A \mid H_\infty^n(p)],
+\qquad P^n\text{-a.s.}
 $$
 ```
 
-This is convergence of posterior assessments, not yet convergence to "correct beliefs".
+The proof is the bounded martingale convergence theorem, with $M_t \in [0,1]$ supplying the uniform integrability needed for the limit identification.
 
-If two agents' priors are mutually singular, the almost-sure statements need not hold on a common objective-probability set.
+{prf:ref}`prop-bk-event-convergence` is convergence of posterior assessments, not convergence to "correct" beliefs.
 
-If the priors have identical null sets, simultaneous convergence holds outside a common null set.
+Two qualifications are worth stating.
 
-The second result sharpens the convergence from events to entire posterior distributions.
+The "a.s." statement is relative to agent $n$'s own prior $P^n$, so if two priors $P^n$ and $P^{n'}$ are mutually singular, the two agents' convergence statements need not hold together on a set that has probability one under both priors.
 
-```{prf:proposition}
-:label: prop-bk-measure-convergence
+If the priors have the same null sets (that is, they are mutually absolutely continuous), simultaneous convergence holds outside a common null set.
 
-When the parameter space $\Theta$ is a complete separable metric space whose
-Borel $\sigma$-field makes it a Borel space, fixed regular versions of the
-conditional probabilities $P_t^n$ converge weakly $P^n$-a.s. to a regular
-version $P_\infty^n$.
+The second result in {cite:t}`BrayKreps1987` sharpens convergence from individual events to the entire posterior measure on the parameter space, given a topological assumption on $\Theta$.
+
+```{prf:assumption}
+:label: assum-bk-borel
+
+The parameter space $\Theta$ is a complete separable metric (Polish) space, and the Borel $\sigma$-field on $\Theta$ generated by its open sets makes $(\Theta, \mathcal B(\Theta))$ a Borel space.
 ```
 
-Thus rational Bayesian learning always produces a limiting posterior, but additional regularity is needed to ensure the limiting posterior assesses the truth correctly.
+In the two-agent example $\Theta = [a,b]$ trivially satisfies this assumption.
 
-## Sharpening the convergence result
+Under {prf:ref}`assum-bk-borel` one can fix regular versions of the conditional probabilities: maps
 
-Now return to the two-agent example in which agent $U$ is uncertain about $\theta^I$.
+$$
+P_t^n: \Omega \to \mathcal P(\Theta),
+\qquad
+\omega \mapsto P_t^n(\omega),
+$$
 
-Let $F_t$ be agent $U$'s posterior distribution over $\theta^I$ after observing the previous price, allocation, and return data.
+such that for each measurable $A \subseteq \Theta$, $\omega \mapsto P_t^n(\omega)(A)$ is a version of $E^n[\mathbf 1_{A \times \Phi^\infty} \mid H_t^n(p)](\omega)$, and $P_t^n(\omega) \in \mathcal P(\Theta)$ is a probability measure $P^n$-a.s.
 
-By weak convergence of posteriors, $F_t$ converges almost surely to a limiting distribution $F_\infty$.
+The sharpened convergence result says these regular versions converge weakly almost surely.
 
-In the benchmark two-agent example, this limiting posterior must be a point mass at the true $\theta^I$.
+```{prf:proposition}
+:label: prop-bk-measure-convergence
 
-The argument has three parts.
+Under {prf:ref}`assum-bk-borel`, the regular versions $P_t^n$ converge weakly to a regular version $P_\infty^n$, $P^n$-a.s.
 
-First, because current equilibrium demand is continuous in the posterior distribution, prices converge to a limiting price functional
+Equivalently, for $P^n$-a.e. $\omega$ and every bounded continuous $f: \Theta \to \mathbb R$,
 
 $$
-p_\infty(s_t; F_\infty, \theta^I, \theta^U).
+\int_\Theta f \, dP_t^n(\omega)
+\xrightarrow[t \to \infty]{}
+\int_\Theta f \, dP_\infty^n(\omega).
 $$
+```
 
-Second, since the signals are IID, the empirical distribution of observed prices converges to the distribution of this limiting price functional.
+The proof in {cite:t}`BrayKreps1987` applies {prf:ref}`prop-bk-event-convergence` to a countable disjoint partition of $\Theta$ by $1/k$-balls, which exists because $\Theta$ is Polish, and then invokes the Portmanteau characterisation of weak convergence on bounded continuous functions.
 
-Third, in this example that limiting price distribution is stochastically decreasing in $\theta^I$ when $F_\infty$ and $\theta^U$ are fixed.
+Rational Bayesian learning therefore always produces a limiting posterior measure.
 
-Therefore the long-run distribution of prices identifies the true value of $\theta^I$.
+But {prf:ref}`prop-bk-measure-convergence` alone does not pin down what that limit is, and additional structure is needed before the limit assesses the truth correctly.
 
-This is the concrete route from convergence of posterior assessments to convergence to the "correct beliefs".
+## Sharpening the convergence result
 
-It relies on smoothness, ergodicity, and identification, rather than on martingale convergence alone.
+We now return to the two-agent example and identify hypotheses under which $P_\infty^U$ is a point mass at the true $\theta^I$.
 
-## Obstacles to convergence
+Write $F_t$ for the CDF of agent $U$'s posterior on $\theta^I$ at date $t$ after observing $(r_{t-1}, p_{t-1}, x^U_{t-1})$ and all earlier data.
+
+{prf:ref}`prop-bk-measure-convergence` yields a random CDF $F_\infty$ such that $F_t$ converges weakly to $F_\infty$, $P^U$-a.s.
+
+Three hypotheses sharpen this to concentration on the truth, corresponding to the three steps in {cite:t}`BrayKreps1987`.
+
+```{prf:assumption}
+:label: assum-bk-continuity
+
+The equilibrium uninformed demand $x^U(p, F)$ is continuous in $F$ with respect to weak convergence, uniformly in $p$ on a $P^U$-full-measure set of prices.
+```
 
-While the positive convergence results are elegant, the same framework also shows when learning can *fail* to produce convergence to REE.
+```{prf:assumption}
+:label: assum-bk-identification
 
-### Obstacle 1: price maps might not settle down
+For fixed $\theta^U$ and limiting posterior $F_\infty$, the limiting price functional $p_\infty(\,\cdot\,; F_\infty, \theta^I, \theta^U)$ is stochastically monotone in $\theta^I$, in the sense that $\theta^I < \theta^{I\,\prime}$ implies $p_\infty(s; F_\infty, \theta^{I\,\prime}, \theta^U)$ first-order stochastically dominates $p_\infty(s; F_\infty, \theta^I, \theta^U)$ when $s$ is drawn from its marginal distribution (a higher risk tolerance lowers the risk premium $\sigma^2 X^I/\theta^I$ and so raises the price).
+```
 
-The step from weak convergence of posteriors to convergence of prices requires smoothness of the equilibrium price functional.
+In the lecture's CARA-Normal setup, {prf:ref}`assum-bk-continuity` holds because the FOC {eq}`eq:bk-foc` defines $x^U$ as a continuous functional of $F$ under weak convergence through bounded integrals, and {prf:ref}`assum-bk-identification` holds because the equilibrium price has the form $p_t = s_t - \sigma^2 X^I_t / \theta^I$ with $X^I_t > 0$ on a full-measure set.
 
-This can be hard, because small changes in a price function can produce large changes in the information communicated by prices.
+The IID assumption on $\{s_t\}$, already part of the model, supplies the ergodicity used in step 2 below.
 
-Thus martingale convergence of beliefs does not by itself guarantee that the economy settles into a stationary price relation.
+Under these three assumptions and the IID signal sequence, the limiting posterior in the two-agent example concentrates on the truth.
 
-### Obstacle 2: prices might not identify the full parameter
+```{prf:proposition}
+:label: prop-bk-sharpening
 
-Even if prices settle down, the long-run distribution of prices need not identify every structural parameter.
+Under {prf:ref}`assum-bk-borel`, {prf:ref}`assum-bk-continuity`, and {prf:ref}`assum-bk-identification`, and given the IID signal sequence $\{s_t\}$, the limiting posterior on $\theta^I$ satisfies
 
-A simple variant has two informed agents whose risk tolerances $\theta^{I1}$ and $\theta^{I2}$ are both unknown to the uninformed agent.
+$$
+F_\infty
+=
+\delta_{\theta^I_{\rm true}}
+\qquad P^U\text{-a.s.}
+$$
+```
 
-In that case, prices reveal only the sum $\theta^{I1}+\theta^{I2}$.
+The proof has three steps.
 
-The uninformed agent cannot disentangle the two risk tolerances from price data alone.
+*Step 1: price functional convergence.*
 
-For decisions in that example, learning the sum is enough, but it is not learning the full state.
+{prf:ref}`assum-bk-continuity` and the weak convergence $F_t \Rightarrow F_\infty$ from {prf:ref}`prop-bk-measure-convergence` imply that equilibrium demands $x^U(p, F_t)$ converge to $x^U(p, F_\infty)$.
 
-### Obstacle 3: the truth might be outside the model
+Combining with market clearing and the price equation {eq}`eq:bk-price` gives $p_t \to p_\infty(s_t; F_\infty, \theta^I, \theta^U)$ on a $P^U$-full-measure set.
 
-An example of {cite:t}`BlumeEasley1982` illustrates a related misspecification problem.
+*Step 2: the limit price distribution is observable.*
 
-In that example, agents can converge to an incorrect model because the true stable price relation has zero prior probability under the models they entertain.
+Since $\{s_t\}$ is IID, the empirical distribution of $\{p_t\}_{t \le T}$ converges almost surely to the distribution of $p_\infty(s_t; F_\infty, \theta^I, \theta^U)$ by the Glivenko-Cantelli theorem.
 
-In the rational-learning formulation, this kind of failure can occur only on a prior-null event.
+The empirical distribution function is $H_\infty^U(p)$-measurable, and therefore so is its limit.
 
-The reason is that rational learning puts the possible price relations generated by the expanded state space inside the Bayesian model from the start.
+*Step 3: identification.*
 
-## Learning *within* versus learning *about* a rational expectations equilibrium
+{prf:ref}`assum-bk-identification` makes the marginal distribution of $p_\infty$ a strictly monotone function of $\theta^I$ given $(F_\infty, \theta^U)$.
 
-One of the deepest conceptual points in {cite:t}`BrayKreps1987` is a distinction between two fundamentally different notions of learning in a rational expectations context.
+Combined with step 2, this means $\theta^I$ is itself $H_\infty^U(p)$-measurable, so for any subinterval $[c,d] \subseteq [a,b]$ the limiting posterior satisfies $P_\infty^U(\theta^I \in [c,d]) = \mathbf 1_{\{\theta^I_{\rm true} \in [c,d]\}}$.
 
-### The distinction
+Combining steps 1, 2, and 3 yields $F_\infty = \delta_{\theta^I_{\rm true}}$.
 
-**Learning *within* a rational expectations equilibrium** is the subject of this lecture.
+The numerical simulation above is consistent with this result.
 
-The phrase refers to Bayesian inference that takes place *inside* a correctly specified model of the economy.
+The posterior density on $\theta^I$ collapses to a spike at $\theta^I_{\rm true}=2$, and the equilibrium informed trade $X^I_t$ converges to the full-information value $2\theta^I_{\rm true}/(\theta^I_{\rm true} + \theta^U)$.
 
-In the rational-learning formulation, agents are uncertain about parameters such as other agents' risk tolerances.
+Hence, the path connecting {prf:ref}`prop-bk-event-convergence` (martingale convergence) to {prf:ref}`prop-bk-sharpening` (concentration on the truth) depends on three model-specific ingredients: continuity, ergodicity, and identification.
 
-But for every possible parameter realization, they are assumed to know the equilibrium price and allocation maps.
+## Obstacles to convergence
 
-The Bayesian learning model is therefore a large rational expectations equilibrium over an expanded state space.
+It is natural to ask when these ingredients can fail, and what the consequences are for learning.
 
-This is why the martingale convergence theorem can be applied so cleanly.
+### Obstacle 1: failure of continuity
 
-**Learning *about* a rational expectations equilibrium** is a quite different enterprise.
+If {prf:ref}`assum-bk-continuity` fails, step 1 of the proof breaks.
 
-Here agents do not begin with the equilibrium map already embedded in their model.
+When the equilibrium price functional is discontinuous in $F$, small changes in beliefs can produce large changes in the information content of prices, and weak convergence of beliefs need not imply convergence of prices.
 
-Instead, they try to infer the price-state relation from data generated while beliefs and behavior are changing.
+{cite:t}`BrayKreps1987` flag this as the most delicate step in their argument.
 
-This is the original problem that motivated the analysis: learning changes behavior, and behavior changes the price-state relation being learned.
+Continuity of $x^U(p, F)$ in $F$ is automatic in this lecture because the FOC integrates a bounded continuous function against $F$, but verifying it in richer market structures often requires non-trivial regularity arguments.
 
-### Why rational learning has limited value
+### Obstacle 2: failure of identification
 
-The expanded-state-space formulation is natural, but it has a main flaw.
+If {prf:ref}`assum-bk-identification` fails, step 3 breaks even when steps 1 and 2 succeed.
 
-It avoids the question of how agents learn the relation between prices and states by assuming that agents already know the equilibrium for every possible economy in the state space.
+Consider a variant with two informed agents and risk tolerances $\theta^{I1}, \theta^{I2}$ both unknown to the uninformed agent.
 
-It does not satisfactorily answer the question "How does a rational expectations equilibrium come about?"
+The CARA-Normal full-communication price has the form
 
-The reason is not that Bayesian convergence is false.
+$$
+p_t
+=
+s_t
+-
+\frac{2\sigma^2}{\theta^{I1} + \theta^{I2} + \theta^U},
+$$
 
-The reason is that the Bayesian agents must have extraordinary insight into the structure of the economy and the implied probabilities of events.
+which depends on $(\theta^{I1}, \theta^{I2})$ only through the sum $\theta^{I1}+\theta^{I2}$.
 
-This is why the framework is useful both as a benchmark and as a warning.
+{prf:ref}`prop-bk-measure-convergence` still applies, but $F_\infty$ is supported on the line
 
-It gives sharp restrictions on what rational learning can imply, but it does not provide a plausible behavioral story for attaining rational expectations.
+$$
+\{(\theta_1, \theta_2): \theta_1 + \theta_2 = \theta^{I1}_{\rm true} + \theta^{I2}_{\rm true}\},
+$$
 
-### The role of "irrational" learning algorithms
+not on the singleton $\{(\theta^{I1}_{\rm true},\theta^{I2}_{\rm true})\}$.
 
-This explains why the literature on learning *about* rational expectations equilibria --- going back to {cite:t}`Bray1982` and {cite:t}`BraySavin1984`, and extended in the influential work of {cite:t}`MarcetSargent1989jet` --- tends to rely on **ordinary least squares (OLS)** or other adaptive algorithms rather than Bayes' rule.
+Convergence occurs, but to a manifold of observationally equivalent parameter values rather than to the truth.
 
-```{note}
-{cite:t}`MarcetSargent1989jet` use some theorems about stochastic approximation to extend some of Bray and Savin's results to other settings.
-```
+### Obstacle 3: misspecification
 
-In those models, agents estimate perceived laws of motion from observed data and update the estimates as new observations arrive.
+A separate obstacle arises if the true pricing relation lies outside the agent's prior support.
 
-Such rules are computationally tractable and can converge in important examples.
+{cite:t}`BlumeEasley1982` give a stylised version of this obstacle, and {doc}`likelihood_ratio_process_2` develops the Blume-Easley heterogeneous-beliefs model in this lecture series.
 
-But they are *"irrational"* in the specific sense used here.
+Each agent entertains two competing models $\psi_n^0$ and $\psi_n^1$ over $(I_t, p_t)$, and an equilibrium can exist in which agents assign asymptotic probability one to a model that places zero probability on the actually-observed price relation.
 
-An agent who already understood the full equilibrium model would not generally use those rules as the Bayesian optimum.
+In strict rational learning the agent's prior must be supported on Bayesian-consistent models in the expanded state space, so this failure can occur only on a $P^U$-null event.
 
-The attraction of these rules is precisely that they ask a different question.
+Rational learning embeds every candidate pricing relation in the prior from date zero, so any candidate with positive prior weight cannot be dominated by one with zero prior weight no matter what the data say.
 
-They ask whether agents using standard statistical procedures on the data generated by the model could eventually learn to form rational expectations.
+## Learning within versus learning about a rational expectations equilibrium
 
-Rational Bayesian learning is demanding as a behavioral assumption, but it also disciplines adaptive learning stories.
+The framework above points to an important conceptual distinction in {cite:t}`BrayKreps1987`.
 
-The proposed discipline is that a stationary limiting equilibrium should not leave agents' beliefs systematically contradicted by observations.
+### The distinction
 
-In the long run, equilibrium expectations must either keep changing or become rational.
+Learning *within* a rational expectations equilibrium is the topic of this lecture.
 
-There is a fundamental tension at the heart of learning about rational expectations equilibria:
+It is Bayesian inference inside a correctly specified model: {prf:ref}`assum-bk-borel`, {prf:ref}`assum-bk-continuity`, and {prf:ref}`assum-bk-identification` all hold, and the prior puts positive weight on the truth.
 
-* A fully rational (Bayesian, correctly specified) learner can only apply Bayes' rule to a model whose structure is *already known*, but the structure of the REE is exactly what the agent is trying to learn.
-* A learner who uses an adaptive algorithm (OLS, least-mean-squares, etc.) can potentially converge to the REE, but only by using a rule that cannot be derived from Bayesian rationality applied to a correctly specified model.
+Agent $U$ is uncertain about $\theta^I$, but for every candidate value he already knows the equilibrium price and allocation maps.
 
-The rational-learning formulation avoids this tension by assumption: agent $U$ knows how each possible risk tolerance would map histories into equilibrium prices and trades.
+The expanded-state-space formulation $\Omega = \Theta \times \Phi^\infty$ embeds a rational expectations equilibrium on the larger space, and inference reduces to conditional probability over $\Theta$.
 
-The full equilibrium simulation above embeds exactly that knowledge, since `equilibrium_XI` is recomputed from $f_t$ at every date.
+Learning *about* a rational expectations equilibrium is a fundamentally different exercise.
 
-The device makes Bayesian consistency transparent, but it still sidesteps the deeper difficulty of learning *about* an REE from scratch.
+The agent does not begin with the equilibrium map embedded in his probability model.
 
+Instead, he must infer the price-state relation from data generated while his own beliefs and behavior co-evolve with the data.
 
-## Summary
+### The trade-off
+
+The two notions sit on opposite sides of a precise trade-off.
 
-This lecture has discussed rational learning in the sense of {cite:t}`BrayKreps1987`:
+A correctly specified Bayesian learner enjoys the convergence guarantees in {prf:ref}`prop-bk-event-convergence` and {prf:ref}`prop-bk-measure-convergence`, but only because the equilibrium has been built into the prior from date zero.
 
-1. **Rational learning** is modeled by expanding the state space to include unknown structural parameters such as risk tolerances.
+An adaptive learner who treats the price-state relation as something to be estimated can hope to discover it from data, but the estimator he uses cannot be derived from Bayes' rule applied to a correctly specified model.
 
-2. **Posterior assessments converge** because conditional probabilities form bounded martingales.
+No learning algorithm delivers both Bayesian rationality and discovery of the equilibrium structure at the same time.
 
-3. **Posterior measures converge weakly** under standard topological assumptions on the parameter space.
+The literature on learning *about* rational expectations equilibria, beginning with {cite:t}`Bray1982` and {cite:t}`BraySavin1984` and extended by {cite:t}`MarcetSargent1989jet`, takes the second side of the trade-off and replaces Bayes' rule with **ordinary least squares** or related recursive estimators.
 
-4. **Correct learning** requires more than martingale convergence, because the limiting price distribution must identify the true parameter.
+The companion lecture {doc}`ls_learning` develops this least-squares-learning framework in self-referential models and traces the resulting dynamics through the associated ordinary differential equation.
 
-5. **In the two-agent example**, the uninformed agent learns the informed agent's risk tolerance because the limiting price distribution is monotone in that parameter.
+Those rules are computationally tractable and converge in important examples, but they are *not* Bayesian-optimal under any correctly specified prior.
 
-6. **Identification can fail** when prices reveal only a composite parameter, such as the sum of two informed agents' risk tolerances.
+## Summary
 
-7. **Misspecification matters** because a stable price relation outside the learner's prior support cannot be learned by Bayes' rule.
+This lecture implemented the rational-learning equilibrium of {cite:t}`BrayKreps1987`.
 
-8. **The full simulation** above solves the within-period equilibrium from the posterior at every date and shows the posterior on $\theta^I$ collapsing to a point mass at $\theta^I_{\rm true}$.
+Posterior assessments converge by bounded martingale convergence ({prf:ref}`prop-bk-event-convergence`), and posterior measures converge weakly under a Polish-Borel assumption ({prf:ref}`prop-bk-measure-convergence`).
 
-The broader message is that while the mathematics of Bayesian learning is powerful, its application to learning *about* rational expectations equilibria is subtle and the conditions under which learning succeeds are more restrictive than they might appear.
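+Concentration on the truth additionally requires continuity ({prf:ref}`assum-bk-continuity`), ergodicity, and identification ({prf:ref}`assum-bk-identification`); the first two obstacles above are failures of continuity and identification, while the third, misspecification, is a failure of the prior's support to contain the truth.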
+
+The simulation confirms both conclusions: the posterior on $\theta^I$ collapses to $\theta^I_{\rm true}$ and the equilibrium informed trade reaches its full-information value.
+
+Rational learning describes the limits of Bayesian inference *given* the equilibrium structure; adaptive learning, in {doc}`ls_learning`, describes how that structure can be learned in the first place.
 
 
 ## Exercises
 
-```{exercise}
+````{exercise}
 :label: rle_ex1
 
 **Off-centre prior**
 
 The baseline simulation uses a uniform prior on $\theta^I \in [0.5, 4]$.
 
-(a) Re-run the simulation with a prior whose mass sits *above* the true value, for example
+1. Re-run the simulation with a prior whose mass sits *above* the true value, for example
 
-```
+```python
 prior = lambda θ: (θ - 0.5)**3 * (4 - θ)
 ```
 
 which peaks near $\theta = 3.1$.
 
-(b) Plot the posterior mean over time alongside the uniform-prior baseline.
+2. Plot the posterior mean over time alongside the uniform-prior baseline.
 
-(c) Does the posterior eventually concentrate on $\theta^I_{\rm true}$, and how does the speed compare?
-```
+3. Does the posterior eventually concentrate on $\theta^I_{\rm true}$, and how does the speed compare?
+````
 
 ```{solution-start} rle_ex1
 :class: dropdown
@@ -787,11 +917,11 @@ $$
 
 The sensitivity $|\partial s_t/\partial \theta| = \sigma^2 X^I_t/\theta^2$ depends on the level of $\theta^I_{\rm true}$ through $X^I_t$ and $\theta^{-2}$.
 
-(a) Run the simulation for $\theta^I_{\rm true} \in \{0.8, 2.0, 3.5\}$, holding everything else at the baseline.
+1. Run the simulation for $\theta^I_{\rm true} \in \{0.8, 2.0, 3.5\}$, holding everything else at the baseline.
 
-(b) Plot the posterior variance on a log scale for each case.
+2. Plot the posterior variance on a log scale for each case.
 
-(c) Which value of $\theta^I_{\rm true}$ yields the fastest concentration, and does the result match the sensitivity formula above?
+3. Which value of $\theta^I_{\rm true}$ yields the fastest concentration, and does the result match the sensitivity formula above?
 ```
 
 ```{solution-start} rle_ex2
@@ -817,8 +947,6 @@ The smallest $\theta^I_{\rm true}$ produces the steepest decline in posterior va
 
 The reason is that the sensitivity $\sigma^2 X^I_t/\theta^2$ scales as $\theta^{-2}$ for fixed $X^I_t$, so the same noise level conveys much more information about $\theta^I$ when $\theta^I$ is small.
 
-The asymmetry is a feature of the geometry of the equilibrium map, not of the learning rule itself.
-
 ```{solution-end}
 ```
 
@@ -831,11 +959,11 @@ Larger $\sigma^2$ widens the conditional density of $s_t$ given $r_t$, which one
 
 But $\sigma^2$ also scales the price intercept in {eq}`eq:bk-price`, so price dispersion across candidate $\theta$ grows with $\sigma^2$.
 
-(a) Run the simulation with $\sigma^2 \in \{0.25, 1.0, 4.0\}$, keeping $\tau^2 = 1$ fixed.
+1. Run the simulation with $\sigma^2 \in \{0.25, 1.0, 4.0\}$, keeping $\tau^2 = 1$ fixed.
 
-(b) Plot the posterior variance on a log scale for each $\sigma^2$.
+2. Plot the posterior variance on a log scale for each $\sigma^2$.
 
-(c) Which effect dominates? Explain in terms of the signal-to-noise ratio for inferring $\theta^I$ from the price.
+3. Which effect dominates? Explain in terms of the signal-to-noise ratio for inferring $\theta^I$ from the price.
 ```
 
 ```{solution-start} rle_ex3

From 017479794b625b7526865aeb82bc099eb9bc539d Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Tue, 2 Jun 2026 16:58:50 +1000
Subject: [PATCH 21/25] updates

---
 lectures/ls_learning.md          | 1388 +++++++++++++++---------------
 lectures/rational_learning_re.md |  200 ++---
 2 files changed, 810 insertions(+), 778 deletions(-)

diff --git a/lectures/ls_learning.md b/lectures/ls_learning.md
index 8ad46147c..07f7fa9b8 100644
--- a/lectures/ls_learning.md
+++ b/lectures/ls_learning.md
@@ -29,19 +29,16 @@ kernelspec:
 ## Overview
 
 This lecture is a companion to {doc}`rational_learning_re`, which presents the
-Bray–Kreps perspective on rational learning. 
+Bray–Kreps perspective on rational learning.
 
-The present lecture examines the
-closely related but distinct question of whether *least squares* learning
+We examine the related but distinct question of whether *least squares* learning
 converges to a rational expectations equilibrium in self-referential models.
 
-
-This lecture presents the framework of {cite:t}`MarcetSargent1989jet` for studying
+We present the framework of {cite:t}`MarcetSargent1989jet` for studying
 **least squares learning** in a class of **self-referential** linear stochastic models.
 
-A self-referential model is one in which the *actual* law of motion for the
-economy depends on the *perceived* law of motion held by the agents within
-it. 
+A self-referential model is one where the *actual* law of motion for the
+economy depends on the *perceived* law of motion of agents within it.
 
 In a rational expectations equilibrium (REE) the two coincide: the
 perceived and actual laws of motion are the same.
@@ -50,17 +47,19 @@ But if agents start away
 from equilibrium and update their beliefs by running least squares regressions,
 will they converge to the REE?
 
-{cite:t}`MarcetSargent1989jet` answer this question by exploiting a powerful
+{cite:t}`MarcetSargent1989jet` answer this question by using a powerful
 technique from systems-control engineering: the differential equation
 approach of {cite:t}`Ljung1977`.
 
-They applied stochastic
-difference equation to describe how beliefs evolve can be approximated, in the
-limit, by a deterministic ordinary differential equation (ODE).
+The key insight is that the stochastic difference equation of belief
+evolution can be approximated by a deterministic ordinary differential
+equation (ODE) in the limit.
+
+Almost-sure convergence can be studied through the stability of that ODE.
 
-Almost-sure
-convergence of least squares to the REE is then equivalent to *local stability*
-of the REE as a fixed point of that ODE.
+Local stability of the small ODE is closely related to local convergence, while
+global almost-sure convergence also requires boundedness and
+domain-of-attraction conditions.
 
 The framework unifies and extends earlier work by {cite:t}`Bray1982` and
 {cite:t}`BraySavin1984` and connects naturally to the distinction between learning
@@ -71,68 +70,23 @@ whose data-generating process shifts with beliefs) discussed in
 
 
 
-Let's begin with the imports we'll use throughout
+Let's begin with the imports we'll use throughout.
 
 ```{code-cell} ipython3
 import numpy as np
 import matplotlib.pyplot as plt
-from matplotlib.gridspec import GridSpec
 from numpy import linalg as la
 from scipy.integrate import solve_ivp
 from scipy.optimize import fsolve
-
-np.random.seed(42)
 ```
 
-We also define two helper functions used throughout the lecture: one to
-simulate recursive least squares in a scalar self-referential model, and one
-to solve the associated ODE.
-
-```{code-cell} ipython3
-def simulate_rls_scalar(T_map, σ_u, β0, T_periods=500, N_paths=100,
-                        a_seq=None, seed=0):
-    """
-    Simulate recursive least squares for the scalar model
-    z1_t = T(β_t) + u_t with constant regressor z2_t = 1.
-    """
-    rng = np.random.default_rng(seed)
-    if a_seq is None:
-        a_seq = np.ones(T_periods)
-
-    β_paths = np.empty((N_paths, T_periods))
-
-    for i in range(N_paths):
-        β = β0
-        R = 1.0
-
-        for t in range(T_periods):
-            α_t = a_seq[t]
-            z2 = 1.0
-            u_t = rng.normal(0, σ_u)
-            z1 = T_map(β) * z2 + u_t
-
-            R = R + (α_t / (t + 1)) * (z2**2 - R / α_t)
-            R = max(R, 1e-8)
-            β = β + (α_t / (t + 1)) / R * z2 * (z1 - β * z2)
-
-            β_paths[i, t] = β
-
-    return β_paths
-
-
-def solve_ode(f_ode, β0, t_span=(0, 80), n_points=1000):
-    """Solve the scalar ODE dβ/dt = f_ode(β) from β0."""
-    sol = solve_ivp(lambda t, y: [f_ode(y[0])], t_span, [β0],
-                    t_eval=np.linspace(*t_span, n_points), method='RK45',
-                    max_step=0.1)
-    return sol.t, sol.y[0]
-```
+Before simulating anything, we describe the object being learned.
 
 ## The self-referential structure
 
 ### Perceived and actual laws of motion
 
-At each date $t$, agents hold a **perceived law of motion** summarised by a
+At each date $t$, agents hold a **perceived law of motion** summarized by a
 parameter matrix $\beta_t$.
 
 They believe that the variable $z_{1t}$ they care
@@ -145,21 +99,26 @@ $$ (eq:perceived_lom)
 where $z_{2t}$ is a vector of variables agents use to forecast $z_{1,t+1}$, and
 $\eta_t$ is orthogonal to all past $z_2$'s.
 
-Because agents optimise (or behave) on the basis of this belief, their actions
+Because agents optimize (or behave) on the basis of this belief, their actions
 feed back into the economy.
 
-The actual law of motion for the full state
-vector $z_t = (z_{1t}, z_{1t}^c)^\top$ is
+To write the actual law of motion for the full state vector
+$z_t = (z_{1t}, z_{1t}^c)^\top$, partition $A(\beta_t)$ conformably with
+$(z_{2,t-1}^c, z_{2,t-1})$ as
+$A(\beta_t) = [A_c(\beta_t)\; A_2(\beta_t)]$.
 
 $$
-z_t = \begin{bmatrix} 0 & T(\beta_t) \\ A(\beta_t) & \end{bmatrix}
+z_t = \begin{bmatrix}
+        0 & T(\beta_t) \\
+        A_c(\beta_t) & A_2(\beta_t)
+      \end{bmatrix}
       \begin{bmatrix} z_{2,t-1}^c \\ z_{2,t-1} \end{bmatrix}
     + \begin{bmatrix} V(\beta_t) \\ B(\beta_t) \end{bmatrix} u_t ,
 $$ (eq:actual_lom)
 
 where $u_t$ is i.i.d. white noise with covariance $\Sigma$.
 
-The mapping $T$ is the key object: it maps the *perceived* coefficient $\beta$
+The mapping $T$ is the key object, as it maps the *perceived* coefficient $\beta$
 to the coefficient that *actually* governs $z_{1t}$ in equilibrium.
 
 A
@@ -174,9 +133,15 @@ Define $R_t$ as a running estimate of the second-moment matrix $E z_{2t}z_{2t}^\
 Updating equations are
 
 $$
-\beta_t^\top = \beta_{t-1}^\top + \frac{\alpha_t}{t} R_{t-1}^{-1}
-           z_{2,t-2} z_{2,t-2}^\top \bigl[ T(\beta_{t-1})^\top - \beta_{t-1}^\top \bigr]
-         + \frac{\alpha_t}{t} z_{2,t-2} u_{t-1}^\top V(\beta_{t-1})^\top ,
+\beta_t^\top
+=
+\beta_{t-1}^\top
++ \frac{\alpha_t}{t} R_{t-1}^{-1}
+\left\{
+z_{2,t-2} z_{2,t-2}^\top
+\bigl[ T(\beta_{t-1}) - \beta_{t-1} \bigr]^\top
++ z_{2,t-2} u_{t-1}^\top V(\beta_{t-1})^\top
+\right\},
 $$ (eq:rls_beta)
 
 $$
@@ -201,19 +166,16 @@ version in which agents update using $z_{1t}$ and $z_{2,t-1}$ at date $t$.
 That timing creates simultaneous determination, because $z_t$ depends on the
 same estimate $\beta_t$ that is being updated from $z_t$.
 
-The extra requirement is that the date-$t$ system have a unique solution
+An extra requirement is that the date-$t$ system must have a unique solution
 $(\beta_t, R_t, z_t)$ for each history.
 
 Under that uniqueness condition, the same full ODE {eq}`eq:full_ode` and small ODE {eq}`eq:small_ode`
 govern convergence.
 
-Thus the stability criterion below is not an artifact of the one-period lag in
-the displayed learning rule.
-
 ```{note}
-As {cite:t}`BraySavin1984` and {cite:t}`BrayKreps1987` emphasise, the RLS algorithm
+As {cite:t}`BraySavin1984` and {cite:t}`BrayKreps1987` emphasize, the RLS algorithm
 cannot be derived from Bayes' rule applied to a correctly specified model, because
-during the learning transition the data-generating process is non-stationary —
+during the learning transition the data-generating process is non-stationary ---
 beliefs shift the equilibrium, which shifts the data.
 
 The algorithm is
@@ -221,84 +183,116 @@ The algorithm is
 when it is not.
 ```
 
-## The governing ODE
+## Why a differential equation governs the limit
+
+The RLS recursion {eq}`eq:rls_beta`–{eq}`eq:rls_R` is a *stochastic difference equation* with two key features.
+
+First, the *step size* in front of each update is $\alpha_t / t$, which shrinks to zero as $t$ grows.
+
+Second, the bracketed expression on the right-hand side of {eq}`eq:rls_beta` is, at the long-run average values of the regressors and noise, just the discrepancy $T(\beta_{t-1}) - \beta_{t-1}$ that measures how far perceived beliefs are from the law of motion they actually generate.
 
-### Ljung's differential-equation approach
+Combine these two facts and a clear picture emerges.
 
-{cite:t}`MarcetSargent1989jet` apply the theorem of {cite:t}`Ljung1977` to
-characterise the almost-sure limiting behaviour of the stochastic system
-{eq}`eq:rls_beta`–{eq}`eq:rls_R`.
+The recursion makes only small adjustments to $\beta_t$ each period and those adjustments average out, by the law of large numbers, to the deterministic drift $T(\beta) - \beta$.
 
-The central object is the *small ODE*
+As the gain $\alpha_t/t$ vanishes, the time-rescaled trajectory $\beta_t$ looks more and more like a solution to the continuous-time differential equation
 
 $$
-\frac{d\beta}{dt} = T(\beta) - \beta ,
+\frac{d\beta}{dt} = T(\beta) - \beta .
 $$ (eq:small_ode)
 
-whose fixed points are exactly the rational expectations equilibria.
+The ODE clock is cumulative gain time, not calendar time.
 
-The *full ODE* associated with the joint process $(\beta_t, R_t)$ is
+When $\alpha_t=1$, calendar period $t$ corresponds approximately to ODE time
+$\sum_{s=1}^t 1/s \approx \log t$.
+
+This idea, due to {cite:t}`Ljung1977`, is what lets {cite:t}`MarcetSargent1989jet` reduce the analysis of a noisy adaptive learning rule to the much easier study of a deterministic ODE.
+
+The following sections make the connection precise.
+
+## The governing ODE
+
+### Small and full ODEs
+
+The *small ODE* {eq}`eq:small_ode` keeps only the drift in $\beta$, holding the second-moment estimate $R$ at its long-run value.
+
+Its rest points are precisely the rational expectations equilibria of the model: $\dot\beta = 0$ if and only if $T(\beta) = \beta$.
+
+The *full ODE* tracks both $\beta$ and $R$ jointly:
 
 $$
-\frac{d}{dt}\begin{bmatrix} \beta \\ R \end{bmatrix}
+\frac{d}{dt}\begin{bmatrix} \beta^\top \\ R \end{bmatrix}
 = \begin{bmatrix} R^{-1} M_{z_2}(\beta)\,[T(\beta) - \beta]^\top \\ M_{z_2}(\beta) - R \end{bmatrix} ,
 $$ (eq:full_ode)
 
-where $M_{z_2}(\beta) = E z_{2t}z_{2t}^\top$ evaluated at the stationary
-distribution induced by $\beta$.
+where $M_{z_2}(\beta) = E z_{2t}z_{2t}^\top$ is computed at the stationary distribution of $z_{2t}$ that prevails when agents believe the perceived law has constant parameter $\beta$.
+
+The fixed point of {eq}`eq:full_ode` is $(\beta_f, R_f)$ with $R_f = M_{z_2}(\beta_f)$ --- the same $\beta_f$ as the small ODE, paired with the second-moment matrix consistent with it.
+
+### Regularity assumptions
 
-The fixed point of {eq}`eq:full_ode` is $(\beta_f, R_f)$ where
-$R_f = M_{z_2}(\beta_f)$.
+{cite:t}`MarcetSargent1989jet` distinguish two groups of assumptions, with quite different roles.
 
-### Regularity and boundedness assumptions
+The first five are standard regularity conditions on the operator $T$, the shocks $u_t$, and the gain sequence; they are usually easy to check in applications.
 
-The convergence theorems below presuppose the following conditions on the
-operator $T$, the shocks $u_t$, the gain sequence $\{\alpha_t\}$, and the
-domain of the algorithm.
+Let $D_s \subset \mathbb{R}^{n_1 \times n_2}$ be the set on which
+$T(\beta)$, $A(\beta)$, $B(\beta)$, $V(\beta)$ are well defined and the
+eigenvalues of
 
-Let $D_s \subset \mathbb{R}^{n_1 \times n_2}$ be the set on which $T(\beta)$,
-$A(\beta)$, $B(\beta)$, $V(\beta)$ are well defined and the eigenvalues of
-$\bigl[\begin{smallmatrix}0 & T(\beta)\\ A(\beta) & \end{smallmatrix}\bigr]$
-are less than unity in modulus.
+$$
+\begin{bmatrix}
+0 & T(\beta) \\
+A_c(\beta) & A_2(\beta)
+\end{bmatrix}
+$$
 
-```{prf:assumption} A.1 (unique fixed point)
+are less than one in modulus, so that the state process induced by belief
+$\beta$ is covariance-stationary.
+
+```{prf:assumption} Unique REE
 :label: ass-ms-a1
 
-The operator $T$ has a unique fixed point $\beta_f = T(\beta_f)$ with
-$\beta_f \in D_s$.
+The operator $T$ has a unique fixed point $\beta_f = T(\beta_f)$ with $\beta_f \in D_s$.
 ```
 
-```{prf:assumption} A.2 (smoothness)
+```{prf:assumption} Smoothness of T, A, B, V
 :label: ass-ms-a2
 
 $T$ is twice differentiable and $A, B, V$ each have one derivative in $D_s$.
 ```
 
-```{prf:assumption} A.3 (nonsingular covariance)
+```{prf:assumption} Nonsingular limit covariance
 :label: ass-ms-a3
 
 The covariance matrix $M_{z_2}(\beta_f)$ is nonsingular.
 ```
 
-```{prf:assumption} A.4 (gain sequence)
+```{prf:assumption} Gain sequence
 :label: ass-ms-a4
 
-For all $t$, $\alpha_t > 0$; $\alpha_t$ is non-decreasing in $t$; $\alpha_t \to 1$
-as $t \to \infty$; and $\limsup_{t \to \infty} t\,|\alpha_t - \alpha_{t-1}| = K < \infty$.
+For all $t$, $\alpha_t > 0$; $\alpha_t$ is non-decreasing in $t$; $\alpha_t \to 1$ as $t \to \infty$; and $\limsup_{t \to \infty} t\,|\alpha_t - \alpha_{t-1}| < \infty$.
 ```
 
-```{prf:assumption} A.5 (shocks)
+```{prf:assumption} Shock moments
 :label: ass-ms-a5
 
-The vector $u_t$ is serially independent, and $E|u_{it}|^p < \infty$ for all
-$p > 1$ and all $i = 1, \ldots, m$.
+The vector $u_t$ is serially independent, and $E|u_{it}|^p < \infty$ for all $p > 1$ and all $i = 1, \ldots, m$.
 ```
 
-```{prf:assumption} A.6 (boundedness along a subsequence)
+These five say: there is a unique target, the model is smooth around it, the regression has a well-defined precision matrix in the limit, the gain shrinks at the right speed, and the shocks have enough moments for laws of large numbers to work.
+
+The remaining two are *boundedness* conditions.
+
+They are needed because Ljung's theorem requires the relevant sample path to
+return to a bounded region infinitely often.
+
+In this model that includes the regressor process $z_{2t}$ and the covariance
+estimate $R_t$.
+
+```{prf:assumption} Boundedness along a subsequence
 :label: ass-ms-a6
 
-There exist a set $\Omega_0$ with $P(\Omega_0) = 1$, random variables
-$C_1(\omega)$ and $C_2(\omega)$, and a subsequence $\{t_k(\omega)\}$ such that
+There exist a set $\Omega_0$ with $P(\Omega_0) = 1$, random variables $C_1(\omega)$ and $C_2(\omega)$, and a subsequence $\{t_k(\omega)\}$ such that
 
 $$
 |z_{2t_k}(\omega)| < C_1(\omega) \quad\text{and}\quad |R_{t_k}(\omega)| < C_2(\omega)
@@ -307,42 +301,43 @@ $$
 for all $\omega \in \Omega_0$ and all $k = 1, 2, \ldots$.
 ```
 
-```{prf:assumption} A.7 (projection or compactness)
+```{prf:assumption} Sample path stays in a workable domain
 :label: ass-ms-a7
 
-Either
+Either of the following holds:
 
-- **(A.7.1)** $D_1 = D_2 = \mathbb{R}^{n_1 \times (n_2)^3}$, and given the
-  set $\Omega_0$ and subsequence $\{t_k\}$ from {prf:ref}`ass-ms-a6`, there
-  exists a compact $D' \subset D_s$ with $\beta_{t_k}(\omega) \in D'$ for all
-  $k$ and all $\omega \in \Omega_0$; moreover, for any initial condition
-  $(\beta(0), R(0))$ with $\beta(0) \in D'$ and $|R(0)| < C_2(\omega)$,
-  trajectories of {eq}`eq:full_ode` never leave a closed subset of $D_s$;
+- *Compact-state version.* $D_1 = D_2 = \mathbb{R}^{n_1 \times (n_2)^3}$ and there exists a compact $D' \subset D_s$ that contains $\beta_{t_k}(\omega)$ for all $k$ and all $\omega \in \Omega_0$, with trajectories of {eq}`eq:full_ode` originating in $D'$ never leaving a closed subset of $D_s$.
 
-- **or (A.7.2)** $D_2$ is closed, $D_1$ is open and bounded, $\beta \in D_s$
-  for every $(\beta, R) \in D_1$, and trajectories of {eq}`eq:full_ode` with
-  initial conditions in $D_2$ never leave a closed subset of $D_1$.
+- *Projection-set version.* $D_2$ is closed, $D_1$ is open and bounded, $\beta \in D_s$ for every $(\beta, R) \in D_1$, and trajectories of {eq}`eq:full_ode` with initial conditions in $D_2$ never leave a closed subset of $D_1$.
 ```
 
-Let $D_A$ denote the domain of attraction of the unique equilibrium
-$(\beta_f, R_f)$ of {eq}`eq:full_ode`.
+{prf:ref}`ass-ms-a6` is automatic when the regressors $z_{2t}$ are *exogenous* and ergodic, but it can be delicate when $z_{2t}$ contains endogenous variables.
+
+{prf:ref}`ass-ms-a7` is satisfied in the compact-state version when the model has a natural bounded domain on which $T$ is well-defined; otherwise the projection-set version corresponds to using the projection facility described below.
+
+Let $D_A$ denote the domain of attraction of the unique equilibrium $(\beta_f, R_f)$ of {eq}`eq:full_ode`.
 
 ### Convergence of least squares
 
-```{prf:proposition}
+```{prf:proposition} Least-squares convergence
 :label: prop-ms-convergence
 
-Assume (A.1)–(A.6). If either
+Assume {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`,
+{prf:ref}`ass-ms-a3`, {prf:ref}`ass-ms-a4`, {prf:ref}`ass-ms-a5`,
+and {prf:ref}`ass-ms-a6`. If either
 
-- (A.7.1) is satisfied and $D' \subset D_A$, or
-- (A.7.2) is satisfied and $D_1 \subset D_A$,
+- the compact-state version of {prf:ref}`ass-ms-a7` is satisfied and
+  $D' \subset D_A$, or
+- the projection-set version of {prf:ref}`ass-ms-a7` is satisfied and
+  $D_1 \subset D_A$,
 
 then $\beta_t \to \beta_f$ almost surely as $t \to \infty$.
 ```
 
-{prf:ref}`prop-ms-convergence` reduces almost-sure convergence of recursive
-least squares to *stability* of the ODE {eq}`eq:full_ode` at $(\beta_f, R_f)$
-plus a boundedness guarantee for the sample path.
+{prf:ref}`prop-ms-convergence` reduces the problem of almost-sure convergence
+of recursive least squares to checking the *stability* of the ODE
+{eq}`eq:full_ode` at $(\beta_f, R_f)$ and guaranteeing the boundedness of the
+sample path.
 
 ### Stability governs convergence
 
@@ -354,9 +349,9 @@ $$
 $$ (eq:jacobian)
 
 and let $h(\beta, R)$ denote the Jacobian of the right-hand side of the full
-ODE {eq}`eq:full_ode` after stacking $(\beta, R)$ into a column vector.
+ODE {eq}`eq:full_ode` after stacking $(\beta^\top, R)$ into a column vector.
 
-```{prf:proposition}
+```{prf:proposition} Jacobian reduction
 :label: prop-ms-jacobian-eigenvalues
 
 The matrix $h(\beta_f, R_f)$ has $(n_2)^2$ repeated eigenvalues equal to
@@ -367,17 +362,20 @@ eigenvalues of $\mathcal{M}$.
 Consequently:
 
 * If all eigenvalues of $\mathcal{M}$ have *strictly negative real parts*, both
-  {eq}`eq:small_ode` and {eq}`eq:full_ode` are locally stable, and
-  {prf:ref}`prop-ms-convergence` then yields $\beta_t \to \beta_f$ almost
-  surely.
+  {eq}`eq:small_ode` and {eq}`eq:full_ode` are locally stable.
+
+* Under the boundedness and domain-of-attraction conditions in
+  {prf:ref}`prop-ms-convergence`, this local stability can be used to obtain
+  almost-sure convergence of $\beta_t$ to $\beta_f$.
 
 * If any eigenvalue of $\mathcal{M}$ has *positive real part*, then the next
   proposition shows that convergence is impossible.
 
-```{prf:proposition}
+```{prf:proposition} Necessity
 :label: prop-ms-necessity
 
-Assume (A.1)–(A.5).
+Assume {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`,
+{prf:ref}`ass-ms-a3`, {prf:ref}`ass-ms-a4`, and {prf:ref}`ass-ms-a5`.
 
 1. Let $\hat\beta \neq \beta_f$ and suppose $M_{z_2}(\hat\beta)$ is positive
    definite and $\hat\beta \in \mathrm{int}(D_2)$. Then $P(\beta_t \to \hat\beta) = 0$.
@@ -386,20 +384,33 @@ Assume (A.1)–(A.5).
    part, then $P(\beta_t \to \beta_f) = 0$.
 ```
 
+The first part says that recursive least squares cannot settle on a
+non-equilibrium belief in the interior of the admissible domain.
+
+If learning has a limit, that limit must be a fixed point of $T$.
+
+The second part says that local instability is not just slow convergence.
+
+If the REE is unstable for the learning ODE, convergence to that REE has
+probability zero.
+
 The stability condition $\mathrm{Re}(\lambda_i(\mathcal{M})) < 0$ for all $i$ is
-what the E-stability literature (see {cite:t}`Evans1985`) calls **E-stability**:
+what is called **E-stability** (see {cite:t}`Evans1985`):
 the REE is a stable rest point of the expectational dynamics
 $\dot\beta = T(\beta) - \beta$.
 
+E-stability plays the role here that prior support plays in Bayesian learning:
+it tells us when the learning process can find its way to the equilibrium.
+
 ### The projection facility
 
 E-stability is necessary but not quite sufficient for almost-sure convergence.
 
-Ljung's theorem requires the sample path $(\beta_t, R_t)$ to remain in a
-*bounded region* with probability one.
+Ljung's theorem requires the learning process and the relevant regressors to
+return to bounded regions infinitely often with probability one.
 
-This boundedness is the job of the **projection
-facility**.
+The **projection facility** is a procedure that enforces boundedness of the
+estimated coefficients and covariance matrix.
 
 #### What the projection facility does
 
@@ -425,34 +436,31 @@ The set $D_2
 \subset D_1$ is a slightly smaller "safe" region to which the algorithm is
 retracted whenever it threatens to leave $D_1$.
 
-The facility can be thought of as forcing agents to *discard observations that
-are inconsistent with their priors*, a form of bounded rationality that is
-necessary for the mathematical argument but innocuous in practice.
-
-#### Why it is needed
-
 Without the projection facility, the stochastic path $(\beta_t, R_t)$ might
 temporarily wander to regions where the system {eq}`eq:actual_lom` is
 non-stationary (e.g., an explosive VAR).
 
-Ljung's convergence theorem requires
-the algorithm to revisit a compact set infinitely often; the projection facility
-guarantees this by construction.
+The projection facility keeps $(\beta_t, R_t)$ inside a chosen admissible
+region.
 
-Formally, {cite:t}`MarcetSargent1989jet` require that the ODE trajectories
-originating in $D_1$ point *inward* at the boundary $\partial D_1$, that is,
-the vector field $T(\beta) - \beta$ must point back into $D_1$ everywhere on its
-boundary.
+When regressors include endogenous variables, the separate boundedness
+condition {prf:ref}`ass-ms-a6` still has to control $z_{2t}$ along a
+subsequence.
 
-When this holds, the projection is *invoked only finitely many times* with
-probability one, and after the last invocation the algorithm runs as plain RLS.
+Formally, {cite:t}`MarcetSargent1989jet` require that trajectories of the full
+ODE {eq}`eq:full_ode` originating in $D_1$ do not leave $D_1$.
+
+This is often checked by verifying that the full vector field points toward the
+interior at the boundary $\partial D_1$.
 
 ```{prf:corollary}
 :label: cor-ms-projection-dichotomy
 
-Assume (A.1)–(A.6), that $(\beta, R) \in D_1$ implies $\beta \in D_s$, and
-that $D_1$ is open and bounded with $D_1 \subset D_A$. Then for some
-subsequence $\{t_k(\omega)\}$,
+Assume {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`,
+{prf:ref}`ass-ms-a3`, {prf:ref}`ass-ms-a4`, {prf:ref}`ass-ms-a5`,
+and {prf:ref}`ass-ms-a6`. Suppose also that $(\beta, R) \in D_1$ implies
+$\beta \in D_s$, and that $D_1$ is open and bounded with $D_1 \subset D_A$.
+Then for some subsequence $\{t_k(\omega)\}$,
 
 $$
 P(\beta_t \to \beta_f) + P\bigl(\beta_{t_k} \to (D_1 \setminus D_2)\bigr) = 1.
@@ -470,11 +478,11 @@ When the regressors $z_{2t}$ are *exogenous*, so that $E(z_{2t}z_{2t}^\top) =
 M_{z_2}(\beta) \equiv M$ does not depend on $\beta$, the verification of the
 boundary condition becomes routine.
 
-Let $H(\beta)$ be the mean-value slope of the small-ODE drift, i.e. the matrix
+Let $H(\beta)$ be the mean-value slope of $T$, i.e. the matrix
 satisfying
 
 $$
-\operatorname{col}\{[T(\beta)-\beta]-[T(\beta_f)-\beta_f]\}
+\operatorname{col}\{T(\beta)-T(\beta_f)\}
 =
 H(\beta)\operatorname{col}(\beta-\beta_f).
 $$ (eq:corollary2_cond)
@@ -485,7 +493,8 @@ $$ (eq:corollary2_cond)
 Consider the algorithm defined by {eq}`eq:rls_beta`–{eq}`eq:rls_R` with
 projection rule {eq}`eq:projection`. Choose $0 < K' < K < \infty$ and assume
 
-1. (A.1)–(A.5) hold;
+1. {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`, {prf:ref}`ass-ms-a3`,
+   {prf:ref}`ass-ms-a4`, and {prf:ref}`ass-ms-a5` hold;
 2. $z_{2t}$ is exogenous, so that $E(z_{2t}z_{2t}^\top) = M_{z_2}(\beta) \equiv M$;
 3. the small ODE $\dot\beta = T(\beta) - \beta$ is globally stable in
    $\mathbb{R}^{n_1 \times n_2}$;
@@ -504,189 +513,222 @@ rule retract to any value with $|\beta - \beta_f| \leq K'$.
 Then $\beta_t \to \beta_f$ almost surely.
 ```
 
-For the scalar linear examples below, condition (4) reduces to the familiar
-requirement that the slope of $T(\beta) - \beta$ be negative.
+For the scalar linear examples below, condition (4) reduces to checking that
+the feedback slope of $T$ is not too strong.
 
-For the first four examples below, $T$ is linear and $M_{z_2}$ is independent of
-$\beta$, so {prf:ref}`cor-ms-exogenous` reduces to checking stability of the
-small ODE.
+For the first four examples below, $T$ is linear and $M_{z_2}$ is independent
+of $\beta$, so {prf:ref}`cor-ms-exogenous` reduces the problem to the scalar
+ODE stability checks shown in the examples.
 
 ```{note}
-In the scalar self-referential examples studied here (Bray, Bray–Savin,
-present-value model), the state $z_{2t} = 1$ is a constant regressor, so
-$M_{z_2} = 1$ is trivially exogenous.
+In Bray's model, the state $z_{2t}=1$ is a constant regressor.
+
+In the Bray-Savin and present-value examples, $z_{2t}$ is stochastic but
+exogenous, so $M_{z_2}$ is still independent of beliefs.
 
 For the investment model with endogenous regressors, verifying the
 boundary condition on $D_1$ is much harder and may require numerical solution of
 the ODE on a grid of boundary points.
 ```
 
-#### Simulating the projection facility
+## Computational helpers
 
-The following code demonstrates the projection facility at work.
+We now work through five examples from {cite:t}`MarcetSargent1989jet`.
 
-We use Bray's
-model with $b = 0.6$ and deliberately start $\beta_0$ far from $\beta_f$,
-imposing a projection set $D_1 = \{|\beta| < K\}$ with $K = 5$.
+Before we start, we define helper functions for the scalar simulations.
 
-We track how
-often the facility is invoked and show that after a finite number of
-interventions, the path converges normally.
+The first covers Bray's constant-regressor model.
+
+The next two simulate the actual exogenous-regressor structures in the
+Bray-Savin and present-value examples.
+
+The fourth evaluates the small ODE {eq}`eq:small_ode` on the cumulative-gain
+time scale of RLS, and the last plots the learning drift $T(\beta)-\beta$.
 
 ```{code-cell} ipython3
----
-mystnb:
-  figure:
-    caption: Projection facility
-    name: fig-projection-facility
----
-def simulate_rls_with_projection(T_map, σ_u, β0, K_proj,
-                                 T_periods=500, N_paths=50, seed=0):
+def simulate_rls_scalar(T_map, σ_u, β0, T_periods=500, N_paths=100,
+                        a_seq=None, seed=0):
     """
-    Simulate RLS with a projection facility that retracts β_t to 0
-    whenever the update would push it outside [-K_proj, K_proj].
+    Simulate the RLS recursion for z1_t = T(β_t) + u_t, z2_t = 1.
+    Returns an (N_paths, T_periods) array of belief paths.
     """
     rng = np.random.default_rng(seed)
+    if a_seq is None:
+        a_seq = np.ones(T_periods)
+
     β_paths = np.empty((N_paths, T_periods))
-    n_projections = np.zeros(N_paths, dtype=int)
-    last_proj = np.full(N_paths, -1, dtype=int)
 
     for i in range(N_paths):
         β = β0
         R = 1.0
 
         for t in range(T_periods):
+            α_t = a_seq[t]
+            z2 = 1.0
             u_t = rng.normal(0, σ_u)
-            z1 = T_map(β) + u_t
+            z1 = T_map(β) * z2 + u_t
 
-            R_new = R + (1.0 / (t + 1)) * (1.0 - R)
-            β_new = β + (1.0 / (t + 1)) / R_new * (z1 - β)
+            R = R + (α_t / (t + 1)) * (z2**2 - R / α_t)
+            R = max(R, 1e-8)
+            β = β + (α_t / (t + 1)) / R * z2 * (z1 - β * z2)
 
-            if abs(β_new) > K_proj:
-                β_new = 0.0
-                n_projections[i] += 1
-                last_proj[i] = t
+            β_paths[i, t] = β
+
+    return β_paths
+
+
+def simulate_bray_savin(m, a, σ_x, σ_u, β0, T_periods=500,
+                        N_paths=100, seed=0):
+    """
+    Simulate RLS for p_t = x_t (m + a β_t) + u_t.
+    Agents regress p_t on x_t.
+    """
+    rng = np.random.default_rng(seed)
+    β_paths = np.empty((N_paths, T_periods))
+
+    for i in range(N_paths):
+        β = β0
+        R = 1.0
+
+        for t in range(T_periods):
+            x_t = rng.normal(0.0, σ_x)
+            u_t = rng.normal(0.0, σ_u)
+            p_t = x_t * (m + a * β) + u_t
+            step = 1.0 / (t + 1)
 
-            β = β_new
-            R = max(R_new, 1e-8)
+            R = R + step * (x_t**2 - R)
+            R = max(R, 1e-8)
+            β = β + step / R * x_t * (p_t - β * x_t)
             β_paths[i, t] = β
 
-    first_proj_free = last_proj + 1
+    return β_paths
 
-    return β_paths, n_projections, first_proj_free
 
+def simulate_present_value_rls(λ, ρ, σ_ε, β0, T_periods=500,
+                               N_paths=100, seed=0):
+    """
+    Simulate RLS for y_t = (λ β_t + 1) x_t,
+    x_t = ρ x_{t-1} + ε_t, with regression of y_t on x_{t-1}.
+    """
+    rng = np.random.default_rng(seed)
+    β_paths = np.empty((N_paths, T_periods))
+    σ_x = σ_ε / np.sqrt(1 - ρ**2)
 
-a_bray_pf, b_bray_pf, σ_pf = 1.0, 0.6, 1.5
+    for i in range(N_paths):
+        β = β0
+        R = 1.0
+        x_lag = rng.normal(0.0, σ_x)
 
+        for t in range(T_periods):
+            ε_t = rng.normal(0.0, σ_ε)
+            x_t = ρ * x_lag + ε_t
+            y_t = (λ * β + 1.0) * x_t
+            step = 1.0 / (t + 1)
 
-def T_bray_pf(β):
-    return a_bray_pf + b_bray_pf * β
+            R = R + step * (x_lag**2 - R)
+            R = max(R, 1e-8)
+            β = β + step / R * x_lag * (y_t - β * x_lag)
+            β_paths[i, t] = β
+            x_lag = x_t
 
+    return β_paths
 
-β_f_pf = a_bray_pf / (1 - b_bray_pf)
-β0_far = 8.0
-K_pf = 5.0
-T_pf_sim = 600
-N_pf_sim = 80
 
-paths_pf, n_proj, first_free = simulate_rls_with_projection(
-    T_bray_pf, σ_pf, β0_far, K_pf,
-    T_periods=T_pf_sim, N_paths=N_pf_sim)
+def solve_ode_calendar(f_ode, β0, T_periods, a_seq=None):
+    """
+    Solve dβ/dτ = f_ode(β) and evaluate it at RLS cumulative-gain time.
+    """
+    if a_seq is None:
+        a_seq = np.ones(T_periods)
 
-paths_no_pf = simulate_rls_scalar(
-    T_bray_pf, σ_pf, β0_far,
-    T_periods=T_pf_sim, N_paths=N_pf_sim, seed=0)
+    periods = np.arange(T_periods + 1)
+    gains = a_seq / np.arange(1, T_periods + 1)
+    ode_time = np.concatenate(([0.0], np.cumsum(gains)))
+    sol = solve_ivp(
+        lambda τ, y: [f_ode(y[0])],
+        (0.0, ode_time[-1]),
+        [β0],
+        t_eval=ode_time,
+        method='RK45',
+        max_step=0.05
+    )
+    return periods, sol.y[0]
+
+
+def plot_scalar_drift(ax, β_grid, drift, β_f, color):
+    """Plot the one-dimensional learning drift T(β) - β."""
+    ax.plot(β_grid, drift, color=color, lw=2)
+    ax.axhline(0, color='black', lw=1.5)
+    ax.axvline(β_f, color='red', ls='--', lw=2,
+               label=f'$\\beta_f = {β_f:.2f}$')
+    ax.fill_between(β_grid, drift, 0, where=(drift > 0),
+                    color=color, alpha=0.12)
+    ax.fill_between(β_grid, drift, 0, where=(drift < 0),
+                    color=color, alpha=0.12)
 
-fig = plt.figure(figsize=(15, 10))
-gs = GridSpec(2, 2, figure=fig)
+    for β_arrow in np.linspace(β_grid[20], β_grid[-20], 7):
+        dβ = np.interp(β_arrow, β_grid, drift)
+        if abs(dβ) > 1e-10:
+            ax.annotate(
+                '', xy=(β_arrow + 0.25 * np.sign(dβ), 0),
+                xytext=(β_arrow, 0),
+                arrowprops=dict(arrowstyle='->', color=color, lw=1.8)
+            )
 
-ax1 = fig.add_subplot(gs[0, 0])
-for i in range(min(30, N_pf_sim)):
-    ax1.plot(paths_pf[i], color='steelblue', alpha=0.25, lw=2)
-ax1.plot(np.mean(paths_pf, axis=0), color='navy', lw=2, label='average')
-ax1.axhline(β_f_pf, color='red', ls='--', lw=2,
-            label=f'$\\beta_f={β_f_pf:.1f}$')
-ax1.axhline(
-    K_pf, color='gray', ls=':', lw=2,
-    label=f'$D_1$ boundary ($K={K_pf}$)'
-)
-ax1.axhline(-K_pf, color='gray', ls=':', lw=2)
-ax1.set_xlabel('$t$')
-ax1.set_ylabel('$\\beta_t$')
-ax1.legend(fontsize=8)
-
-ax2 = fig.add_subplot(gs[0, 1])
-for i in range(min(30, N_pf_sim)):
-    ax2.plot(paths_no_pf[i], color='darkorange', alpha=0.25, lw=2)
-ax2.plot(
-    np.mean(paths_no_pf, axis=0), color='saddlebrown', lw=2,
-    label='average'
-)
-ax2.axhline(β_f_pf, color='red', ls='--', lw=2,
-            label=f'$\\beta_f={β_f_pf:.1f}$')
-ax2.set_xlabel('$t$')
-ax2.set_ylabel('$\\beta_t$')
-ax2.legend(fontsize=8)
-
-ax3 = fig.add_subplot(gs[1, 0])
-ax3.hist(n_proj, bins=range(0, int(n_proj.max()) + 2),
-         color='steelblue', edgecolor='white', alpha=0.8)
-ax3.set_xlabel('number of projections invoked')
-ax3.set_ylabel('number of paths')
-
-ax4 = fig.add_subplot(gs[1, 1])
-ax4.hist(first_free[n_proj > 0], bins=20,
-         color='darkorange', edgecolor='white', alpha=0.8)
-ax4.set_xlabel('last period with a projection')
-ax4.set_ylabel('number of paths')
+    ax.set_xlabel('$\\beta$')
+    ax.set_ylabel('$T(\\beta) - \\beta$')
+    ax.legend(fontsize=9)
 
-plt.tight_layout()
-plt.show()
 
-print(f"Paths with at least one projection: {(n_proj > 0).sum()} / {N_pf_sim}")
-print(f"Mean number of projections per path: {n_proj.mean():.2f}")
-print(f"Max number of projections:           {n_proj.max()}")
-print(
-    "Mean last-projection period:         "
-    f"{first_free[n_proj > 0].mean():.1f}"
-)
+T_sim = 400
+N_sim = 80
 ```
 
-The simulation illustrates the key theoretical point from
-{prf:ref}`cor-ms-projection-dichotomy`: the projection is invoked only a
-*finite number of times* on almost every sample path.
+Each substantive learning example follows the same template.
+
+1. Write down the economic equations that determine the equilibrium.
+2. Substitute the perceived law of motion for any expectational variables to obtain the actual law of motion as a function of $\beta$.
+3. Read off the operator $T$ and the REE $\beta_f = T(\beta_f)$.
+4. Check E-stability by computing $\mathcal M = dT/d\beta - I$ at $\beta_f$.
+5. Plot the RLS paths, the small ODE evaluated at cumulative-gain time, and the drift $T(\beta)-\beta$.
 
-After the last invocation the algorithm runs as unconstrained RLS and
-converges to $\beta_f$ at the usual rate.
+In each case, $\beta_t$ is the law of motion agents currently use when making decisions, while $T(\beta_t)$ is the law of motion their decisions actually produce.
 
-The projection does not bias the
-asymptotic estimate — it merely provides the boundedness guarantee that Ljung's
-theorem requires.
+Least-squares learning asks whether repeated observations move $\beta_t$ toward a fixed point of this map.
 
-## Five illustrative examples
+The first example is the control case with no self-referential feedback.
 
-We now work through five examples from {cite:t}`MarcetSargent1989jet`,
-computing the ODE, finding the REE, checking E-stability, and simulating the RLS
-learning path.
+For the remaining scalar examples, the simulation figure appears immediately
+after the model description.
 
-### Example 1: ordinary linear stochastic difference equations
+## Example 1: Ordinary linear stochastic difference equations
 
 The first example has no self-referential component.
 
 Let the actual law of motion be fixed, with $T(\beta)=\Gamma$ for a stable
 matrix $\Gamma$ and with $V(\beta)=I$.
 
+Economically, this is the control case.
+
+Agents are simply estimating a stable data-generating process that is already
+there.
+
+Their beliefs do not feed back into prices, quantities, or future data.
+
 The REE is $\beta_f=\Gamma$.
 
-Since $T$ is constant, $H(\beta)=-I$ and the small ODE is globally stable.
+Since $T$ is constant, $H(\beta)=0$ in {eq}`eq:corollary2_cond`.
+
+The small-ODE Jacobian is $\mathcal{M}=-I$, so the small ODE is globally
+stable.
 
 {prf:ref}`cor-ms-exogenous` then implies that recursive least squares converges almost surely
 to the true law of motion.
 
-This benchmark shows that the Marcet-Sargent machinery nests ordinary strong
+This case shows that the Marcet-Sargent machinery nests ordinary strong
 consistency of least squares for stable linear stochastic difference equations.
 
-### Example 2: Bray's cobweb model
+## Example 2: Bray's cobweb model
 
 {cite:t}`Bray1982` studied a simple cobweb economy in which the equilibrium price
 satisfies
@@ -699,10 +741,23 @@ where $\beta_t$ is agents' OLS estimate of the price (their point forecast of
 $p_t$), and $\tilde{u}_t$ is i.i.d. noise with mean zero and variance
 $\sigma_u^2$.
 
+Here the forecast itself is a state variable for the economy.
+
+If producers expect a high price, their current supply decisions alter the
+market-clearing price.
+
+The parameter $b$ measures the strength of this expectational feedback.
+
+When $b < 1$, the actual price moves less than one-for-one with the forecast,
+so least squares has a force pushing beliefs back toward the fixed point.
+
+When $b > 1$, the feedback is too strong and the same learning rule moves
+beliefs away from the REE.
+
 The mapping $T$ is simply $T(\beta) = a + b\beta$.  The REE is
 
 $$
-\beta_f = \frac{a}{1 - b} , \quad |b| < 1 .
+\beta_f = \frac{a}{1 - b} , \quad b \neq 1 .
 $$ (eq:bray_ree)
 
 The small ODE is
@@ -714,158 +769,220 @@ $$ (eq:bray_ode)
 which has the unique fixed point $\beta_f = a/(1-b)$.
 
 Its Jacobian is
-$\mathcal{M} = b - 1 < 0$ when $|b| < 1$, so the REE is E-stable and RLS
-converges almost surely.
+$\mathcal{M} = b - 1 < 0$ when $b < 1$.
+
+Under the boundedness and domain conditions above, RLS then converges almost
+surely.
 
 When $b > 1$, $\mathcal{M} > 0$ and convergence fails.
 
-### Example 3: Bray–Savin supply-shifter model
+The code below sets $a=1$ and $b=0.6$.
 
-{cite:t}`BraySavin1984` studied a model where
+The rational expectations price forecast is then $\beta_f=2.5$.
 
-$$
-p_t = x_t^\top(m + a\beta_{t-1}) + \tilde{u}_t , \quad p_t^e = x_t^\top\beta_{t-1} ,
-$$ (eq:bs_price)
+The three panels show the noisy RLS paths, the small-ODE approximation, and
+the learning drift $T(\beta)-\beta$.
 
-with $x_t$ an exogenous supply-shifter, $a$ a scalar feedback parameter, and
-agents running an OLS regression of $p$ on $x$.
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Bray learning dynamics
+    name: fig-bray-learning-dynamics
+---
+a_bray, b_bray, σ_bray = 1.0, 0.6, 1.0
 
-The mapping is $T(\beta) = m + a\beta$ (scalar case), giving
 
-$$
-\dot\beta = (a-1)\beta + m , \quad \beta_f = \frac{m}{1-a} ,
-$$ (eq:bs_ode)
+def T_bray(β):
+    return a_bray + b_bray * β
 
-with Jacobian $\mathcal{M} = a - 1 < 0$ iff $a < 1$.
 
-### Example 4: Hyperinflation / asset prices (Fourgeaud–Gourieroux–Pradel)
+β_f_bray = a_bray / (1 - b_bray)
 
-Consider the present-value asset pricing model
+β0_bray = 0.0
 
-$$
-y_t = \lambda E_t y_{t+1} + x_t , \quad x_t = \rho x_{t-1} + \varepsilon_t ,
-$$ (eq:pv_model)
+β_paths_bray = simulate_rls_scalar(
+    T_bray, σ_bray, β0_bray,
+    T_periods=T_sim, N_paths=N_sim
+)
 
-where $|\lambda| < 1$, $|\rho| < 1$, and agents perceive $y_t = \beta_t x_{t-1}+ v_t$.
- 
-The mapping is $T(\beta) = (\lambda\beta + 1)\rho$ and the REE is
 
-$$
-\beta_f = \frac{\rho}{1 - \lambda\rho} .
-$$ (eq:pv_ree)
+def ode_bray(β):
+    return a_bray + b_bray * β - β
 
-The small ODE is
 
-$$
-\dot\beta = (\lambda\rho - 1)\beta + \rho ,
-$$ (eq:pv_ode)
+t_ode, sol_low = solve_ode_calendar(ode_bray, 0.0, T_sim)
+_, sol_high = solve_ode_calendar(ode_bray, 4.5, T_sim)
 
-with Jacobian $\mathcal{M} = \lambda\rho - 1 < 0$ for $|\lambda\rho| < 1$, so
-convergence is guaranteed.
+β_grid_bray = np.linspace(-0.5, 5.0, 300)
+drift_bray = np.array([ode_bray(b) for b in β_grid_bray])
 
-### Example 5: Investment under uncertainty (self-referential with endogenous regressors)
+fig, axes = plt.subplots(1, 3, figsize=(15, 4.8))
 
-In Sargent's version of the Lucas–Prescott investment model, agents learn about the
-aggregate capital stock $K_t$ by regressing on $(K_{t-1}, w_{t-1})$ where $w_t$
-is an exogenous cost shock.
+ax = axes[0]
+for i in range(min(30, N_sim)):
+    ax.plot(β_paths_bray[i], color='steelblue', alpha=0.25, lw=2)
+ax.plot(np.mean(β_paths_bray, axis=0), color='navy', lw=2,
+        label='cross-path average')
+ax.axhline(β_f_bray, color='red', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_bray:.2f}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta_t$')
+ax.legend()
 
-The perceived law of motion is
+ax = axes[1]
+ax.plot(
+    t_ode, sol_low, color='steelblue', lw=2,
+    label='ODE at $\\tau_t$, $\\beta_0=0$'
+)
+ax.plot(
+    t_ode, sol_high, color='darkorange', lw=2,
+    label='ODE at $\\tau_t$, $\\beta_0=4.5$'
+)
+ax.axhline(β_f_bray, color='red', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_bray:.2f}$')
+ax.set_xlabel('$t$')
+ax.set_ylabel('$\\beta(\\tau_t)$')
+ax.legend()
 
-$$
-K_t = \beta_1 K_{t-1} + \beta_2 w_{t-1} + \eta_t ,
-$$
+plot_scalar_drift(
+    axes[2], β_grid_bray, drift_bray, β_f_bray, 'steelblue'
+)
 
-while the actual law (from firms' optimal investment decisions and market clearing) is
+plt.tight_layout()
+plt.show()
+print(f"REE: β_f = a/(1-b) = {β_f_bray:.4f}")
+print(f"Jacobian M = b - 1 = {b_bray - 1:.4f}  (< 0: E-stable)")
+```
 
-$$
-K_t = T_1(\beta) K_{t-1} + T_2(\beta) w_{t-1} + V(\beta) u_t ,
-$$ (eq:inv_actual)
+The stochastic paths are noisy because prices contain $\tilde u_t$.
 
-where the nonlinear mappings $T_1, T_2$ come from solving the firms' linear
-quadratic control problems.
+The average path and the ODE paths remove most of that noise and reveal the
+mean learning direction.
 
-The small ODE decomposes as:
+## Counterexample: Unstable Bray feedback
 
-$$
-\dot\beta_1 = T_1(\beta_1) - \beta_1 , \quad
-\dot\beta_2 = T_2(\beta_1, \beta_2) - \beta_2 ,
-$$ (eq:inv_ode)
+To see what happens when E-stability is violated, repeat Bray's model with
+$b > 1$.
 
-and E-stability can be verified analytically for $|\beta_1| < b^{-1/2}$ (where
-$b$ is the discount factor).
+This case represents an economy where expectations affect actual prices more
+than one-for-one.
+
+If agents start with a forecast that is too high, the market outcome induced by
+that forecast is higher still, so least squares raises the forecast.
 
-## Simulating the learning dynamics
+The same logic works in reverse below the REE.
 
-We now simulate the self-referential examples numerically, plotting both the ODE
-solution (continuous-time approximation) and the sample paths of $\beta_t$ under RLS.
+The REE exists, but it is not learnable by this adaptive rule.
 
-### Bray's model
+The same three diagnostic panels now show how beliefs are pushed away from the REE rather than toward it, and the drift panel shows that the learning force points away from the fixed point on both sides.
 
 ```{code-cell} ipython3
 ---
 mystnb:
   figure:
-    caption: Bray learning dynamics
-    name: fig-bray-learning-dynamics
+    caption: Unstable Bray dynamics
+    name: fig-unstable-bray-dynamics
 ---
-a_bray, b_bray, σ_bray = 1.0, 0.6, 1.0
+b_unstable = 1.4
 
 
-def T_bray(β):
-    return a_bray + b_bray * β
+def T_unstable(β):
+    return a_bray + b_unstable * β
 
 
-β_f_bray = a_bray / (1 - b_bray)
+β_f_unstable = a_bray / (1 - b_unstable)
 
-β0_bray = 0.0
-T_sim = 400
-N_sim = 80
+β_paths_unstable = simulate_rls_scalar(
+    T_unstable, σ_bray, β0=0.0,
+    T_periods=200, N_paths=50
+)
 
-β_paths_bray = simulate_rls_scalar(T_bray, σ_bray, β0_bray,
-                                      T_periods=T_sim, N_paths=N_sim)
 
-def ode_bray(β):
-    return a_bray + b_bray * β - β
+def ode_unstable(β):
+    return T_unstable(β) - β
 
 
-t_ode, sol_low = solve_ode(ode_bray, 0.0)
-_, sol_high = solve_ode(ode_bray, 4.5)
+β_grid = np.linspace(-5, 5, 300)
+drift = np.array([ode_unstable(b) for b in β_grid])
 
-fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+t_ode_un, sol_un_low = solve_ode_calendar(ode_unstable, -4.0, 200)
+_, sol_un_high = solve_ode_calendar(ode_unstable, 0.0, 200)
+
+fig, axes = plt.subplots(1, 3, figsize=(15, 4.8))
 
 ax = axes[0]
-for i in range(min(30, N_sim)):
-    ax.plot(β_paths_bray[i], color='steelblue', alpha=0.25, lw=2)
-ax.plot(np.mean(β_paths_bray, axis=0), color='navy', lw=2,
-        label='cross-path average')
-ax.axhline(β_f_bray, color='red', ls='--', lw=2,
-           label=f'$\\beta_f = {β_f_bray:.2f}$')
+for i in range(min(30, 50)):
+    ax.plot(β_paths_unstable[i], color='crimson', alpha=0.3, lw=2)
+ax.axhline(β_f_unstable, color='black', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_unstable:.2f}$ (unstable)')
 ax.set_xlabel('$t$')
 ax.set_ylabel('$\\beta_t$')
 ax.legend()
 
 ax = axes[1]
 ax.plot(
-    t_ode, sol_low, color='steelblue', lw=2,
-    label='ODE from $\\beta_0=0$'
+    t_ode_un, sol_un_low, color='crimson', lw=2,
+    label='ODE at $\\tau_t$, $\\beta_0=-4$'
 )
 ax.plot(
-    t_ode, sol_high, color='darkorange', lw=2,
-    label='ODE from $\\beta_0=4.5$'
+    t_ode_un, sol_un_high, color='steelblue', lw=2,
+    label='ODE at $\\tau_t$, $\\beta_0=0$'
 )
-ax.axhline(β_f_bray, color='red', ls='--', lw=2,
-           label=f'$\\beta_f = {β_f_bray:.2f}$')
+ax.axhline(β_f_unstable, color='black', ls='--', lw=2,
+           label=f'$\\beta_f = {β_f_unstable:.2f}$')
 ax.set_xlabel('$t$')
-ax.set_ylabel('$\\beta(t)$')
+ax.set_ylabel('$\\beta(\\tau_t)$')
 ax.legend()
 
+plot_scalar_drift(
+    axes[2], β_grid, drift, β_f_unstable, 'crimson'
+)
+
 plt.tight_layout()
 plt.show()
-print(f"REE: β_f = a/(1-b) = {β_f_bray:.4f}")
-print(f"Jacobian M = b - 1 = {b_bray - 1:.4f}  (< 0: E-stable)")
+print(f"Jacobian M = b - 1 = {b_unstable - 1:.2f}  (> 0: NOT E-stable)")
 ```
 
-### Bray–Savin model
+## Example 3: Bray-Savin supply-shifter model
+
+{cite:t}`BraySavin1984` studied a model where
+
+$$
+p_t = x_t^\top(m + a\beta_{t-1}) + \tilde{u}_t , \quad p_t^e = x_t^\top\beta_{t-1} ,
+$$ (eq:bs_price)
+
+with $x_t$ an exogenous supply-shifter, $a$ a scalar feedback parameter, and
+agents running an OLS regression of $p$ on $x$.
+
+This is the same learning problem with one extra economic ingredient:
+expectations are conditional on an observed shifter.
+
+Agents do not merely forecast the average price.
+
+They estimate how price responds to $x_t$.
+
+The actual coefficient on $x_t$ is $m+a\beta$, so the perceived slope changes
+the true slope generated by the market.
+
+If $a < 1$, a mistaken slope feeds back into the actual price equation with
+attenuation, and the least-squares estimate is pulled toward the REE slope.
+
+The mapping is $T(\beta) = m + a\beta$ (scalar case), giving
+
+$$
+\dot\beta = (a-1)\beta + m , \quad \beta_f = \frac{m}{1-a} ,
+$$ (eq:bs_ode)
+
+with Jacobian $\mathcal{M} = a - 1 < 0$ iff $a < 1$.
+
+The simulation below draws the exogenous shifter $x_t$ and lets agents update
+the slope in the regression of $p_t$ on $x_t$.
+
+It uses $a=0.7$.
+
+The drift panel shows that mistaken slopes are pushed back toward the REE
+slope rather than amplified.
 
 ```{code-cell} ipython3
 ---
@@ -874,7 +991,8 @@ mystnb:
     caption: Bray-Savin learning dynamics
     name: fig-bray-savin-learning-dynamics
 ---
-m_bs, a_bs, σ_bs = 0.5, 0.7, 1.0
+m_bs, a_bs = 0.5, 0.7
+σ_x_bs, σ_u_bs = 1.0, 1.0
 
 
 def T_bs(β):
@@ -883,17 +1001,23 @@ def T_bs(β):
 
 β_f_bs = m_bs / (1 - a_bs)
 
-β_paths_bs = simulate_rls_scalar(T_bs, σ_bs, 0.0,
-                                    T_periods=T_sim, N_paths=N_sim)
+β_paths_bs = simulate_bray_savin(
+    m_bs, a_bs, σ_x_bs, σ_u_bs, 0.0,
+    T_periods=T_sim, N_paths=N_sim
+)
+
 
 def ode_bs(β):
     return T_bs(β) - β
 
 
-t_ode_bs, sol_bs_low = solve_ode(ode_bs, 0.0)
-_, sol_bs_high = solve_ode(ode_bs, 4.0)
+t_ode_bs, sol_bs_low = solve_ode_calendar(ode_bs, 0.0, T_sim)
+_, sol_bs_high = solve_ode_calendar(ode_bs, 4.0, T_sim)
 
-fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+β_grid_bs = np.linspace(-0.5, 4.0, 300)
+drift_bs = np.array([ode_bs(b) for b in β_grid_bs])
+
+fig, axes = plt.subplots(1, 3, figsize=(15, 4.8))
 
 ax = axes[0]
 for i in range(min(30, N_sim)):
@@ -909,25 +1033,86 @@ ax.legend()
 ax = axes[1]
 ax.plot(
     t_ode_bs, sol_bs_low, color='darkorange', lw=2,
-    label='ODE from $\\beta_0=0$'
+    label='ODE at $\\tau_t$, $\\beta_0=0$'
 )
 ax.plot(
     t_ode_bs, sol_bs_high, color='steelblue', lw=2,
-    label='ODE from $\\beta_0=4$'
+    label='ODE at $\\tau_t$, $\\beta_0=4$'
 )
 ax.axhline(β_f_bs, color='red', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_bs:.2f}$')
 ax.set_xlabel('$t$')
-ax.set_ylabel('$\\beta(t)$')
+ax.set_ylabel('$\\beta(\\tau_t)$')
 ax.legend()
 
+plot_scalar_drift(
+    axes[2], β_grid_bs, drift_bs, β_f_bs, 'darkorange'
+)
+
 plt.tight_layout()
 plt.show()
 print(f"REE: β_f = m/(1-a) = {β_f_bs:.4f}")
 print(f"Jacobian M = a - 1 = {a_bs - 1:.4f}  (< 0: E-stable)")
 ```
 
-### Present-value / hyperinflation model
+The interpretation is parallel to Bray's cobweb model.
+
+The difference is that agents are now learning how prices co-move with an
+exogenous variable, not just learning an unconditional mean.
+
+## Example 4: Hyperinflation / asset prices
+
+Consider the present-value asset pricing model
+
+$$
+y_t = \lambda E_t y_{t+1} + x_t , \quad x_t = \rho x_{t-1} + \varepsilon_t ,
+$$ (eq:pv_model)
+
+where $|\lambda| < 1$, $|\rho| < 1$, and agents perceive $y_t = \beta_t x_{t-1}+ v_t$.
+
+Under the perceived law, $E_t y_{t+1} = \beta_t x_t$, so {eq}`eq:pv_model` becomes
+
+$$
+y_t = (\lambda \beta_t + 1) x_t = (\lambda \beta_t + 1)\rho x_{t-1} + (\lambda \beta_t + 1)\varepsilon_t.
+$$
+
+The coefficient on $x_{t-1}$ in the actual law of motion is therefore $(\lambda \beta_t + 1)\rho$, which gives the operator $T$ below.
+
+If $y_t$ is a price level, the equation resembles a Cagan-style hyperinflation model.
+
+If $y_t$ is an asset price and $x_t$ is a dividend or fundamental, it is a present-value relation.
+
+In both cases, beliefs about the future value of $y$ enter the current value of $y$.
+
+The product $\lambda\rho$ measures the strength of the forward-looking feedback: $\lambda$ discounts the future and $\rho$ measures how persistent the fundamental is.
+
+When $|\lambda\rho|<1$, a mistaken perceived coefficient does not amplify without bound.
+
+The mapping is $T(\beta) = (\lambda\beta + 1)\rho$ and the REE is
+
+$$
+\beta_f = \frac{\rho}{1 - \lambda\rho} .
+$$ (eq:pv_ree)
+
+The small ODE is
+
+$$
+\dot\beta = (\lambda\rho - 1)\beta + \rho ,
+$$ (eq:pv_ode)
+
+with Jacobian $\mathcal{M} = \lambda\rho - 1 < 0$ for $|\lambda\rho| < 1$, so
+the small ODE is stable.
+
+The convergence theorem then applies under the exogenous-regressor conditions
+above.
+
+The simulation uses $\lambda=0.8$ and $\rho=0.9$, so $\lambda\rho=0.72$.
+
+This means the feedback from expectations is strong enough to matter but still
+less than one.
+
+The code simulates the autoregressive fundamental $x_t$ and updates an OLS
+regression of $y_t$ on $x_{t-1}$.
 
 ```{code-cell} ipython3
 ---
@@ -945,17 +1130,23 @@ def T_pv(β):
 
 β_f_pv = ρ_pv / (1 - λ * ρ_pv)
 
-β_paths_pv = simulate_rls_scalar(T_pv, σ_pv, 0.0,
-                                    T_periods=T_sim, N_paths=N_sim)
+β_paths_pv = simulate_present_value_rls(
+    λ, ρ_pv, σ_pv, 0.0,
+    T_periods=T_sim, N_paths=N_sim
+)
+
 
 def ode_pv(β):
     return T_pv(β) - β
 
 
-t_ode_pv, sol_pv_low = solve_ode(ode_pv, 0.0, t_span=(0, 50))
-_, sol_pv_high = solve_ode(ode_pv, 10.0, t_span=(0, 50))
+t_ode_pv, sol_pv_low = solve_ode_calendar(ode_pv, 0.0, T_sim)
+_, sol_pv_high = solve_ode_calendar(ode_pv, 10.0, T_sim)
+
+β_grid_pv = np.linspace(-1.0, 7.0, 300)
+drift_pv = np.array([ode_pv(b) for b in β_grid_pv])
 
-fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+fig, axes = plt.subplots(1, 3, figsize=(15, 4.8))
 
 ax = axes[0]
 for i in range(min(30, N_sim)):
@@ -971,159 +1162,95 @@ ax.legend()
 ax = axes[1]
 ax.plot(
     t_ode_pv, sol_pv_low, color='seagreen', lw=2,
-    label='ODE from $\\beta_0=0$'
+    label='ODE at $\\tau_t$, $\\beta_0=0$'
 )
 ax.plot(
     t_ode_pv, sol_pv_high, color='steelblue', lw=2,
-    label='ODE from $\\beta_0=10$'
+    label='ODE at $\\tau_t$, $\\beta_0=10$'
 )
 ax.axhline(β_f_pv, color='red', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_pv:.2f}$')
 ax.set_xlabel('$t$')
-ax.set_ylabel('$\\beta(t)$')
+ax.set_ylabel('$\\beta(\\tau_t)$')
 ax.legend()
 
+plot_scalar_drift(
+    axes[2], β_grid_pv, drift_pv, β_f_pv, 'seagreen'
+)
+
 plt.tight_layout()
 plt.show()
 print(f"REE: β_f = ρ/(1 - lambda*ρ) = {β_f_pv:.4f}")
 print(f"Jacobian M = lambda*ρ - 1 = {λ*ρ_pv - 1:.4f}  (< 0: E-stable)")
 ```
 
-### Instability when E-stability fails
+The REE coefficient is larger than $\rho$ because current prices capitalize
+expected future fundamentals.
 
-To see what happens when E-stability is violated, we repeat Bray's model with $b > 1$.
+Learning converges because each revision in $\beta_t$ changes expectations,
+but the discounted persistence term $\lambda\rho$ prevents revisions from
+feeding back explosively.
 
-```{code-cell} ipython3
----
-mystnb:
-  figure:
-    caption: Unstable Bray dynamics
-    name: fig-unstable-bray-dynamics
----
-b_unstable = 1.4
+## Example 5: Investment under uncertainty
 
+Let's now consider a version of the Lucas–Prescott investment model in which agents learn about the
+aggregate capital stock $K_t$ by regressing it on $(K_{t-1}, w_{t-1})$, where $w_t$
+is an exogenous cost shock.
 
-def T_unstable(β):
-    return a_bray + b_unstable * β
 
+Firms choose investment using a perceived law of motion for aggregate capital.
 
-β_f_unstable = a_bray / (1 - b_unstable)
+Those investment choices then determine the actual law of motion for aggregate
+capital through market clearing.
 
-β_paths_unstable = simulate_rls_scalar(
-    T_unstable, σ_bray, β0=0.0,
-    T_periods=200, N_paths=50)
+Learning is therefore about a state variable that agents themselves help
+create through their investment decisions.
 
-def ode_unstable(β):
-    return T_unstable(β) - β
+The perceived law of motion is
 
+$$
+K_t = \beta_1 K_{t-1} + \beta_2 w_{t-1} + \eta_t ,
+$$
 
+while the actual law (from firms' optimal investment decisions and market clearing) is
 
-β_grid = np.linspace(-5, 5, 300)
-drift = np.array([ode_unstable(b) for b in β_grid])
+$$
+K_t = T_1(\beta) K_{t-1} + T_2(\beta) w_{t-1} + V(\beta) u_t ,
+$$ (eq:inv_actual)
 
-fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+where the nonlinear mappings $T_1, T_2$ come from solving the firms' linear
+quadratic control problems.
 
-ax = axes[0]
-for i in range(min(30, 50)):
-    ax.plot(β_paths_unstable[i], color='crimson', alpha=0.3, lw=2)
-ax.axhline(β_f_unstable, color='black', ls='--', lw=2,
-           label=f'$\\beta_f = {β_f_unstable:.2f}$ (unstable)')
-ax.set_xlabel('$t$')
-ax.set_ylabel('$\\beta_t$')
-ax.legend()
+The two coefficients have different economic roles.
 
-ax = axes[1]
-ax.plot(β_grid, drift, color='crimson', lw=2)
-ax.axhline(0, color='black', lw=2)
-ax.axvline(β_f_unstable, color='black', ls='--', lw=2,
-           label=f'$\\beta_f = {β_f_unstable:.2f}$')
-ax.fill_between(β_grid, drift, 0,
-                where=(drift > 0), color='crimson', alpha=0.15)
-ax.fill_between(β_grid, drift, 0,
-                where=(drift < 0), color='steelblue', alpha=0.15)
-ax.set_xlabel('$\\beta$')
-ax.set_ylabel('$T(\\beta) - \\beta$')
-ax.legend()
+$\beta_1$ describes persistence in aggregate capital, while $\beta_2$ describes
+how the exogenous shock $w_t$ affects next period's capital.
 
-plt.tight_layout()
-plt.show()
-print(f"Jacobian M = b - 1 = {b_unstable - 1:.2f}  (> 0: NOT E-stable)")
-```
+Because the optimal investment rule depends on the whole perceived law of
+motion, the map $T$ is nonlinear.
 
-## Phase diagrams and E-stability
+This is why the larger ODE is harder to analyze here than in the scalar
+examples with exogenous regressors.
 
-The E-stability condition has a clean geometric interpretation.
+The small ODE decomposes as:
 
-At the REE
-$\beta_f$, the small ODE {eq}`eq:small_ode` must have trajectories *pointing
-inward*.
+$$
+\dot\beta_1 = T_1(\beta_1) - \beta_1 , \quad
+\dot\beta_2 = T_2(\beta_1, \beta_2) - \beta_2 ,
+$$ (eq:inv_ode)
 
-This requires the slope $dT/d\beta - 1$ to be *negative* at $\beta_f$.
+and E-stability can be verified analytically for $|\beta_1| < b^{-1/2}$ (where
+$b$ is the discount factor).
 
-The figure below plots the phase diagrams for all three scalar examples side by
-side.
 
-```{code-cell} ipython3
----
-mystnb:
-  figure:
-    caption: Scalar phase diagrams
-    name: fig-scalar-phase-diagrams
----
-β_vec = np.linspace(-1.0, 5.5, 400)
-
-models = [
-    (
-        "Bray ($b=0.6$)",
-        lambda b: a_bray + 0.6 * b - b,
-        a_bray / (1 - 0.6),
-        'steelblue'
-    ),
-    (
-        "Bray–Savin ($a=0.7$)",
-        lambda b: m_bs + 0.7 * b - b,
-        m_bs / (1 - 0.7),
-        'darkorange'
-    ),
-    (
-        "Present-value",
-        lambda b: T_pv(b) - b,
-        β_f_pv,
-        'seagreen'
-    ),
-]
-
-fig, axes = plt.subplots(1, 3, figsize=(15, 5))
-
-for ax, (name, ode_fn, bf, color) in zip(axes, models):
-    drift = np.array([ode_fn(b) for b in β_vec])
-    ax.plot(β_vec, drift, color=color, lw=2, label=name)
-    ax.axhline(0, color='black', lw=2)
-    ax.axvline(bf, color='red', ls='--', lw=2, label=f'$\\beta_f={bf:.2f}$')
-    ax.fill_between(β_vec, drift, 0, where=(drift > 0),
-                    color=color, alpha=0.12)
-    ax.fill_between(β_vec, drift, 0, where=(drift < 0),
-                    color=color, alpha=0.12)
-    for bv in np.linspace(β_vec[20], β_vec[-20], 7):
-        d = ode_fn(bv)
-        ax.annotate(
-            '', xy=(bv + 0.3 * np.sign(d), 0),
-            xytext=(bv, 0),
-            arrowprops=dict(arrowstyle='->', color=color, lw=2)
-        )
-    ax.set_xlabel('$\\beta$')
-    ax.set_ylabel('$T(\\beta) - \\beta$')
-    ax.legend(fontsize=9)
+The phase portrait below plots the vector field $T(\beta)-\beta$ over
+$(\beta_1,\beta_2)$ space.
 
-plt.tight_layout()
-plt.show()
-```
+Each arrow shows how the perceived law of motion would be revised if the
+economy were currently operating under that belief.
 
-## Two-dimensional example: the investment model
-
-The investment-under-uncertainty example is two-dimensional and highlights how
-E-stability of the composite map $T(\beta) = (T_1(\beta_1), T_2(\beta_1, \beta_2))$
-works when the ODE is recursive.
+The plotted trajectories are deterministic ODE paths, not noisy least-squares
+sample paths.
 
 ```{code-cell} ipython3
 ---
@@ -1148,6 +1275,7 @@ def ode_invest(t, β, **kwargs):
 
 
 params = dict(b=0.95, d=1.0, f=1.0, A1=1.0, N=1.0, ρ_w=0.5)
+β1_bound = params["b"] ** (-0.5)
 β_f_inv = fsolve(lambda b: T_invest(b, **params) - b, [0.5, 0.1])
 print(f"REE: β_f = {β_f_inv}")
 
@@ -1167,7 +1295,7 @@ print(f"E-stable: {np.all(eigs.real < 0)}")
 
 fig, ax = plt.subplots(figsize=(8, 6))
 
-b1_grid = np.linspace(-0.1, 1.2, 20)
+b1_grid = np.linspace(-0.1, min(1.0, β1_bound - 1e-3), 20)
 b2_grid = np.linspace(-0.8, 0.5, 20)
 B1, B2 = np.meshgrid(b1_grid, b2_grid)
 U = np.zeros_like(B1)
@@ -1184,7 +1312,7 @@ speed[speed == 0] = 1e-8
 ax.streamplot(b1_grid, b2_grid, U, V_field, color=speed,
               cmap='Blues', density=1.3, linewidth=1)
 
-starts = [(0.1, 0.0), (0.9, 0.4), (1.1, -0.6), (0.3, -0.7)]
+starts = [(0.1, 0.0), (0.9, 0.4), (0.98, -0.6), (0.3, -0.7)]
 colors_traj = ['red', 'darkorange', 'green', 'purple']
 for (b10, b20), col in zip(starts, colors_traj):
     sol = solve_ivp(lambda t, β: ode_invest(t, β, **params),
@@ -1202,49 +1330,20 @@ plt.tight_layout()
 plt.show()
 ```
 
-## Necessary condition: only REE can be limit points
-
-{prf:ref}`prop-ms-necessity` is a converse to {prf:ref}`prop-ms-convergence`:
-RLS either converges to the REE or fails to converge at all.
+The star marks the REE.
 
-It cannot converge to a non-equilibrium fixed point.
+The paths converge because the capital-persistence coefficient $\beta_1$ is
+stabilized first, and then the shock coefficient $\beta_2$ adjusts given the
+limiting value of $\beta_1$.
 
-The following simulation makes this vivid by starting agents with an initial
-belief that happens to satisfy $T(\beta_0) \approx \beta_0$ only approximately.
-
-```{code-cell} ipython3
----
-mystnb:
-  figure:
-    caption: Non-REE starts
-    name: fig-non-ree-starts
----
-β_false_rest = 3.0
-paths_from_false = simulate_rls_scalar(
-    T_bray, σ_bray, β0=β_false_rest,
-    T_periods=300, N_paths=60, seed=7)
-
-fig, ax = plt.subplots(figsize=(10, 5))
-for i in range(60):
-    ax.plot(paths_from_false[i], color='steelblue', alpha=0.2, lw=2)
-ax.plot(np.mean(paths_from_false, axis=0), color='navy', lw=2,
-        label='cross-path average')
-ax.axhline(β_f_bray, color='red', ls='--', lw=2,
-           label=f'REE $\\beta_f = {β_f_bray:.2f}$')
-ax.axhline(β_false_rest, color='gray', ls=':', lw=2,
-           label=f'False start $\\beta_0 = {β_false_rest}$')
-ax.set_xlabel('$t$')
-ax.set_ylabel('$\\beta_t$')
-ax.legend()
-plt.tight_layout()
-plt.show()
-```
+This recursive structure is why the small ODE is tractable even though the full
+least-squares system has endogenous regressors.
 
 ## Connection to rational learning
 
-The framework of {cite:t}`MarcetSargent1989jet` belongs to the programme of learning
+The framework of {cite:t}`MarcetSargent1989jet` belongs to the program of learning
 *about* a rational expectations equilibrium, as distinct from learning *within*
-one — a distinction emphasised by {cite:t}`BrayKreps1987`.
+one --- a distinction emphasized by {cite:t}`BrayKreps1987`.
 
 **Learning *within* an REE** (the subject of the companion lecture
 [](rational_learning_re)) refers to Bayesian inference inside a correctly
@@ -1253,7 +1352,7 @@ specified model.
 In that setting the data-generating process is stationary from
 the agent's perspective, and Bayes' rule is fully rationalized.
 
-**Learning *about* an REE** — the present lecture's topic — involves an agent who
+**Learning *about* an REE** --- the present lecture's topic --- involves an agent who
 does not know the equilibrium price function.
 
 Because the agent's beliefs shift
@@ -1262,102 +1361,67 @@ generated by a non-stationary process.
 
 As {cite:t}`MarcetSargent1989jet` put it,
 
-> *"The models do not incorporate fully optimal behavior or rational expectations,
+> The models do not incorporate fully optimal behavior or rational expectations,
 > because agents operate under the continually falsified assumption that the law of
-> motion is time invariant and known for sure."*
-
-This "continually falsified" assumption is precisely the sense in which the RLS
-algorithm cannot be derived from Bayesian rationality applied to a correctly
-specified model.
-
-It is nonetheless a compelling learning rule because it is
-consistent, computationally tractable, and — when E-stability holds — converges to
-the REE despite the misspecification.
+> motion is time invariant and known for sure.
 
-The E-stability condition thus plays the same role in this literature that the
-prior-support condition plays in the Bayesian learning literature: it tells us
-when the learning algorithm can find its way to the equilibrium.
+It is nonetheless a compelling learning rule because it is consistent,
+computationally tractable, and --- when E-stability holds --- converges to the
+REE despite the misspecification.
 
-The paper also marks the limits of the argument.
-
-Stability of the small ODE is a local condition, while global convergence still
-requires the larger ODE and the boundedness or projection assumptions.
-
-The framework also does not directly cover hidden-state or private-information
-models, where agents learn from signals rather than directly observed state
-variables.
+It does not require the strong assumptions on agents' prior beliefs about the
+statistical structure of the economy that are needed for Bayesian learning.
 
 
 ## Summary
 
-This lecture has presented the framework of {cite:t}`MarcetSargent1989jet` for analysing
-least squares learning in self-referential linear stochastic models.
-
-Key takeaways:
-
-1. **Self-referential structure**: the actual law of motion depends on the
-   perceived law of motion through the mapping $T$, and a rational expectations
-   equilibrium is a fixed point $\beta_f = T(\beta_f)$.
+This lecture studied least-squares learning in the self-referential models of
+{cite:t}`MarcetSargent1989jet`.
 
-2. **Recursive least squares**: agents update their beliefs by running RLS,
-   which is adaptive but not fully Bayesian — it "continually falsifies" the
-   assumption that the environment is stationary.
+The central object is the map $T$ from a perceived law of motion to the actual
+law of motion generated when agents act on that perception.
 
-3. **The governing ODE**: the almost-sure limiting behaviour of $\beta_t$ is
-   described by the small ODE $\dot\beta = T(\beta) - \beta$, and only fixed
-   points of this ODE (REE) are possible limit points of RLS.
+A rational expectations equilibrium is a fixed point of this map.
 
-4. **E-stability**: the REE is the almost-sure limit of RLS if and only if
-   it is a locally stable fixed point of the small ODE, that is, if all
-   eigenvalues of the Jacobian $\mathcal{M} = dT/d\beta - I$ at $\beta_f$ have
-   strictly negative real parts.
+Recursive least squares converges to that fixed point when the associated ODE
+$\dot\beta = T(\beta)-\beta$ is locally stable and the learning process remains
+in a region where the model is well defined.
 
-5. **Instability**: if any eigenvalue of $\mathcal{M}$ has positive real part,
-   $P(\beta_t \to \beta_f) = 0$ — convergence to that REE is impossible.
-
-6. **Connection to the rational learning literature**: the RLS algorithm
-   studies learning *about* a rational expectations equilibrium; it is
-   complementary to the Bayesian learning *within* an REE studied by
-   {cite:t}`BrayKreps1987`.
+It complements rational learning, studied in {doc}`rational_learning_re`, where
+Bayesian agents learn *within* an equilibrium structure that is already
+specified.
 
 ## Exercises
 
 ```{exercise}
 :label: ls_ex1
 
-E-stability and the slope of $T$
-
 Consider the scalar model with $T(\beta) = a + b\beta$.
 
-(a) Derive a formula for the unique REE $\beta_f$ in terms of $a$ and $b$.
+1. Derive a formula for the unique REE $\beta_f$ in terms of $a$ and $b$.
 
-(b) Show that the small ODE $\dot\beta = T(\beta) - \beta$ is globally stable if
-and only if $b < 1$.
+2. Show that the small ODE $\dot\beta = T(\beta) - \beta$ is globally stable if and only if $b < 1$.
 
-(c) Simulate $N = 200$ paths of length $T = 500$ for $a = 1$ and each of
-$b \in \{0.3, 0.7, 0.9, 0.99\}$ (all less than 1).
+3. Simulate $N = 200$ paths of length $T = 500$ for $a = 1$ and each of $b \in \{0.3, 0.7, 0.9, 0.99\}$ (all less than 1).
 
-Plot the cross-path
-average of $\beta_t$ for each $b$ value on the same figure and comment on how the
-rate of convergence changes as $b \to 1$.
+Plot the cross-path average of $\beta_t$ for each $b$ value on the same figure and comment on how the rate of convergence changes as $b \to 1$.
 ```
 
 ```{solution-start} ls_ex1
 :class: dropdown
 ```
 
-**(a)** The REE satisfies $\beta_f = T(\beta_f) = a + b\beta_f$, so
+*Part 1.* The REE satisfies $\beta_f = T(\beta_f) = a + b\beta_f$, so
 
 $$
 \beta_f (1 - b) = a \implies \beta_f = \frac{a}{1-b} .
 $$
 
-**(b)** The small ODE is $\dot\beta = a + b\beta - \beta = a - (1-b)\beta$.
+*Part 2.* The small ODE is $\dot\beta = a + b\beta - \beta = a - (1-b)\beta$.
 
-This is linear with slope $-(1-b)$, so the unique fixed point $\beta_f = a/(1-b)$
-is globally stable iff $1-b > 0$, i.e., $b < 1$.
+This is linear with slope $-(1-b)$, so the unique fixed point $\beta_f = a/(1-b)$ is globally stable iff $1-b > 0$, i.e., $b < 1$.
 
-**(c)**
+*Part 3.*
 
 ```{code-cell} ipython3
 a_ex, T_ex, N_ex = 1.0, 500, 200
@@ -1381,44 +1445,31 @@ ax.set_title('Convergence Rate Slows as $b \\to 1$')
 ax.legend()
 plt.tight_layout()
 plt.show()
-
-print("As b → 1, the Jacobian M = b - 1 → 0, so the ODE becomes slow to")
-print("return to the fixed point.  Convergence still occurs but takes longer.")
 ```
 
+As $b \to 1$, the Jacobian $\mathcal M = b - 1$ approaches zero, so the ODE
+becomes slow to return to the fixed point.
+
+Convergence still occurs, but it takes longer.
+
 ```{solution-end}
 ```
 
 ```{exercise}
 :label: ls_ex2
 
-Necessary condition: non-REE limit points
-
-{prf:ref}`prop-ms-necessity` states that $P(\beta_t \to \hat\beta) = 0$
-for any $\hat\beta \neq \beta_f$ in the interior.
-
-(a) Using the Bray model with $a=1$, $b=0.6$, simulate 100 paths of length
-$T = 600$ starting from $\beta_0 = 6$ (far from $\beta_f = 2.5$).
-
-Show that
-paths still converge to $\beta_f$.
-
-(b) Now consider the **unstable** case $b = 1.5$.
-
-Simulate 50 paths of length
-$T = 200$ starting from $\beta_0 = 0.1$ (close to the REE $\beta_f = -2$).
+{prf:ref}`prop-ms-necessity` states that $P(\beta_t \to \hat\beta) = 0$ for any $\hat\beta \neq \beta_f$ in the interior.
 
-Describe what happens.
+1. Using the Bray model with $a=1$, $b=0.6$, simulate 100 paths of length $T = 600$ starting from $\beta_0 = 6$ (far from $\beta_f = 2.5$) and show that paths still converge to $\beta_f$.
 
-(c) For the unstable case, plot the phase diagram and explain geometrically why
-the paths diverge.
+2. Now consider the *unstable* case $b = 1.5$, simulate 50 paths of length $T = 200$ starting from $\beta_0 = -1.9$ (close to the REE $\beta_f = -2$), and describe what happens.
 ```
 
 ```{solution-start} ls_ex2
 :class: dropdown
 ```
 
-**(a) and (b)**
+*Parts 1 and 2.*
 
 ```{code-cell} ipython3
 fig, axes = plt.subplots(1, 2, figsize=(14, 5))
@@ -1444,7 +1495,7 @@ def T_un(β):
 
 
 β_f_un = 1.0 / (1 - 1.5)
-paths_un = simulate_rls_scalar(T_un, 1.0, β0=0.1,
+paths_un = simulate_rls_scalar(T_un, 1.0, β0=-1.9,
                                T_periods=200, N_paths=50, seed=2)
 ax = axes[1]
 for i in range(50):
@@ -1460,32 +1511,10 @@ plt.tight_layout()
 plt.show()
 ```
 
-**(c)** Phase diagram of the unstable case:
+Geometrically, the slope $dT/d\beta - 1 = b - 1 = 0.5$ is positive at the
+REE.
 
-```{code-cell} ipython3
-β_g = np.linspace(-8, 6, 400)
-drift_un = np.array([1.0 + 1.5 * b - b for b in β_g])
-
-fig, ax = plt.subplots(figsize=(8, 4))
-ax.plot(β_g, drift_un, color='crimson', lw=2)
-ax.axhline(0, color='black', lw=2)
-ax.axvline(β_f_un, color='black', ls='--', lw=2,
-           label=f'$\\beta_f = {β_f_un}$')
-ax.fill_between(β_g, drift_un, 0, where=(drift_un > 0),
-                color='crimson', alpha=0.15)
-ax.fill_between(β_g, drift_un, 0, where=(drift_un < 0),
-                color='steelblue', alpha=0.15)
-ax.set_xlabel('$\\beta$')
-ax.set_ylabel('$T(\\beta) - \\beta$')
-ax.set_title('Phase Diagram: Unstable REE ($b=1.5$)\n'
-             'Drift points away from $\\beta_f$ everywhere')
-ax.legend()
-plt.tight_layout()
-plt.show()
-
-print("Geometrically: the slope dT/d(β) - 1 = b - 1 = 0.5 > 0 at the REE,")
-print("so the ODE pushes β AWAY from β_f in both directions.")
-```
+The ODE therefore pushes $\beta$ away from $\beta_f$ in both directions.
 
 ```{solution-end}
 ```
@@ -1495,30 +1524,20 @@ print("so the ODE pushes β AWAY from β_f in both directions.")
 
 The present-value model: effect of $\lambda$ on E-stability
 
-In the present-value model {eq}`eq:pv_model`, $T(\beta) = (\lambda\beta + 1)\rho$
-and the Jacobian is $\mathcal{M} = \lambda\rho - 1$.
-
-(a) For $\rho = 0.9$ and each of $\lambda \in \{0.5, 0.8, 0.95, 1.0\}$:
-    - Compute $\beta_f$ and $\mathcal{M}$
-    - Determine whether the REE is E-stable
+In the present-value model {eq}`eq:pv_model`, $T(\beta) = (\lambda\beta + 1)\rho$ and the Jacobian is $\mathcal{M} = \lambda\rho - 1$.
 
-(b) For the E-stable cases, simulate 100 paths of length $T=400$ and
-plot the cross-path average against the ODE solution.
+1. For $\rho = 0.9$ and each of $\lambda \in \{0.5, 0.8, 0.95, 1.0\}$, compute $\beta_f$ and $\mathcal{M}$ and determine whether the REE is E-stable.
 
-(c) At $\lambda = 1$, $\mathcal{M} = \rho - 1 < 0$ (still E-stable when
-$|\rho| < 1$).
+2. For the E-stable cases, simulate 100 paths of length $T=400$ and plot the cross-path average against the ODE solution evaluated at cumulative-gain time.
 
-Simulate paths for this case and compare the convergence
-speed with the $\lambda = 0.5$ case.
-
-Provide an intuitive explanation.
+3. At $\lambda = 1$, $\mathcal{M} = \rho - 1 < 0$ (still E-stable when $|\rho| < 1$). Simulate paths for this case and compare the convergence speed with the $\lambda = 0.5$ case, providing an intuitive explanation.
 ```
 
 ```{solution-start} ls_ex3
 :class: dropdown
 ```
 
-**(a)**
+*Part 1.*
 
 ```{code-cell} ipython3
 ρ_ex = 0.9
@@ -1533,30 +1552,30 @@ for lv in λ_values:
     print(f"{lv:>8.2f}  {bf:>10.4f}  {M_jac:>15.4f}  {estab:>10}")
 ```
 
-**(b) and (c)**
+*Parts 2 and 3.*
 
 ```{code-cell} ipython3
 fig, axes = plt.subplots(2, 2, figsize=(14, 10))
 colors_λ = ['steelblue', 'darkorange', 'seagreen', 'purple']
 
 for ax, lv, col in zip(axes.flat, λ_values, colors_λ):
-    def T_fn(β, λ_val=lv):
-        return (λ_val * β + 1) * ρ_ex
-
     def ode_fn(β, λ_val=lv):
         return (λ_val * β + 1) * ρ_ex - β
 
     bf = ρ_ex / (1 - lv * ρ_ex) if abs(lv * ρ_ex) < 1 else None
 
-    paths_λ = simulate_rls_scalar(T_fn, 1.0, β0=0.0,
-                                    T_periods=400, N_paths=100, seed=3)
+    paths_λ = simulate_present_value_rls(
+        lv, ρ_ex, 1.0, β0=0.0,
+        T_periods=400, N_paths=100, seed=3
+    )
     for i in range(20):
         ax.plot(paths_λ[i], color=col, alpha=0.2, lw=2)
     ax.plot(np.mean(paths_λ, axis=0), color=col, lw=2, label='RLS average')
 
     if bf is not None:
-        t_o, sol_o = solve_ode(ode_fn, 0.0, t_span=(0, 400), n_points=400)
-        ax.plot(t_o, sol_o, color='black', ls='--', lw=2, label='ODE')
+        t_o, sol_o = solve_ode_calendar(ode_fn, 0.0, 400)
+        ax.plot(t_o, sol_o, color='black', ls='--', lw=2,
+                label='ODE at $\\tau_t$')
         ax.axhline(bf, color='red', ls=':', lw=2,
                    label=f'$\\beta_f={bf:.2f}$')
 
@@ -1568,12 +1587,19 @@ for ax, lv, col in zip(axes.flat, λ_values, colors_λ):
 
 plt.tight_layout()
 plt.show()
-
-print("\n(c) When lambda=1, M = ρ-1 ≈ -0.1 (small in absolute value).")
-print("    This means the ODE is very 'flat' near β_f: the restoring force")
-print("    is weak and convergence is slow.  When lambda=0.5, M = -0.55,")
-print("    giving a stronger restoring force and faster convergence.")
 ```
 
+The dashed ODE curves use $\tau_t = \sum_{s=1}^t 1/s$, so they are on the same
+learning-time scale as RLS.
+
+When $\lambda=1$, $\mathcal M = \rho-1 \approx -0.1$ is small in absolute
+value.
+
+The ODE is nearly flat near $\beta_f$, so the restoring force is weak and
+convergence is slow.
+
+When $\lambda=0.5$, $\mathcal M=-0.55$, which gives a stronger restoring force
+and faster convergence.
+
 ```{solution-end}
 ```
diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index b3a556d71..5f5dbc2c7 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -30,7 +30,7 @@ kernelspec:
 
 This lecture explores an important question in economic theory: can agents *learn* their way to a rational expectations equilibrium?
 
-If they can, then the rational expectations equilibrium can be justiﬁed as a dynamic attractor for learning processes.
+If they can, then the rational expectations equilibrium can be justified as a dynamic attractor for learning processes.
 
 The starting point is {cite:t}`BrayKreps1987`, which gives a rigorous model of Bayesian learning inside a rational expectations equilibrium.
 
@@ -80,11 +80,15 @@ An informed signal $s_t$ satisfies
 $$
 r_t = s_t + \epsilon_t,
 \qquad
+s_t \sim \mathcal N(\mu_s, \tau^2),
+\qquad
 \epsilon_t \sim \mathcal N(0,\sigma^2),
 $$
 
 where $\{s_t\}$ and $\{\epsilon_t\}$ are IID normal sequences and are mutually independent.
 
+Common knowledge of the prior moments $(\mu_s, \tau^2)$ is what makes the price observation informative about $\theta^I$, as we will see.
+
 There are two representative agents:
 
 * **Agent $I$ (informed)** observes $s_t$ before trading at date $t$.
@@ -180,21 +184,32 @@ The second is the Bayesian update of that posterior after the period closes.
 
 ### Uninformed demand given beliefs
 
-Suppose at date $t$ agent $U$ has posterior density $f_t$ on $\theta^I$ supported on $[a, b]$.
+Suppose at date $t$ agent $U$ has posterior density $f_t$ on $\theta^I$, supported on $[a, b]$.
+
+Suppose the equilibrium price is $p$ and the equilibrium informed trade $X^I = 2 - x^U$ has been inferred from market clearing.
+
+Conditional on $\theta$, equation {eq}`eq:bk-signal-implied` pins down the signal as $s_t(\theta) = \sigma^2 X^I/\theta + p$.
 
-Suppose the equilibrium informed trade and price are $X^I$ and $p$.
+Two sources of information about $\theta$ are therefore present at the start of date $t$ trading: the carried-over posterior $f_t(\theta)$ and the Gaussian prior $\phi_s(\cdot;\mu_s,\tau^2)$ on $s_t$, which makes some implied signals more plausible than others.
 
-From {eq}`eq:bk-signal-implied`, conditional on $\theta^I$, agent $U$ infers $s_t = \sigma^2 X^I/\theta^I + p$.
+Bayes' rule combines them into the *intra-period* posterior
 
-Marginalising over $\theta^I \sim f_t$ and combining with $r_t = s_t + \epsilon_t$ where $\epsilon_t \sim \mathcal N(0,\sigma^2)$ gives the implied conditional distribution of $r_t$.
+$$
+f_t^{(p, X^I)}(\theta)
+\propto
+f_t(\theta)\,
+\phi_s\!\left(\frac{\sigma^2 X^I}{\theta} + p\, ;\, \mu_s, \tau^2\right),
+$$ (eq:bk-intra-posterior)
 
-Equivalently, conditional on a candidate value $\theta$, the excess payoff on
-one unit of the risky asset is
+which is the posterior on $\theta^I$ that the agent actually uses to forecast $r_t$ before $r_t$ is observed.
+
+Conditional on a candidate value $\theta$, the excess payoff on one unit of the risky asset is
 
 $$
 r_t - p
 =
-\frac{\sigma^2 X^I}{\theta} + \epsilon_t.
+\frac{\sigma^2 X^I}{\theta} + \epsilon_t,
+\qquad \epsilon_t \sim \mathcal N(0,\sigma^2).
 $$
 
 Because CARA preferences have no wealth effects, agent $U$'s problem reduces to
@@ -208,86 +223,64 @@ u^U(x^U, r_t, p)
 -\exp\!\left(-\frac{x^U(r_t-p)}{\theta^U}\right),
 $$
 
-where the expectation integrates over $\theta^I \sim f_t$ and $\epsilon_t$.
+where the expectation integrates over $\theta^I \sim f_t^{(p, X^I)}$ and $\epsilon_t$.
 
-To derive the expected utility formula, substitute the conditional excess
-payoff above:
-
-$$
-E[u^U]
-=
--\int_a^b f_t(\theta)
-E_\epsilon
-\left[
-\exp\!\left(
--\frac{x^U}{\theta^U}
-\left(\frac{\sigma^2 X^I}{\theta}+\epsilon_t\right)
-\right)
-\right]
-d\theta.
-$$
-
-Using the normal moment-generating formula
-
-$$
-E_\epsilon\!\left[\exp(c\epsilon_t)\right]
-=
-\exp\!\left(\frac{c^2\sigma^2}{2}\right)
-$$
-
-with $c=-x^U/\theta^U$, this becomes
+Substituting the conditional excess payoff and using the normal moment-generating formula gives
 
 $$
 E[u^U]
 =
 -\exp\!\left(\frac{(x^U)^2 \sigma^2}{2(\theta^U)^2}\right)
-\int_a^b f_t(\theta)\,
-\exp\!\left(-\frac{x^U \sigma^2 X^I}{\theta\,\theta^U}\right)
-d\theta.
+\int_a^b
+f_t(\theta)\,
+\phi_s\!\left(\tfrac{\sigma^2 X^I}{\theta} + p; \mu_s, \tau^2\right)
+\exp\!\left(-\tfrac{x^U \sigma^2 X^I}{\theta\,\theta^U}\right)
+d\theta,
 $$
 
-To get the first-order condition, define
-
-$$
-I(x^U)
-=
-\int_a^b f_t(\theta)\,
-\exp\!\left(-\frac{x^U \sigma^2 X^I}{\theta\,\theta^U}\right)
-d\theta.
-$$
+up to a positive normalising constant that does not depend on $x^U$ and therefore does not affect the optimal demand.
 
-Hence the first-order condition is
+Define the tilted weight
 
 $$
-\frac{d}{dx^U}
-\left[
-\frac{(x^U)^2\sigma^2}{2(\theta^U)^2}
-+ \log I(x^U)
-\right]
-=0.
-$$
+w(\theta;\, p, X^I, x^U)
+=
+f_t(\theta)\,
+\phi_s\!\left(\tfrac{\sigma^2 X^I}{\theta} + p; \mu_s, \tau^2\right)
+\exp\!\left(-\tfrac{x^U \sigma^2 X^I}{\theta\,\theta^U}\right).
+$$ (eq:bk-weight)
 
-Rearranging gives
+The first-order condition rearranges to
 
 $$
 \frac{x^U}{\theta^U}
 =
 X^I \;
-\frac{\int_a^b \theta^{-1} f_t(\theta)\,\exp\!\big(-x^U \sigma^2 X^I/(\theta\theta^U)\big)\,d\theta}
-     {\int_a^b f_t(\theta)\,\exp\!\big(-x^U \sigma^2 X^I/(\theta\theta^U)\big)\,d\theta}.
+\frac{\int_a^b \theta^{-1}\, w(\theta;\, p, X^I, x^U)\, d\theta}
+     {\int_a^b w(\theta;\, p, X^I, x^U)\, d\theta}.
 $$ (eq:bk-foc)
 
-The right-hand side is $X^I$ multiplied by a tilted expectation of $1/\theta^I$ under a weighting that depends on $x^U$ itself.
+The right-hand side is $X^I$ multiplied by a tilted expectation of $1/\theta^I$ under the weighting in {eq}`eq:bk-weight`.
 
-Equation {eq}`eq:bk-foc` implicitly defines $x^U(X^I; f_t)$, the uninformed agent's optimal demand at conjectured informed trade $X^I$ and posterior $f_t$.
+Equation {eq}`eq:bk-foc` implicitly defines $x^U(p, X^I; f_t)$, the uninformed agent's optimal demand at observed price $p$, conjectured informed trade $X^I$, and carried-over posterior $f_t$.
 
-The optimum does not depend separately on $p$, because the distribution of $r_t - p$ implied by the posterior depends only on $X^I$.
+Dependence on $p$ enters through the prior weight $\phi_s$: at higher prices, candidate values of $\theta$ that imply $s_t$ above the prior mean become less plausible, so the agent's demand schedule slopes downward in $p$ as expected.
 
 ### Market clearing
 
-Market clearing $X^I + x^U(X^I; f_t) = 2$ pins down the equilibrium informed trade $X^I_t$ as a function of beliefs alone.
+Equilibrium requires that the informed and uninformed demands sum to the total endowment.
 
-Plugging $X^I_t$ into {eq}`eq:bk-informed-demand` recovers the equilibrium price
+Substituting {eq}`eq:bk-informed-demand` and the implicit function $x^U(p, X^I; f_t)$, the equilibrium $(p_t, X^I_t)$ satisfies the two equations
+
+$$
+X^I_t = \frac{\theta^I}{\sigma^2}(s_t - p_t),
+\qquad
+X^I_t + x^U(p_t, X^I_t; f_t) = 2.
+$$ (eq:bk-mc)
+
+Eliminating $X^I_t$ between the two leaves a single root-finding problem for $p_t$.
+
+Rearranging the first equation, the equilibrium price has the form
 
 $$
 p_t = s_t - \frac{\sigma^2 X^I_t}{\theta^I}.
@@ -335,49 +328,56 @@ This is the rule we simulate below.
 
 We discretise the support $[a,b]$ of $\theta^I$ on a fine grid and represent $f_t$ as a vector of density values.
 
-There are three computational primitives.
+The three computational primitives are:
 
-* `uninformed_demand` solves the FOC in {eq}`eq:bk-foc` for $x^U(X^I; f)$ by root-finding.
-* `equilibrium_XI` solves market clearing $X^I + x^U(X^I; f) = 2$ for $X^I_t$.
+* `uninformed_demand` solves the FOC in {eq}`eq:bk-foc` for $x^U(p, X^I; f)$ by root-finding.
+* `equilibrium_price` solves the market-clearing system {eq}`eq:bk-mc` for $p_t$.
 * `bayes_update` applies {eq}`eq:bk-bayes` and renormalises.
 
 ```{code-cell} ipython3
-def uninformed_demand(XI, f, θ_grid, θ_U, σ2):
+def uninformed_demand(p, XI, f, θ_grid, θ_U, σ2, μ_s, τ2):
     """
-    Solve the FOC for the uninformed agent's demand x^U, given
-    a conjectured informed trade XI and posterior density f.
+    Solve the FOC for x^U(p, X^I; f), the uninformed
+    agent's optimal demand given observed price p, conjectured
+    informed trade XI, and carried-over posterior density f.
     """
     with np.errstate(divide='ignore'):
         log_f = np.log(f)
+    s_implied = σ2 * XI / θ_grid + p
+    log_phi_s = -0.5 * (s_implied - μ_s)**2 / τ2  # prior weight on s_t
 
     def foc(xU):
         z = xU * σ2 * XI / (θ_grid * θ_U)
-        log_w = log_f - z
+        log_w = log_f + log_phi_s - z
         M = log_w.max()
         w = np.exp(log_w - M)
         num = np.sum(w / θ_grid)
         den = np.sum(w)
         return xU / θ_U - XI * num / den
 
-    return brentq(foc, -20.0, 20.0, xtol=1e-10)
+    return brentq(foc, -50.0, 50.0, xtol=1e-10)
 ```
 
 ```{code-cell} ipython3
-def equilibrium_XI(f, θ_grid, θ_U, σ2):
+def equilibrium_price(s_t, θ_I_true, f, θ_grid, θ_U, σ2, μ_s, τ2):
     """
-    Solve market clearing X^I + x^U(X^I; f) = 2 for the
-    equilibrium informed trade.
+    Solve the market-clearing system for the equilibrium
+    price p_t given signal s_t, true informed risk tolerance
+    θ_I_true, and posterior f.
     """
-    def mc(XI):
-        return XI + uninformed_demand(XI, f, θ_grid, θ_U, σ2) - 2.0
+    def mc_residual(p):
+        XI = θ_I_true * (s_t - p) / σ2
+        xU = uninformed_demand(p, XI, f, θ_grid, θ_U, σ2, μ_s, τ2)
+        return XI + xU - 2.0
 
-    return brentq(mc, 1e-4, 4.0, xtol=1e-10)
+    return brentq(mc_residual, s_t - 10.0, s_t, xtol=1e-8)
 ```
 
 ```{code-cell} ipython3
 def bayes_update(f, θ_grid, p_t, xU_t, r_t, σ2, τ2, μ_s):
     """
-    Bayesian update of the posterior on θ^I given date-t observations.
+    Bayesian update of the posterior on θ^I given the date-t
+    observations (p_t, x^U_t, r_t).
     """
     XI = 2.0 - xU_t
     s_mean = (σ2 * μ_s + τ2 * r_t) / (σ2 + τ2)
@@ -425,9 +425,11 @@ def simulate(θ_I_true, θ_U, σ2, μ_s, τ2,
     snapshots = {0: f.copy()}
 
     for t in range(T):
-        XI = equilibrium_XI(f, θ_grid, θ_U, σ2)
+        p_t = equilibrium_price(
+            s_seq[t], θ_I_true, f, θ_grid, θ_U, σ2, μ_s, τ2
+        )
+        XI = θ_I_true * (s_seq[t] - p_t) / σ2
         xU = 2.0 - XI
-        p_t = s_seq[t] - σ2 * XI / θ_I_true
         r_t = s_seq[t] + eps_seq[t]
         f = bayes_update(f, θ_grid, p_t, xU, r_t, σ2, τ2, μ_s)
 
@@ -534,9 +536,9 @@ This is the concrete manifestation of weak convergence of posteriors to a point
 
 ## Equilibrium trades and prices
 
-The equilibrium informed trade $X^I_t$ depends only on $f_t$, not directly on $s_t$ or $\theta^I$.
+The equilibrium informed trade $X^I_t$ depends on the current signal $s_t$, on $\theta^I_{\rm true}$, and on the carried-over posterior $f_t$, all through the market-clearing system {eq}`eq:bk-mc`.
 
-As $f_t$ tightens around $\theta^I_{\rm true}$, $X^I_t$ approaches the full-communication allocation in {eq}`eq:bk-full-info-trade`.
+As $f_t$ tightens around $\theta^I_{\rm true}$, the average $X^I_t$ approaches the full-communication allocation in {eq}`eq:bk-full-info-trade`.
 
 ```{code-cell} ipython3
 ---
@@ -582,7 +584,7 @@ The posterior density on $\theta^I$ concentrates around the true value, the post
 
 The next sections ask what general theorems guarantee these outcomes and which assumptions they rely on.
 
-The plan is to first state the two convergence theorems of {cite:t}`BrayKreps1987` for the abstract rational-learning model, then specialise to the two-agent example to identify the hypotheses that imply concentration on the true $\theta^I$, and finally explain when those hypotheses can fail.
+The plan is to first state the two convergence theorems of {cite:t}`BrayKreps1987` for the abstract rational-learning model, then specialize to the two-agent example to identify the hypotheses that imply concentration on the true $\theta^I$, and finally explain when those hypotheses can fail.
 
 ## Convergence of posterior assessments
 
@@ -699,7 +701,9 @@ The equilibrium uninformed demand $x^U(p, F)$ is continuous in $F$ with respect
 ```{prf:assumption}
 :label: assum-bk-identification
 
-For fixed $\theta^U$ and limiting posterior $F_\infty$, the limiting price functional $p_\infty(\,\cdot\,; F_\infty, \theta^I, \theta^U)$ is stochastically monotone in $\theta^I$, in the sense that $\theta^I < \theta^{I\,\prime}$ implies $p_\infty(s; F_\infty, \theta^I, \theta^U)$ first-order stochastically dominates $p_\infty(s; F_\infty, \theta^{I\,\prime}, \theta^U)$ when $s$ is drawn from its marginal distribution.
+For fixed $\theta^U$ and limiting posterior $F_\infty$, the marginal distribution of the limiting price functional $p_\infty(\,\cdot\,; F_\infty, \theta^I, \theta^U)$ is strictly monotone in $\theta^I$ in the first-order-stochastic-dominance order.
+
+In particular, $\theta^I \neq \theta^{I\,\prime}$ implies $p_\infty(s; F_\infty, \theta^I, \theta^U)$ and $p_\infty(s; F_\infty, \theta^{I\,\prime}, \theta^U)$ have distinct CDFs when $s$ is drawn from its marginal distribution.
 ```
 
 In the lecture's CARA-Normal setup, {prf:ref}`assum-bk-continuity` holds because the FOC {eq}`eq:bk-foc` defines $x^U$ as a continuous functional of $F$ under weak convergence through bounded integrals, and {prf:ref}`assum-bk-identification` holds because the equilibrium price has the form $p_t = s_t - \sigma^2 X^I_t / \theta^I$ with $X^I_t > 0$ on a full-measure set.
@@ -711,6 +715,8 @@ Under these three assumptions and the IID signal sequence, the limiting posterio
 ```{prf:proposition}
 :label: prop-bk-sharpening
 
+Suppose $\theta^I_{\rm true} \in [a,b]$ and the prior $f_0$ puts positive density in every neighborhood of $\theta^I_{\rm true}$.
+
 Under {prf:ref}`assum-bk-borel`, {prf:ref}`assum-bk-continuity`, and {prf:ref}`assum-bk-identification`, and given the IID signal sequence $\{s_t\}$, the limiting posterior on $\theta^I$ satisfies
 
 $$
@@ -727,13 +733,13 @@ The proof has three steps.
 
 {prf:ref}`assum-bk-continuity` and the weak convergence $F_t \Rightarrow F_\infty$ from {prf:ref}`prop-bk-measure-convergence` imply that equilibrium demands $x^U(p, F_t)$ converge to $x^U(p, F_\infty)$.
 
-Combining with market clearing and the price equation {eq}`eq:bk-price` gives $p_t \to p_\infty(s_t; F_\infty, \theta^I, \theta^U)$ on a $P^U$-full-measure set.
+Combining with market clearing and the price equation {eq}`eq:bk-price` gives $p_t - p_\infty(s_t; F_\infty, \theta^I, \theta^U) \to 0$ on a $P^U$-full-measure set.
 
 *Step 2: the limit price distribution is observable.*
 
-Since $\{s_t\}$ is IID, the empirical distribution of $\{p_t\}_{t \le T}$ converges almost surely to the distribution of $p_\infty(s_t; F_\infty, \theta^I, \theta^U)$ by the Glivenko-Cantelli theorem.
+Since the deviation $p_t - p_\infty(s_t; F_\infty, \theta^I, \theta^U) \to 0$ almost surely and $\{s_t\}$ is IID, the empirical distribution of observed prices has the same limit as the empirical distribution of the limiting price functional.
 
-The empirical distribution function is $H_\infty^U(p)$-measurable, and so therefore is its limit.
+The latter converges almost surely to the distribution of $p_\infty(s; F_\infty, \theta^I, \theta^U)$ for $s \sim \mathcal N(\mu_s, \tau^2)$ by the Glivenko-Cantelli theorem, and that limit is $H_\infty^U(p)$-measurable as a long-run frequency of an observable sequence.
 
 *Step 3: identification.*
 
@@ -769,22 +775,22 @@ If {prf:ref}`assum-bk-identification` fails, step 3 breaks even when steps 1 and
 
 Consider a variant with two informed agents and risk tolerances $\theta^{I1}, \theta^{I2}$ both unknown to the uninformed agent.
 
-The CARA-Normal full-communication price has the form
+With three agents each endowed with one unit of the risky asset, the full-communication formula {eq}`eq:bk-full-communication-price` gives
 
 $$
 p_t
 =
 s_t
 -
-\frac{2\sigma^2}{\theta^{I1} + \theta^{I2} + \theta^U},
+\frac{3\sigma^2}{\theta^{I1} + \theta^{I2} + \theta^U},
 $$
 
 which depends on $(\theta^{I1}, \theta^{I2})$ only through the sum $\theta^{I1}+\theta^{I2}$.
 
-{prf:ref}`prop-bk-measure-convergence` still applies, but $F_\infty$ is supported on the diagonal
+{prf:ref}`prop-bk-measure-convergence` still applies, but $F_\infty$ is supported on the level set
 
 $$
-\{(\theta_1, \theta_2): \theta_1 + \theta_2 = \theta^{I1}_{\rm true} + \theta^{I2}_{\rm true}\},
+\{(\theta_1, \theta_2) \in [a,b]^2 : \theta_1 + \theta_2 = \theta^{I1}_{\rm true} + \theta^{I2}_{\rm true}\},
 $$
 
 not on the singleton $\{(\theta^{I1}_{\rm true},\theta^{I2}_{\rm true})\}$.
@@ -831,7 +837,7 @@ A correctly-specified Bayesian learner enjoys the convergence guarantees in {prf
 
 An adaptive learner who treats the price-state relation as something to be estimated can hope to discover it from data, but the estimator he uses cannot be derived from Bayes' rule applied to a correctly specified model.
 
-No learning algorithm delivers both Bayesian rationality and discovery of the equilibrium structure at the same time.
+Bayesian rational learning can update among equilibrium maps already included in the agent's prior, but it does not explain how agents come to obtain those maps in the first place.
 
 The literature on learning *about* rational expectations equilibria, beginning with {cite:t}`Bray1982` and {cite:t}`BraySavin1984` and extended by {cite:t}`MarcetSargent1989jet`, takes the second side of the trade-off and replaces Bayes' rule with **ordinary least squares** or related recursive estimators.
 
@@ -857,7 +863,7 @@ Rational learning describes the limits of Bayesian inference *given* the equilib
 ````{exercise}
 :label: rle_ex1
 
-**Off-centre prior**
+*Off-center prior*
 
 The baseline simulation uses a uniform prior on $\theta^I \in [0.5, 4]$.
 
@@ -887,7 +893,7 @@ res_biased = simulate(**params_biased)
 
 fig, ax = plt.subplots(figsize=(10, 5))
 ax.plot(res_uniform['post_mean'], lw=2, label='uniform prior')
-ax.plot(res_biased['post_mean'], lw=2, label='off-centre prior')
+ax.plot(res_biased['post_mean'], lw=2, label='off-center prior')
 ax.axhline(params['θ_I_true'], color='black', ls='--',
            label=r'$\theta^I_{\rm true}$')
 ax.set_xlabel('$t$')
@@ -897,7 +903,7 @@ plt.tight_layout()
 plt.show()
 ```
 
-The off-centre prior starts the posterior mean well above $\theta^I_{\rm true} = 2$, but Bayesian updating drives it down to the truth.
+The off-center prior starts the posterior mean well above $\theta^I_{\rm true} = 2$, but Bayesian updating drives it down to the truth.
 
 This is the rational-learning convergence result in action: any prior that puts positive density on $\theta^I_{\rm true}$ eventually concentrates around it.
 
@@ -907,7 +913,7 @@ This is the rational-learning convergence result in action: any prior that puts
 ```{exercise}
 :label: rle_ex2
 
-**Speed of learning across $\theta^I$**
+*Speed of learning across $\theta^I$*
 
 Information from one period about $\theta^I$ comes through the implied signal
 
@@ -953,7 +959,7 @@ The reason is that the sensitivity $\sigma^2 X^I_t/\theta^2$ scales as $\theta^{
 ```{exercise}
 :label: rle_ex3
 
-**Effect of return noise**
+*Effect of return noise*
 
 Larger $\sigma^2$ widens the conditional density of $s_t$ given $r_t$, which one might guess slows learning.
 

From 1028831ba5dfc9118cbb6f90f079604f642902c7 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Tue, 2 Jun 2026 22:10:22 +1000
Subject: [PATCH 22/25] updates

---
 lectures/ls_learning.md          | 517 ++++++++++++++++++-------------
 lectures/rational_learning_re.md |  93 ++++--
 2 files changed, 370 insertions(+), 240 deletions(-)

diff --git a/lectures/ls_learning.md b/lectures/ls_learning.md
index 07f7fa9b8..1b9da990b 100644
--- a/lectures/ls_learning.md
+++ b/lectures/ls_learning.md
@@ -29,7 +29,7 @@ kernelspec:
 ## Overview
 
 This lecture is a companion to {doc}`rational_learning_re`, which presents the
-Bray–Kreps perspective on rational learning.
+Bray-Kreps perspective on rational learning.
 
 We examine the related but distinct question of whether *least squares* learning
 converges to a rational expectations equilibrium in self-referential models.
@@ -152,7 +152,7 @@ where $\{\alpha_t\}$ is a positive, non-decreasing sequence with $\alpha_t \to 1
 as $t \to \infty$.
 
 When $\alpha_t = 1$ for all $t$, equations
-{eq}`eq:rls_beta`–{eq}`eq:rls_R` reduce to **ordinary least squares** updated
+{eq}`eq:rls_beta`-{eq}`eq:rls_R` reduce to **ordinary least squares** updated
 recursively.
 
 ### Lagged and contemporaneous data
@@ -169,8 +169,9 @@ same estimate $\beta_t$ that is being updated from $z_t$.
 An extra requirement is that the date-$t$ system must have a unique solution
 $(\beta_t, R_t, z_t)$ for each history.
 
-Under that uniqueness condition, the same full ODE {eq}`eq:full_ode` and small ODE {eq}`eq:small_ode`
-govern convergence.
+If this uniqueness condition holds, and if the regularity and boundedness
+conditions used below also hold, convergence is still governed by the same full
+ODE {eq}`eq:full_ode` and small ODE {eq}`eq:small_ode`.
 
 ```{note}
 As {cite:t}`BraySavin1984` and {cite:t}`BrayKreps1987` emphasize, the RLS algorithm
@@ -183,9 +184,12 @@ The algorithm is
 when it is not.
 ```
 
+Thus, any state variable that matters for the actual law of motion must appear
+among the regressors in the perceived law.
+
 ## Why a differential equation governs the limit
 
-The RLS recursion {eq}`eq:rls_beta`–{eq}`eq:rls_R` is a *stochastic difference equation* with two key features.
+The RLS recursion {eq}`eq:rls_beta`-{eq}`eq:rls_R` is a *stochastic difference equation* with two key features.
 
 First, the *step size* in front of each update is $\alpha_t / t$, which shrinks to zero as $t$ grows.
 
@@ -201,10 +205,9 @@ $$
 \frac{d\beta}{dt} = T(\beta) - \beta .
 $$ (eq:small_ode)
 
-The ODE clock is cumulative gain time, not calendar time.
+To compare the ODE with RLS after $t$ observations, evaluate the ODE at $\tau_t = \sum_{s=1}^t \alpha_s/s$, the total step size accumulated by the learning algorithm.
 
-When $\alpha_t=1$, calendar period $t$ corresponds approximately to ODE time
-$\sum_{s=1}^t 1/s \approx \log t$.
+When $\alpha_t=1$, this accumulated step size is $\tau_t = \sum_{s=1}^t 1/s \approx \log t$.
 
 This idea, due to {cite:t}`Ljung1977`, is what lets {cite:t}`MarcetSargent1989jet` reduce the analysis of a noisy adaptive learning rule to the much easier study of a deterministic ODE.
 
@@ -304,11 +307,18 @@ for all $\omega \in \Omega_0$ and all $k = 1, 2, \ldots$.
 ```{prf:assumption} Sample path stays in a workable domain
 :label: ass-ms-a7
 
-Either of the following holds:
+Either the compact-state version or the projection-set version holds.
 
-- *Compact-state version.* $D_1 = D_2 = \mathbb{R}^{n_1 \times (n_2)^3}$ and there exists a compact $D' \subset D_s$ that contains $\beta_{t_k}(\omega)$ for all $k$ and all $\omega \in \Omega_0$, with trajectories of {eq}`eq:full_ode` originating in $D'$ never leaving a closed subset of $D_s$.
+- *Compact-state version.*
+  - $D_1 = D_2 = \mathbb{R}^{n_1 \times (n_2)^3}$.
+  - There exists a compact $D' \subset D_s$ that contains $\beta_{t_k}(\omega)$ for all $k$ and all $\omega \in \Omega_0$.
+  - For every initial condition $(\beta(0), R(0))$ with $\beta(0) \in D'$ and $R(0)$ bounded as in {prf:ref}`ass-ms-a6`, the $\beta$-component of the trajectory of {eq}`eq:full_ode` remains in a closed subset of $D_s$.
 
-- *Projection-set version.* $D_2$ is closed, $D_1$ is open and bounded, $\beta \in D_s$ for every $(\beta, R) \in D_1$, and trajectories of {eq}`eq:full_ode` with initial conditions in $D_2$ never leave a closed subset of $D_1$.
+- *Projection-set version.*
+  - $D_2$ is closed.
+  - $D_1$ is open and bounded.
+  - $\beta \in D_s$ for every $(\beta, R) \in D_1$.
+  - Trajectories of {eq}`eq:full_ode` with initial conditions in $D_2$ never leave a closed subset of $D_1$.
 ```
 
 {prf:ref}`ass-ms-a6` is automatic when the regressors $z_{2t}$ are *exogenous* and ergodic, but it can be delicate when $z_{2t}$ contains endogenous variables.
@@ -324,7 +334,9 @@ Let $D_A$ denote the domain of attraction of the unique equilibrium $(\beta_f, R
 
 Assume {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`,
 {prf:ref}`ass-ms-a3`, {prf:ref}`ass-ms-a4`, {prf:ref}`ass-ms-a5`,
-and {prf:ref}`ass-ms-a6`. If either
+and {prf:ref}`ass-ms-a6`.
+
+If either
 
 - the compact-state version of {prf:ref}`ass-ms-a7` is satisfied and
   $D' \subset D_A$, or
@@ -378,7 +390,9 @@ Assume {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`,
 {prf:ref}`ass-ms-a3`, {prf:ref}`ass-ms-a4`, and {prf:ref}`ass-ms-a5`.
 
 1. Let $\hat\beta \neq \beta_f$ and suppose $M_{z_2}(\hat\beta)$ is positive
-   definite and $\hat\beta \in \mathrm{int}(D_2)$. Then $P(\beta_t \to \hat\beta) = 0$.
+   definite and $\hat\beta \in \mathrm{int}(D_2)$.
+
+   Then $P(\beta_t \to \hat\beta) = 0$.
 
 2. If $h(\beta_f, R_f)$ has at least one eigenvalue with strictly positive real
    part, then $P(\beta_t \to \beta_f) = 0$.
@@ -404,7 +418,16 @@ it tells us when the learning process can find its way to the equilibrium.
 
 ### The projection facility
 
-E-stability is necessary but not quite sufficient for almost-sure convergence.
+E-stability is the local condition that makes the REE attractive for the
+learning ODE.
+
+By itself, it is not an almost-sure convergence theorem.
+
+The stochastic recursion must also remain in a bounded region where the ODE
+approximation is valid.
+
+Conversely, an eigenvalue with strictly positive real part rules out convergence
+to the REE, while boundary cases with zero real parts are not covered by the theorem.
 
 Ljung's theorem requires the learning process and the relevant regressors to
 return to bounded regions with probability one.
@@ -458,8 +481,11 @@ interior at the boundary $\partial D_1$.
 
 Assume {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`,
 {prf:ref}`ass-ms-a3`, {prf:ref}`ass-ms-a4`, {prf:ref}`ass-ms-a5`,
-and {prf:ref}`ass-ms-a6`. Suppose also that $(\beta, R) \in D_1$ implies
-$\beta \in D_s$, and that $D_1$ is open and bounded with $D_1 \subset D_A$.
+and {prf:ref}`ass-ms-a6`.
+
+Suppose also that $(\beta, R) \in D_1$ implies $\beta \in D_s$, and that $D_1$
+is open and bounded with $D_1 \subset D_A$.
+
 Then for some subsequence $\{t_k(\omega)\}$,
 
 $$
@@ -490,8 +516,10 @@ $$ (eq:corollary2_cond)
 ```{prf:corollary}
 :label: cor-ms-exogenous
 
-Consider the algorithm defined by {eq}`eq:rls_beta`–{eq}`eq:rls_R` with
-projection rule {eq}`eq:projection`. Choose $0 < K' < K < \infty$ and assume
+Consider the algorithm defined by {eq}`eq:rls_beta`-{eq}`eq:rls_R` with
+projection rule {eq}`eq:projection`.
+
+Choose $0 < K' < K < \infty$ and assume
 
 1. {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`, {prf:ref}`ass-ms-a3`,
    {prf:ref}`ass-ms-a4`, and {prf:ref}`ass-ms-a5` hold;
@@ -508,7 +536,8 @@ projection rule {eq}`eq:projection`. Choose $0 < K' < K < \infty$ and assume
 Take
 $D_1 = \{(\beta, R) : |\beta - \beta_f| < K\}$ and
 $D_2 = \{(\beta, R) : |\beta - \beta_f| \leq K'\}$, and let the projection
-rule retract to any value with $|\beta - \beta_f| \leq K'$.
+rule retract $\beta_t$ to any value with $|\beta - \beta_f| \leq K'$, while
+leaving $R_t = \tilde R_t$.
 
 Then $\beta_t \to \beta_f$ almost surely.
 ```
@@ -516,174 +545,18 @@ Then $\beta_t \to \beta_f$ almost surely.
 For the scalar linear examples below, condition (4) reduces to checking that
 the feedback slope of $T$ is not too strong.
 
-For the first four examples below, $T$ is linear and $M_{z_2}$ is independent
-of $\beta$, so {prf:ref}`cor-ms-exogenous` reduces the problem to the scalar
-ODE stability checks shown in the examples.
+For the first four examples below, $T$ is linear and $M_{z_2}$ is independent of $\beta$, so {prf:ref}`cor-ms-exogenous` uses $D_1$ and $D_2$ only to justify the projection argument and leaves the examples with the scalar ODE stability checks.
 
-```{note}
-In Bray's model, the state $z_{2t}=1$ is a constant regressor.
+$D_1$ and $D_2$ are therefore not economic primitives in the examples below.
 
-In the Bray-Savin and present-value examples, $z_{2t}$ is stochastic but
-exogenous, so $M_{z_2}$ is still independent of beliefs.
+They are proof devices for the projected learning algorithm: $D_1$ is the outer admissible region where the model and ODE remain well defined, and $D_2$ is the smaller region used to reset the estimates if an update tries to leave $D_1$.
 
-For the investment model with endogenous regressors, verifying the
-boundary condition on $D_1$ is much harder and may require numerical solution of
-the ODE on a grid of boundary points.
-```
+Once the corollary says suitable sets can be chosen, the examples no longer need to display them and can focus on $T$, $\beta_f$, and the local stability slope.
 
-## Computational helpers
+## Five examples
 
 We now work through five examples from {cite:t}`MarcetSargent1989jet`.
 
-Before we start, we define helper functions for the scalar simulations.
-
-The first covers Bray's constant-regressor model.
-
-The next two simulate the actual exogenous-regressor structures in the
-Bray-Savin and present-value examples.
-
-The final helper evaluates the small ODE {eq}`eq:small_ode` on the
-cumulative-gain time scale of RLS.
-
-```{code-cell} ipython3
-def simulate_rls_scalar(T_map, σ_u, β0, T_periods=500, N_paths=100,
-                        a_seq=None, seed=0):
-    """
-    Simulate the RLS recursion for z1_t = T(β_t) + u_t, z2_t = 1.
-    Returns an (N_paths, T_periods) array of belief paths.
-    """
-    rng = np.random.default_rng(seed)
-    if a_seq is None:
-        a_seq = np.ones(T_periods)
-
-    β_paths = np.empty((N_paths, T_periods))
-
-    for i in range(N_paths):
-        β = β0
-        R = 1.0
-
-        for t in range(T_periods):
-            α_t = a_seq[t]
-            z2 = 1.0
-            u_t = rng.normal(0, σ_u)
-            z1 = T_map(β) * z2 + u_t
-
-            R = R + (α_t / (t + 1)) * (z2**2 - R / α_t)
-            R = max(R, 1e-8)
-            β = β + (α_t / (t + 1)) / R * z2 * (z1 - β * z2)
-
-            β_paths[i, t] = β
-
-    return β_paths
-
-
-def simulate_bray_savin(m, a, σ_x, σ_u, β0, T_periods=500,
-                        N_paths=100, seed=0):
-    """
-    Simulate RLS for p_t = x_t (m + a β_t) + u_t.
-    Agents regress p_t on x_t.
-    """
-    rng = np.random.default_rng(seed)
-    β_paths = np.empty((N_paths, T_periods))
-
-    for i in range(N_paths):
-        β = β0
-        R = 1.0
-
-        for t in range(T_periods):
-            x_t = rng.normal(0.0, σ_x)
-            u_t = rng.normal(0.0, σ_u)
-            p_t = x_t * (m + a * β) + u_t
-            step = 1.0 / (t + 1)
-
-            R = R + step * (x_t**2 - R)
-            R = max(R, 1e-8)
-            β = β + step / R * x_t * (p_t - β * x_t)
-            β_paths[i, t] = β
-
-    return β_paths
-
-
-def simulate_present_value_rls(λ, ρ, σ_ε, β0, T_periods=500,
-                               N_paths=100, seed=0):
-    """
-    Simulate RLS for y_t = (λ β_t + 1) x_t,
-    x_t = ρ x_{t-1} + ε_t, with regression of y_t on x_{t-1}.
-    """
-    rng = np.random.default_rng(seed)
-    β_paths = np.empty((N_paths, T_periods))
-    σ_x = σ_ε / np.sqrt(1 - ρ**2)
-
-    for i in range(N_paths):
-        β = β0
-        R = 1.0
-        x_lag = rng.normal(0.0, σ_x)
-
-        for t in range(T_periods):
-            ε_t = rng.normal(0.0, σ_ε)
-            x_t = ρ * x_lag + ε_t
-            y_t = (λ * β + 1.0) * x_t
-            step = 1.0 / (t + 1)
-
-            R = R + step * (x_lag**2 - R)
-            R = max(R, 1e-8)
-            β = β + step / R * x_lag * (y_t - β * x_lag)
-            β_paths[i, t] = β
-            x_lag = x_t
-
-    return β_paths
-
-
-def solve_ode_calendar(f_ode, β0, T_periods, a_seq=None):
-    """
-    Solve dβ/dτ = f_ode(β) and evaluate it at RLS cumulative-gain time.
-    """
-    if a_seq is None:
-        a_seq = np.ones(T_periods)
-
-    periods = np.arange(T_periods + 1)
-    gains = a_seq / np.arange(1, T_periods + 1)
-    ode_time = np.concatenate(([0.0], np.cumsum(gains)))
-    sol = solve_ivp(
-        lambda τ, y: [f_ode(y[0])],
-        (0.0, ode_time[-1]),
-        [β0],
-        t_eval=ode_time,
-        method='RK45',
-        max_step=0.05
-    )
-    return periods, sol.y[0]
-
-
-def plot_scalar_drift(ax, β_grid, drift, β_f, color):
-    """Plot the one-dimensional learning drift T(β) - β."""
-    ax.plot(β_grid, drift, color=color, lw=2)
-    ax.axhline(0, color='black', lw=1.5)
-    ax.axvline(β_f, color='red', ls='--', lw=2,
-               label=f'$\\beta_f = {β_f:.2f}$')
-    ax.fill_between(β_grid, drift, 0, where=(drift > 0),
-                    color=color, alpha=0.12)
-    ax.fill_between(β_grid, drift, 0, where=(drift < 0),
-                    color=color, alpha=0.12)
-
-    for β_arrow in np.linspace(β_grid[20], β_grid[-20], 7):
-        dβ = np.interp(β_arrow, β_grid, drift)
-        if abs(dβ) > 1e-10:
-            ax.annotate(
-                '', xy=(β_arrow + 0.25 * np.sign(dβ), 0),
-                xytext=(β_arrow, 0),
-                arrowprops=dict(arrowstyle='->', color=color, lw=1.8)
-            )
-
-    ax.set_xlabel('$\\beta$')
-    ax.set_ylabel('$T(\\beta) - \\beta$')
-    ax.legend(fontsize=9)
-
-
-T_sim = 400
-N_sim = 80
-```
-
 Each substantive learning example follows the same template.
 
 1. Write down the economic equations that determine the equilibrium.
@@ -725,8 +598,8 @@ stable.
 {prf:ref}`cor-ms-exogenous` then implies that recursive least squares converges almost surely
 to the true law of motion.
 
-This case shows that the Marcet-Sargent machinery nests ordinary strong
-consistency of least squares for stable linear stochastic difference equations.
+In this no-feedback benchmark, the Marcet-Sargent result reduces to the
+standard consistency of least squares for stable linear stochastic difference equations.
 
 ## Example 2: Bray's cobweb model
 
@@ -754,7 +627,9 @@ least squares has a force pushing beliefs back toward the fixed point.
 When $b > 1$, the feedback is too strong and the same learning rule moves
 beliefs away from the REE.
 
-The mapping $T$ is simply $T(\beta) = a + b\beta$.  The REE is
+The mapping $T$ is simply $T(\beta) = a + b\beta$.
+
+The REE is
 
 $$
 \beta_f = \frac{a}{1 - b} , \quad b \neq 1 .
@@ -783,6 +658,91 @@ The rational expectations price forecast is then $\beta_f=2.5$.
 The three panels show the noisy RLS paths, the small-ODE approximation, and
 the learning drift $T(\beta)-\beta$.
 
+The next cell sets up the simulator and plotting helpers used in the Bray figures.
+
+```{code-cell} ipython3
+def simulate_rls_scalar(T_map, σ_u, β0, T_periods=500, N_paths=100,
+                        a_seq=None, seed=0):
+    """
+    Simulate the RLS recursion for z1_t = T(β_t) + u_t, z2_t = 1.
+    Returns an (N_paths, T_periods) array of belief paths.
+    """
+    rng = np.random.default_rng(seed)
+    if a_seq is None:
+        a_seq = np.ones(T_periods)
+
+    β_paths = np.empty((N_paths, T_periods))
+
+    for i in range(N_paths):
+        β = β0
+        R = 1.0
+
+        for t in range(T_periods):
+            α_t = a_seq[t]
+            z2 = 1.0
+            u_t = rng.normal(0, σ_u)
+            z1 = T_map(β) * z2 + u_t
+
+            R_old = max(R, 1e-8)
+            β = β + (α_t / (t + 1)) / R_old * z2 * (z1 - β * z2)
+            R = R + (α_t / (t + 1)) * (z2**2 - R / α_t)
+            R = max(R, 1e-8)
+
+            β_paths[i, t] = β
+
+    return β_paths
+
+
+def solve_ode_calendar(f_ode, β0, T_periods, a_seq=None):
+    """
+    Solve dβ/dτ = f_ode(β) and evaluate it at RLS cumulative-gain time.
+    """
+    if a_seq is None:
+        a_seq = np.ones(T_periods)
+
+    periods = np.arange(T_periods + 1)
+    gains = a_seq / np.arange(1, T_periods + 1)
+    ode_time = np.concatenate(([0.0], np.cumsum(gains)))
+    sol = solve_ivp(
+        lambda τ, y: [f_ode(y[0])],
+        (0.0, ode_time[-1]),
+        [β0],
+        t_eval=ode_time,
+        method='RK45',
+        max_step=0.05
+    )
+    return periods, sol.y[0]
+
+
+def plot_scalar_drift(ax, β_grid, drift, β_f, color):
+    """Plot the one-dimensional learning drift T(β) - β."""
+    ax.plot(β_grid, drift, color=color, lw=2)
+    ax.axhline(0, color='black', lw=1.5)
+    ax.axvline(β_f, color='red', ls='--', lw=2,
+               label=f'$\\beta_f = {β_f:.2f}$')
+    ax.fill_between(β_grid, drift, 0, where=(drift > 0),
+                    color=color, alpha=0.12)
+    ax.fill_between(β_grid, drift, 0, where=(drift < 0),
+                    color=color, alpha=0.12)
+
+    for β_arrow in np.linspace(β_grid[20], β_grid[-20], 7):
+        dβ = np.interp(β_arrow, β_grid, drift)
+        if abs(dβ) > 1e-10:
+            ax.annotate(
+                '', xy=(β_arrow + 0.25 * np.sign(dβ), 0),
+                xytext=(β_arrow, 0),
+                arrowprops=dict(arrowstyle='->', color=color, lw=1.8)
+            )
+
+    ax.set_xlabel('$\\beta$')
+    ax.set_ylabel('$T(\\beta) - \\beta$')
+    ax.legend(fontsize=9)
+
+
+T_sim = 400
+N_sim = 80
+```
+
 ```{code-cell} ipython3
 ---
 mystnb:
@@ -984,6 +944,37 @@ It uses $a=0.7$.
 The drift panel shows that mistaken slopes are pushed back toward the REE
 slope rather than amplified.
 
+The next cell adapts the RLS simulator to the supply-shifter regression.
+
+```{code-cell} ipython3
+def simulate_bray_savin(m, a, σ_x, σ_u, β0, T_periods=500,
+                        N_paths=100, seed=0):
+    """
+    Simulate RLS for p_t = x_t (m + a β_t) + u_t.
+    Agents regress p_t on x_t.
+    """
+    rng = np.random.default_rng(seed)
+    β_paths = np.empty((N_paths, T_periods))
+
+    for i in range(N_paths):
+        β = β0
+        R = 1.0
+
+        for t in range(T_periods):
+            x_t = rng.normal(0.0, σ_x)
+            u_t = rng.normal(0.0, σ_u)
+            p_t = x_t * (m + a * β) + u_t
+            step = 1.0 / (t + 1)
+
+            R_old = max(R, 1e-8)
+            β = β + step / R_old * x_t * (p_t - β * x_t)
+            R = R + step * (x_t**2 - R)
+            R = max(R, 1e-8)
+            β_paths[i, t] = β
+
+    return β_paths
+```
+
 ```{code-cell} ipython3
 ---
 mystnb:
@@ -1111,9 +1102,48 @@ The simulation uses $\lambda=0.8$ and $\rho=0.9$, so $\lambda\rho=0.72$.
 This means the feedback from expectations is strong enough to matter but still
 less than one.
 
+The ODE panel below uses $\tau$ itself on the horizontal axis, where $\tau$ is the total step size accumulated by least squares.
+
 The code simulates the autoregressive fundamental $x_t$ and updates an OLS
 regression of $y_t$ on $x_{t-1}$.
 
+The next cell simulates the autoregressive fundamental and the matching RLS update.
+
+It keeps the covariance estimate $R_t$ bounded away from zero, mirroring the projection idea above and avoiding misleading early-sample explosions.
+
+```{code-cell} ipython3
+def simulate_present_value_rls(λ, ρ, σ_ε, β0, T_periods=500,
+                               N_paths=100, seed=0):
+    """
+    Simulate RLS for y_t = (λ β_t + 1) x_t,
+    x_t = ρ x_{t-1} + ε_t, with regression of y_t on x_{t-1}.
+    """
+    rng = np.random.default_rng(seed)  # one generator shared by all paths
+    β_paths = np.empty((N_paths, T_periods))  # row i holds path i
+    σ_x = σ_ε / np.sqrt(1 - ρ**2)  # stationary std of x_t; needs |ρ| < 1
+    R_min = 0.05 * σ_x**2  # lower bound imposed on R below
+
+    for i in range(N_paths):
+        β = β0
+        R = σ_x**2  # start R at the stationary variance of x_t
+        x_lag = rng.normal(0.0, σ_x)  # initial regressor, std σ_x
+
+        for t in range(T_periods):
+            ε_t = rng.normal(0.0, σ_ε)
+            x_t = ρ * x_lag + ε_t
+            y_t = (λ * β + 1.0) * x_t  # outcome uses the pre-update β
+            step = 1.0 / (t + 1)  # decreasing gain: 1, 1/2, 1/3, ...
+
+            R_old = max(R, R_min)
+            β = β + step / R_old * x_lag * (y_t - β * x_lag)
+            R = R + step * (x_lag**2 - R)  # tracks the regressor x_{t-1}
+            R = max(R, R_min)  # keep R bounded away from zero
+            β_paths[i, t] = β  # stores the post-update estimate
+            x_lag = x_t
+
+    return β_paths
+```
+
 ```{code-cell} ipython3
 ---
 mystnb:
@@ -1122,6 +1152,8 @@ mystnb:
     name: fig-present-value-learning-dynamics
 ---
 λ, ρ_pv, σ_pv = 0.8, 0.9, 1.0
+T_pv_sim = T_sim
+N_pv_sim = N_sim
 
 
 def T_pv(β):
@@ -1132,7 +1164,7 @@ def T_pv(β):
 
 β_paths_pv = simulate_present_value_rls(
     λ, ρ_pv, σ_pv, 0.0,
-    T_periods=T_sim, N_paths=N_sim
+    T_periods=T_pv_sim, N_paths=N_pv_sim
 )
 
 
@@ -1140,8 +1172,21 @@ def ode_pv(β):
     return T_pv(β) - β
 
 
-t_ode_pv, sol_pv_low = solve_ode_calendar(ode_pv, 0.0, T_sim)
-_, sol_pv_high = solve_ode_calendar(ode_pv, 10.0, T_sim)
+τ_grid_pv = np.linspace(0.0, 30.0, 300)
+sol_pv_low = solve_ivp(
+    lambda τ, y: [ode_pv(y[0])],
+    (0.0, τ_grid_pv[-1]),
+    [0.0],
+    t_eval=τ_grid_pv,
+    max_step=0.05
+).y[0]
+sol_pv_high = solve_ivp(
+    lambda τ, y: [ode_pv(y[0])],
+    (0.0, τ_grid_pv[-1]),
+    [10.0],
+    t_eval=τ_grid_pv,
+    max_step=0.05
+).y[0]
 
 β_grid_pv = np.linspace(-1.0, 7.0, 300)
 drift_pv = np.array([ode_pv(b) for b in β_grid_pv])
@@ -1149,7 +1194,7 @@ drift_pv = np.array([ode_pv(b) for b in β_grid_pv])
 fig, axes = plt.subplots(1, 3, figsize=(15, 4.8))
 
 ax = axes[0]
-for i in range(min(30, N_sim)):
+for i in range(min(30, N_pv_sim)):
     ax.plot(β_paths_pv[i], color='seagreen', alpha=0.25, lw=2)
 ax.plot(np.mean(β_paths_pv, axis=0), color='darkgreen', lw=2,
         label='cross-path average')
@@ -1161,17 +1206,17 @@ ax.legend()
 
 ax = axes[1]
 ax.plot(
-    t_ode_pv, sol_pv_low, color='seagreen', lw=2,
-    label='ODE at $\\tau_t$, $\\beta_0=0$'
+    τ_grid_pv, sol_pv_low, color='seagreen', lw=2,
+    label='ODE, $\\beta_0=0$'
 )
 ax.plot(
-    t_ode_pv, sol_pv_high, color='steelblue', lw=2,
-    label='ODE at $\\tau_t$, $\\beta_0=10$'
+    τ_grid_pv, sol_pv_high, color='steelblue', lw=2,
+    label='ODE, $\\beta_0=10$'
 )
 ax.axhline(β_f_pv, color='red', ls='--', lw=2,
            label=f'$\\beta_f = {β_f_pv:.2f}$')
-ax.set_xlabel('$t$')
-ax.set_ylabel('$\\beta(\\tau_t)$')
+ax.set_xlabel('Accumulated step size $\\tau$')
+ax.set_ylabel('$\\beta(\\tau)$')
 ax.legend()
 
 plot_scalar_drift(
@@ -1193,7 +1238,7 @@ feeding back explosively.
 
 ## Example 5: Investment under uncertainty
 
-Let's now consider a version of the Lucas–Prescott investment model where agents learn about the
+Let's now consider a version of the Lucas-Prescott investment model where agents learn about the
 aggregate capital stock $K_t$ by regressing on $(K_{t-1}, w_{t-1})$ where $w_t$
 is an exogenous cost shock.
 
@@ -1345,14 +1390,14 @@ The framework of {cite:t}`MarcetSargent1989jet` belongs to the program of learni
 *about* a rational expectations equilibrium, as distinct from learning *within*
 one --- a distinction emphasized by {cite:t}`BrayKreps1987`.
 
-**Learning *within* an REE** (the subject of the companion lecture
+**Learning *within* an REE** (the subject of the companion lecture
 [](rational_learning_re)) refers to Bayesian inference inside a correctly
 specified model.
 
 In that setting the data-generating process is stationary from
 the agent's perspective, and Bayes' rule is fully rationalized.
 
-**Learning *about* an REE** --- the present lecture's topic --- involves an agent who
+**Learning *about* an REE** involves an agent who
 does not know the equilibrium price function.
 
 Because the agent's beliefs shift
@@ -1365,9 +1410,11 @@ As {cite:t}`MarcetSargent1989jet` put it,
 > because agents operate under the continually falsified assumption that the law of
 > motion is time invariant and known for sure.
 
-It is nonetheless a compelling learning rule because it is consistent,
-computationally tractable, and --- when E-stability holds --- converges to the
-REE despite the misspecification.
+It is nonetheless a compelling learning rule.
+
+It is consistent and computationally tractable, and it converges to the REE
+despite the misspecification when E-stability is combined with the required
+boundedness and domain-of-attraction conditions.
 
 It does not require the strong assumptions on agents' prior beliefs about the
 statistical structure of the economy that are needed for Bayesian learning.
@@ -1411,6 +1458,8 @@ Plot the cross-path average of $\beta_t$ for each $b$ value on the same figure a
 :class: dropdown
 ```
 
+Here is one solution:
+
 *Part 1.* The REE satisfies $\beta_f = T(\beta_f) = a + b\beta_f$, so
 
 $$
@@ -1469,6 +1518,8 @@ Convergence still occurs, but it takes longer.
 :class: dropdown
 ```
 
+Here is one solution:
+
 *Parts 1 and 2.*
 
 ```{code-cell} ipython3
@@ -1526,22 +1577,26 @@ The present-value model: effect of $\lambda$ on E-stability
 
 In the present-value model {eq}`eq:pv_model`, $T(\beta) = (\lambda\beta + 1)\rho$ and the Jacobian is $\mathcal{M} = \lambda\rho - 1$.
 
-1. For $\rho = 0.9$ and each of $\lambda \in \{0.5, 0.8, 0.95, 1.0\}$, compute $\beta_f$ and $\mathcal{M}$ and determine whether the REE is E-stable.
+1. For $\rho = 0.9$ and each of $\lambda \in \{0.5, 0.8, 0.95, 0.99\}$, compute $\beta_f$ and $\mathcal{M}$ and determine whether the REE is E-stable.
 
-2. For the E-stable cases, simulate 100 paths of length $T=400$ and plot the cross-path average against the ODE solution evaluated at cumulative-gain time.
+2. For the E-stable cases, simulate 100 paths of length $T=400$ and plot representative noisy paths in calendar time, then plot the limiting ODE paths in accumulated-gain time.
 
-3. At $\lambda = 1$, $\mathcal{M} = \rho - 1 < 0$ (still E-stable when $|\rho| < 1$). Simulate paths for this case and compare the convergence speed with the $\lambda = 0.5$ case, providing an intuitive explanation.
+3. At $\lambda = 0.99$, $\mathcal{M} = 0.99\rho - 1 < 0$.
+
+   Simulate paths for this near-boundary case and compare the convergence speed with the $\lambda = 0.5$ case, providing an intuitive explanation.
 ```
 
 ```{solution-start} ls_ex3
 :class: dropdown
 ```
 
+Here is one solution:
+
 *Part 1.*
 
 ```{code-cell} ipython3
 ρ_ex = 0.9
-λ_values = [0.5, 0.8, 0.95, 1.0]
+λ_values = [0.5, 0.8, 0.95, 0.99]
 
 print(f"{'lambda':>8}  {'β_f':>10}  {'M = λ*ρ-1':>15}  {'E-stable':>10}")
 print("-" * 50)
@@ -1555,44 +1610,66 @@ for lv in λ_values:
 *Parts 2 and 3.*
 
 ```{code-cell} ipython3
-fig, axes = plt.subplots(2, 2, figsize=(14, 10))
+fig, axes = plt.subplots(len(λ_values), 2, figsize=(13, 14))
 colors_λ = ['steelblue', 'darkorange', 'seagreen', 'purple']
 
-for ax, lv, col in zip(axes.flat, λ_values, colors_λ):
+for row, (lv, col) in enumerate(zip(λ_values, colors_λ)):
     def ode_fn(β, λ_val=lv):
         return (λ_val * β + 1) * ρ_ex - β
 
-    bf = ρ_ex / (1 - lv * ρ_ex) if abs(lv * ρ_ex) < 1 else None
+    bf = ρ_ex / (1 - lv * ρ_ex)
 
     paths_λ = simulate_present_value_rls(
         lv, ρ_ex, 1.0, β0=0.0,
         T_periods=400, N_paths=100, seed=3
     )
+
+    ax = axes[row, 0]
     for i in range(20):
         ax.plot(paths_λ[i], color=col, alpha=0.2, lw=2)
-    ax.plot(np.mean(paths_λ, axis=0), color=col, lw=2, label='RLS average')
-
-    if bf is not None:
-        t_o, sol_o = solve_ode_calendar(ode_fn, 0.0, 400)
-        ax.plot(t_o, sol_o, color='black', ls='--', lw=2,
-                label='ODE at $\\tau_t$')
-        ax.axhline(bf, color='red', ls=':', lw=2,
-                   label=f'$\\beta_f={bf:.2f}$')
-
-    M_jac = lv * ρ_ex - 1
-    ax.set_title(f'$\\lambda={lv}$,  $\\mathcal{{M}}={M_jac:.3f}$')
+    ax.axhline(bf, color='red', ls=':', lw=2,
+               label=f'$\\beta_f={bf:.2f}$')
+    ax.set_title(f'RLS paths, $\\lambda={lv}$')
     ax.set_xlabel('$t$')
     ax.set_ylabel('$\\beta_t$')
     ax.legend(fontsize=8)
 
+    τ_grid = np.linspace(0.0, 30.0, 300)
+    sol_low = solve_ivp(
+        lambda τ, y: [ode_fn(y[0])],
+        (0.0, τ_grid[-1]),
+        [0.0],
+        t_eval=τ_grid,
+        max_step=0.05
+    ).y[0]
+    sol_high = solve_ivp(
+        lambda τ, y: [ode_fn(y[0])],
+        (0.0, τ_grid[-1]),
+        [1.5 * bf],
+        t_eval=τ_grid,
+        max_step=0.05
+    ).y[0]
+
+    ax = axes[row, 1]
+    ax.plot(τ_grid, sol_low, color=col, lw=2,
+            label='ODE, $\\beta_0=0$')
+    ax.plot(τ_grid, sol_high, color='black', ls='--', lw=2,
+            label='ODE, high start')
+    ax.axhline(bf, color='red', ls=':', lw=2,
+               label=f'$\\beta_f={bf:.2f}$')
+    M_jac = lv * ρ_ex - 1
+    ax.set_title(f'ODE, $\\mathcal{{M}}={M_jac:.3f}$')
+    ax.set_xlabel('Accumulated step size $\\tau$')
+    ax.set_ylabel('$\\beta(\\tau)$')
+    ax.legend(fontsize=8)
+
 plt.tight_layout()
 plt.show()
 ```
 
-The dashed ODE curves use $\tau_t = \sum_{s=1}^t 1/s$, so they are on the same
-learning-time scale as RLS.
+In each row, the left panel shows finite-sample RLS paths in calendar time and the right panel shows deterministic ODE paths in accumulated-gain time.
 
-When $\lambda=1$, $\mathcal M = \rho-1 \approx -0.1$ is small in absolute
+When $\lambda=0.99$, $\mathcal M = 0.99\rho-1 \approx -0.109$ is small in absolute
 value.
 
 The ODE is nearly flat near $\beta_f$, so the restoring force is weak and
diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index 5f5dbc2c7..ddcf5728a 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -28,9 +28,9 @@ kernelspec:
 
 ## Overview
 
-This lecture explores an important question in economic theory: can agents *learn* their way to a rational expectations equilibrium?
+This lecture explores an important question in economic theory: what can agents learn inside a rational expectations equilibrium?
 
-If they can, then the rational expectations equilibrium can be justified as a dynamic attractor for learning processes.
+This question is related to, but distinct from, the adaptive-learning question of whether a rational expectations equilibrium can be justified as a dynamic attractor.
 
 The starting point is {cite:t}`BrayKreps1987`, which gives a rigorous model of Bayesian learning inside a rational expectations equilibrium.
 
@@ -40,12 +40,15 @@ Each agent knows the *statistical relationship* between prices and the underlyin
 
 But this raises a question: where does that knowledge come from?
 
-The **rational learning** approach asks whether agents who start with uncertainty about the equilibrium price function can, over time, learn it from observations of past prices.
+The **rational learning** approach of {cite:t}`BrayKreps1987` answers a narrower question.
 
-This lecture develops that idea through an asset-market model.
+Agents are uncertain about structural parameters, but the state space is enlarged so that, for each possible parameter value, the associated equilibrium price and allocation maps are already part of the model.
 
-The aim is to see what rational learning can explain, and where its limits
-appear, before turning to the computational illustration.
+Agents then use Bayes' rule to update over those possibilities as prices and returns are observed.
+
+This lecture develops that Bayesian formulation through an asset-market model.
+
+The aim is to see what rational learning can explain, and where its limits appear, before turning to the computational illustration.
 
 The discussion also connects to earlier work by {cite:t}`Bray1982`, {cite:t}`BraySavin1984`, and the rational expectations literature of {cite:t}`Radner1979`, {cite:t}`grossman1976`, and {cite:t}`Jordan1982`.
 
@@ -690,7 +693,7 @@ Write $F_t$ for the CDF of agent $U$'s posterior on $\theta^I$ at date $t$ after
 
 {prf:ref}`prop-bk-measure-convergence` yields a random CDF $F_\infty$ such that $F_t$ converges weakly to $F_\infty$, $P^U$-a.s.
 
-Three hypotheses sharpen this to concentration on the truth, corresponding to the three steps in {cite:t}`BrayKreps1987`.
+Three ingredients sharpen this to concentration on the truth, corresponding to the three steps in {cite:t}`BrayKreps1987`.
 
 ```{prf:assumption}
 :label: assum-bk-continuity
@@ -701,12 +704,24 @@ The equilibrium uninformed demand $x^U(p, F)$ is continuous in $F$ with respect
 ```{prf:assumption}
 :label: assum-bk-identification
 
-For fixed $\theta^U$ and limiting posterior $F_\infty$, the marginal distribution of the limiting price functional $p_\infty(\,\cdot\,; F_\infty, \theta^I, \theta^U)$ is strictly monotone in $\theta^I$ in the first-order-stochastic-dominance order.
+For fixed $\theta^U$ and limiting posterior $F_\infty$, the map
+
+$$
+\theta^I
+\mapsto
+\mathcal L\{p_\infty(s; F_\infty, \theta^I, \theta^U)\}
+$$
+
+is injective, where $s$ is drawn from its marginal distribution.
 
-That is, $\theta^I \neq \theta^{I\,\prime}$ implies $p_\infty(s; F_\infty, \theta^I, \theta^U)$ and $p_\infty(s; F_\infty, \theta^{I\,\prime}, \theta^U)$ have distinct CDFs when $s$ is drawn from its marginal distribution.
+A sufficient condition is strict stochastic monotonicity of this marginal price distribution in $\theta^I$.
 ```
 
-In the lecture's CARA-Normal setup, {prf:ref}`assum-bk-continuity` holds because the FOC {eq}`eq:bk-foc` defines $x^U$ as a continuous functional of $F$ under weak convergence through bounded integrals, and {prf:ref}`assum-bk-identification` holds because the equilibrium price has the form $p_t = s_t - \sigma^2 X^I_t / \theta^I$ with $X^I_t > 0$ on a full-measure set.
+In the lecture's CARA-Normal setup, these assumptions are plausible under the maintained uniqueness and regularity of the implicit demand and price equations.
+
+Continuity requires continuous dependence of the FOC root in {eq}`eq:bk-foc` on $F$, not just pointwise continuity of the integrals.
+
+Identification also uses the full equilibrium map: the equation $p_t = s_t - \sigma^2 X^I_t / \theta^I$ is informative, but $X^I_t$ itself depends on $\theta^I$, $F_t$, and the equilibrium fixed point.
 
 The IID assumption on $\{s_t\}$, already part of the model, supplies the ergodicity used in step 2 below.
 
@@ -743,7 +758,7 @@ The latter equals the distribution of $p_\infty(s; F_\infty, \theta^I, \theta^U)
 
 *Step 3: identification.*
 
-{prf:ref}`assum-bk-identification` makes the marginal distribution of $p_\infty$ a strictly monotone function of $\theta^I$ given $(F_\infty, \theta^U)$.
+{prf:ref}`assum-bk-identification` makes the marginal distribution of $p_\infty$ identify $\theta^I$ given $(F_\infty, \theta^U)$.
 
 Combined with step 2, this means $\theta^I$ is itself $H_\infty^U(p)$-measurable, so for any subinterval $[c,d] \subseteq [a,b]$ the limiting posterior satisfies $P_\infty^U(\theta^I \in [c,d]) = \mathbf 1_{\{\theta^I_{\rm true} \in [c,d]\}}$.
 
@@ -767,7 +782,9 @@ When the equilibrium price functional is discontinuous in $F$, small changes in
 
 {cite:t}`BrayKreps1987` flag this as the most delicate step in their argument.
 
-Continuity of $x^U(p, F)$ in $F$ is automatic in this lecture because the FOC integrates a bounded continuous function against $F$, but verifying it in richer market structures often requires non-trivial regularity arguments.
+Continuity of $x^U(p, F)$ in $F$ is plausible in this lecture's regular CARA-Normal case, but it also requires uniqueness of the FOC solution and continuous dependence of that solution on $F$.
+
+Verifying the same property in richer market structures often requires non-trivial regularity arguments.
 
 ### Obstacle 2: failure of identification
 
@@ -803,11 +820,13 @@ A separate obstacle arises if the true pricing relation lies outside the agent's
 
 {cite:t}`BlumeEasley1982` give a stylised version of this obstacle, and {doc}`likelihood_ratio_process_2` develops the Blume-Easley heterogeneous-beliefs model in this lecture series.
 
-Each agent entertains two competing models $\psi_n^0$ and $\psi_n^1$ over $(I_t, p_t)$, and an equilibrium can exist in which agents assign asymptotic probability one to a model that places zero probability on the actually-observed price relation.
+Each agent entertains competing conditional likelihoods for other agents' information given his own information and the price.
+
+An equilibrium can exist in which agents assign asymptotic probability one to an incorrect model that gives the observed events positive likelihood, while the true stable price relation receives zero posterior probability because that relation was absent from the prior model class.
 
-In strict rational learning the agent's prior must be supported on Bayesian-consistent models in the expanded state space, so this failure can occur only on a $P^U$-null event.
+In strict rational learning the agent's prior is supported on Bayesian-consistent models in the expanded state space, and the truth is assumed to have positive prior support.
 
-Rational learning embeds every candidate pricing relation in the prior from date zero, so any candidate with positive prior weight cannot be dominated by one with zero prior weight no matter what the data say.
+Bayes' rule can only reweight that initial model class: a pricing relation assigned zero prior probability remains impossible after any history.
 
 ## Learning within versus learning about a rational expectations equilibrium
 
@@ -843,7 +862,7 @@ The literature on learning *about* rational expectations equilibria, beginning w
 
 The companion lecture {doc}`ls_learning` develops this least-squares-learning framework in self-referential models and traces the resulting dynamics through the associated ordinary differential equation.
 
-Those rules are computationally tractable and converge in important examples, but they are *not* Bayesian-optimal under any correctly specified prior.
+Those rules are computationally tractable and converge in important examples, but they are not the Bayesian update implied by the fully specified rational-learning equilibrium prior.
 
 ## Summary
 
@@ -853,7 +872,7 @@ Posterior assessments converge by bounded martingale convergence ({prf:ref}`prop
 
 Concentration on the truth additionally requires continuity ({prf:ref}`assum-bk-continuity`), ergodicity, and identification ({prf:ref}`assum-bk-identification`); each obstacle above is a failure of one of these.
 
-The simulation confirms both conclusions: the posterior on $\theta^I$ collapses to $\theta^I_{\rm true}$ and the equilibrium informed trade reaches its full-information value.
+The simulation illustrates both conclusions: the posterior on $\theta^I$ collapses toward $\theta^I_{\rm true}$ and the equilibrium informed trade approaches its full-information value.
 
 Rational learning describes the limits of Bayesian inference *given* the equilibrium structure; adaptive learning, in {doc}`ls_learning`, describes how that structure can be learned in the first place.
 
@@ -884,6 +903,8 @@ which peaks near $\theta = 3.1$.
 :class: dropdown
 ```
 
+Here is one solution:
+
 ```{code-cell} ipython3
 res_uniform = simulate(**params)
 
@@ -934,6 +955,8 @@ The sensitivity $|\partial s_t/\partial \theta| = \sigma^2 X^I_t/\theta^2$ depen
 :class: dropdown
 ```
 
+Here is one solution:
+
 ```{code-cell} ipython3
 fig, ax = plt.subplots(figsize=(10, 5))
 for θ_val in [0.8, 2.0, 3.5]:
@@ -969,13 +992,15 @@ But $\sigma^2$ also scales the price intercept in {eq}`eq:bk-price`, so price di
 
 2. Plot the posterior variance on a log scale for each $\sigma^2$.
 
-3. Which effect dominates? Explain in terms of the signal-to-noise ratio for inferring $\theta^I$ from the price.
+3. Explain which effect dominates in terms of the signal-to-noise ratio for inferring $\theta^I$ from the price.
 ```
 
 ```{solution-start} rle_ex3
 :class: dropdown
 ```
 
+Here is one solution:
+
 ```{code-cell} ipython3
 fig, ax = plt.subplots(figsize=(10, 5))
 for σ2_val in [0.25, 1.0, 4.0]:
@@ -993,9 +1018,37 @@ plt.show()
 
 The posterior variance falls *faster* for larger $\sigma^2$.
 
-The reason is visible in the price equation $p_t = s_t - \sigma^2 X^I_t/\theta^I$: the price gap between two candidate $\theta$ values grows linearly with $\sigma^2$, while the conditional variance of the implied signal $g(s\mid r)$ is bounded above by $\tau^2$.
+To see why, write $\nu=\sigma^2$ and hold the realized trade $X^I_t$ fixed.
+
+Two nearby values of $\theta$ imply signals separated by approximately
+
+$$
+\left|\frac{\partial s_t(\theta)}{\partial \theta}\right|
+=
+\frac{\nu |X^I_t|}{\theta^2}.
+$$
+
+The likelihood compares these implied signals using $g(s\mid r_t)$, whose conditional variance is
+
+$$
+\operatorname{Var}(s_t\mid r_t)
+=
+\frac{\nu \tau^2}{\nu+\tau^2}.
+$$
+
+Thus the local signal-to-noise ratio for distinguishing nearby $\theta$ values is proportional to
+
+$$
+\frac{\nu |X^I_t|/\theta^2}
+     {\sqrt{\nu\tau^2/(\nu+\tau^2)}}
+=
+\frac{|X^I_t|}{\theta^2}
+\sqrt{\frac{\nu(\nu+\tau^2)}{\tau^2}}.
+$$
+
+This ratio rises with $\nu$ when $\tau^2$ is fixed.
 
-The Grossman-Stiglitz-style trade thus becomes more revealing about $\theta^I$ as the return shock $\epsilon_t$ becomes more volatile, even though each return is individually noisier.
+The price-revelation effect therefore dominates the extra return noise in this experiment.
 
 ```{solution-end}
 ```

From 4bd5259ce3aa5de8139d4bc0ec4d1562e85a0255 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Tue, 2 Jun 2026 23:00:01 +1000
Subject: [PATCH 23/25] updates

---
 lectures/_static/quant-econ.bib    |  89 ++++++++-------
 lectures/long_run_risk_operator.md | 173 ++++++++++++++++-------------
 lectures/ls_learning.md            |  42 ++++---
 lectures/rational_learning_re.md   |  42 ++++---
 4 files changed, 184 insertions(+), 162 deletions(-)

diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 71ceac6b9..862efab0e 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -214,7 +214,8 @@ @article{Epstein_Zin1991
   volume    = {99},
   number    = {2},
   pages     = {263--286},
-  year      = {1991}
+  year      = {1991},
+  doi       = {10.1086/261750}
 }
 
 @article{Duffie_Epstein1992a,
@@ -304,7 +305,8 @@ @article{Lucas_Stokey1984
   volume    = {32},
   number    = {1},
   pages     = {139--171},
-  year      = {1984}
+  year      = {1984},
+  doi       = {10.1016/0022-0531(84)90079-6}
 }
 
 @book{Karlin_Taylor1981,
@@ -2865,7 +2867,7 @@ @article{Evans1985
   volume  = {100},
   number  = {4},
   pages   = {1217--1233},
-  doi     = {10.2307/1885377}
+  doi     = {10.2307/1885681}
 }
 
 @article{FourgeaudGourieroux1986,
@@ -2876,7 +2878,7 @@ @article{FourgeaudGourieroux1986
   volume  = {54},
   number  = {4},
   pages   = {845--868},
-  doi     = {10.2307/1912836}
+  doi     = {10.2307/1912839}
 }
 
 @article{MarcetSargent1989,
@@ -3201,7 +3203,9 @@ @article{Townsend1983
   title   = {Forecasting the Forecasts of Others},
   journal = {Journal of Political Economy},
   volume  = {91},
-  pages   = {546-588}
+  number  = {4},
+  pages   = {546--588},
+  doi     = {10.1086/261166}
 }
 
 @article{tobin1992old,
@@ -3900,10 +3904,11 @@ @incollection{BrayKreps1987
   title     = {Rational Learning and Rational Expectations},
   booktitle = {Arrow and the Ascent of Modern Economic Theory},
   editor    = {Feiwel, George R.},
-  publisher = {New York University Press},
-  address   = {New York},
+  publisher = {Palgrave Macmillan},
+  address   = {London},
   year      = {1987},
-  pages     = {597--625}
+  pages     = {597--625},
+  doi       = {10.1007/978-1-349-07239-2_19}
 }
 
 @article{Bray1982,
@@ -3913,17 +3918,19 @@ @article{Bray1982
   year    = {1982},
   volume  = {26},
   number  = {2},
-  pages   = {318--339}
+  pages   = {318--339},
+  doi     = {10.1016/0022-0531(82)90007-2}
 }
 
-@article{BraySavin1984,
+@article{BraySavin1986,
   author  = {Bray, Margaret M. and Savin, N. E.},
   title   = {Rational Expectations Equilibria, Learning and Model Specification},
   journal = {Econometrica},
   year    = {1986},
   volume  = {54},
   number  = {5},
-  pages   = {1129--1160}
+  pages   = {1129--1160},
+  doi     = {10.2307/1912325}
 }
 
 @article{Radner1979,
@@ -3933,7 +3940,8 @@ @article{Radner1979
   year    = {1979},
   volume  = {47},
   number  = {3},
-  pages   = {655--678}
+  pages   = {655--678},
+  doi     = {10.2307/1910413}
 }
 
 @article{Jordan1982,
@@ -3943,7 +3951,8 @@ @article{Jordan1982
   year    = {1982},
   volume  = {26},
   number  = {2},
-  pages   = {224--243}
+  pages   = {224--243},
+  doi     = {10.1016/0022-0531(82)90002-3}
 }
 
 @article{Jordan1982b,
@@ -3953,7 +3962,8 @@ @article{Jordan1982b
   year    = {1982},
   volume  = {28},
   number  = {1},
-  pages   = {19--31}
+  pages   = {19--31},
+  doi     = {10.1016/0022-0531(82)90089-8}
 }
 
 @article{Admati1985,
@@ -3963,7 +3973,8 @@ @article{Admati1985
   year    = {1985},
   volume  = {53},
   number  = {3},
-  pages   = {629--658}
+  pages   = {629--657},
+  doi     = {10.2307/1911659}
 }
 
 @article{GrossmanStiglitz1980,
@@ -3973,14 +3984,19 @@ @article{GrossmanStiglitz1980
   year    = {1980},
   volume  = {70},
   number  = {3},
-  pages   = {393--408}
+  pages   = {393--408},
+  doi     = {10.2307/1805228}
 }
 
-@article{GrossmanSonnenschein1982,
-  author  = {Grossman, Sanford J. and Sonnenschein, Hugo},
-  title   = {Notes on Expectations Equilibria in Bayesian Settings},
-  journal = {Working Paper},
-  year    = {1982}
+@article{AndersonSonnenschein1982,
+  author  = {Anderson, Robert M. and Sonnenschein, Hugo},
+  title   = {On the Existence of Rational Expectations Equilibrium},
+  journal = {Journal of Economic Theory},
+  year    = {1982},
+  volume  = {26},
+  number  = {2},
+  pages   = {261--278},
+  doi     = {10.1016/0022-0531(82)90004-7}
 }
 
 @article{BlumeEasley1982,
@@ -3990,7 +4006,8 @@ @article{BlumeEasley1982
   year    = {1982},
   volume  = {26},
   number  = {2},
-  pages   = {340--351}
+  pages   = {340--351},
+  doi     = {10.1016/0022-0531(82)90008-4}
 }
 
 @article{Frydman1982,
@@ -4003,43 +4020,35 @@ @article{Frydman1982
   pages   = {652--668}
 }
 
-@article{Lewis1981,
-  author  = {Lewis, Karen K.},
+@article{Grossman1981,
+  author  = {Grossman, Sanford J.},
   title   = {An Introduction to the Theory of Rational Expectations under Asymmetric Information},
   journal = {Review of Economic Studies},
   year    = {1981},
   volume  = {48},
   number  = {4},
-  pages   = {541--560}
-}
-
-@article{Townsend1983b,
-  author  = {Townsend, Robert M.},
-  title   = {Forecasting the Forecasts of Others},
-  journal = {Journal of Political Economy},
-  year    = {1983},
-  volume  = {91},
-  number  = {4},
-  pages   = {546--588}
+  pages   = {541--559},
+  doi     = {10.2307/2297195}
 }
 
-@article{ArrowGreen1973,
+@techreport{ArrowGreen1973,
   author      = {Arrow, Kenneth J. and Green, Jerry R.},
   title       = {Notes on Expectations Equilibria in Bayesian Settings},
-  journal     = {Working Paper in Economics},
   year        = {1973},
+  type        = {Working Paper},
   number      = {33},
   institution = {Institute for Mathematical Studies in the Social Sciences, Stanford University}
 }
 
-@article{Kobayashi1977,
-  author  = {Kobayashi, Tetsuya},
+@article{Kreps1977,
+  author  = {Kreps, David M.},
   title   = {A Note on Fulfilled Expectations Equilibria},
   journal = {Journal of Economic Theory},
   year    = {1977},
   volume  = {14},
   number  = {1},
-  pages   = {32--43}
+  pages   = {32--43},
+  doi     = {10.1016/0022-0531(77)90083-7}
 }
 
 @article{Breeden1979,
diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index 03f962837..79e43c1f2 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -32,9 +32,9 @@ Standard short-horizon asset pricing tells us how investors are compensated
 for tiny, instantaneous exposures to shocks, the *short end* of the term
 structure of risk prices.
 
-But many of the most interesting  topics about asset pricing questions -- e.g.,  the equity
+But many of the most interesting asset pricing questions (e.g., the equity
 premium puzzle, the slope of the yield curve, the prices of long-dated
-options -- are about the  *long end* of the term structure of risk prices.
+options) concern the *long end* of the term structure of risk prices.
 
 This lecture studies the long end using the operator approach of
 {cite:t}`HansenScheinkman2009`.
@@ -89,7 +89,7 @@ This lecture is closely related to the advanced lecture
 permanent-transitory decomposition for additive and multiplicative
 functionals in a discrete-time linear-Gaussian setting.
 
-Reading these two lextures together is a good way to learn about representations of long-run risks
+Reading these two lectures together is a good way to learn about representations of long-run risks
 in both continuous and discrete time.
 ```
 
@@ -114,10 +114,9 @@ The plan of this lecture is to:
    the local risk prices appropriate for short-horizon asset
    pricing.
 
-A recurring theme will be that when shocks move persistent state variables,  local and long-run risk prices can differ markedly.
+A recurring theme is that shocks to persistent state variables drive a wedge between local and long-run risk prices.
 
-That diifference underlies the
-mechanism that lets long-run risk models like {cite:t}`Bansal_Yaron_2004`
+That wedge is the mechanism by which long-run risk models like {cite:t}`Bansal_Yaron_2004`
 generate large equity premia.
 
 We start with the following imports
@@ -248,13 +247,13 @@ Why is this a useful  condition to require?
 
 Think of $M_t = S_t$, a stochastic discount factor.
 
-The date-$0$ value of a date-$t$ payoff $\Pi_t$ is $E[S_t\Pi_t \mid \mathcal F_0]$.
+The date-$0$ value of a date-$t$ payoff $\Pi_t$ is $\mathbb{E}[S_t\Pi_t \mid \mathcal F_0]$.
 
 If we instead buy this payoff at intermediate date $\tau$, its date-$\tau$
 price must be
 
 $$
-    E\left[\frac{S_t}{S_\tau}\Pi_t \,\Big|\, \mathcal F_\tau\right].
+    \mathbb{E}\left[\frac{S_t}{S_\tau}\Pi_t \,\Big|\, \mathcal F_\tau\right].
 $$
 
 For the price to depend only on the current Markov state $X_\tau$ (and not on
@@ -289,7 +288,7 @@ $$
 ```
 
 So exponentials of additive functionals are exactly the strictly positive
-multiplicative functionals 
+multiplicative functionals.
 
 In our jump-diffusion setting, a useful parameterization is
 
@@ -335,7 +334,7 @@ A multiplicative functional $M$ together with the Markov process $X$ defines
 a **valuation operator** for each horizon $t$:
 
 $$
-    \mathbb M_t \psi(x) = E\left[M_t \psi(X_t) \mid X_0 = x\right] .
+    \mathbb M_t \psi(x) = \mathbb{E}\left[M_t \psi(X_t) \mid X_0 = x\right] .
 $$
 
 You should read $\mathbb M_t \psi(x)$ as "the date-$0$ value, starting from
@@ -362,7 +361,7 @@ semigroup** is the family of operators
 $$
     \mathbb M_t \psi(x)
     =
-    E\left[M_t \psi(X_t) \mid X_0 = x\right].
+    \mathbb{E}\left[M_t \psi(X_t) \mid X_0 = x\right].
 $$ (eq:m-semigroup)
 ```
 
@@ -400,7 +399,7 @@ value cash flows that grow stochastically over time.
 :label: lrr-def-stochastic-discount-factor
 
 A **stochastic discount factor** $S$ is a positive multiplicative functional
-for which $E[S_t Z_t \mid X_0=x]$ is the date-$0$ value of an
+for which $\mathbb{E}[S_t Z_t \mid X_0=x]$ is the date-$0$ value of an
 $\mathcal F_t$-measurable payoff $Z_t$.
 ```
 
@@ -430,7 +429,7 @@ $$
     \qquad
     \mathbb Q_t \psi(x)
     =
-    E\left[G_t S_t \psi(X_t) \mid X_0=x\right].
+    \mathbb{E}\left[G_t S_t \psi(X_t) \mid X_0=x\right].
 $$
 
 ```{prf:definition} Cash-Flow Valuation Semigroup
@@ -581,10 +580,9 @@ The eigenvalue calculations below describe the other end.
 So far we have a family of operators $\{\mathbb M_t\}_{t \geq 0}$, one for each
 horizon $t$.
 
-That is more information than we can analyze directly.
+That is more information than we can use directly.
 
-Actually,  what we really
-care about is the behaviour of $\mathbb M_t \psi$ as $t \to \infty$.
+What we care about is the long-run behaviour: how $\mathbb M_t \psi$ grows or decays as $t \to \infty$.
 
 The **generator** $\mathbb A$ compresses the entire semigroup into one
 time-independent operator on the state space.
@@ -606,14 +604,12 @@ one-period valuation operator
 $$
     K\psi(x)
     =
-    E\left[M_1 \psi(X_1) \mid X_0=x\right].
+    \mathbb{E}\left[M_1 \psi(X_1) \mid X_0=x\right].
 $$
 
-Iterating gives the $n$-period operator $K^n$ 
+Iterating gives the $n$-period operator $K^n$.
 
-This is exactly the logic by which a
-transition matrix $P$ produces $n$-step probabilities through $P^n$, except
-that $K$ also carries the payoff weight $M_1$.
+This parallels the logic of a transition matrix: just as $P^n$ gives $n$-step probabilities, $K^n$ gives $n$-period values, with the one-period payoff weight folded in at each step.
 
 So one local object, $K$, controls the entire horizon-indexed family.
 
@@ -632,7 +628,7 @@ payoff.
 By multiplicativity,
 
 $$
-    E\left[
+    \mathbb{E}\left[
         M_{n+1}\psi(X_{n+1}) - M_n \psi(X_n)
         \mid \mathcal F_n
     \right]
@@ -677,7 +673,7 @@ $$
 ```{note}
 When $M \equiv 1$, the
 multiplicative semigroup reduces to the standard Markov transition semigroup
-$\mathbb M_t \psi(x) = E[\psi(X_t) \mid X_0=x]$, and $\mathbb A$ becomes the
+$\mathbb M_t \psi(x) = \mathbb{E}[\psi(X_t) \mid X_0=x]$, and $\mathbb A$ becomes the
 familiar infinitesimal generator $\mathcal L$ of $X$ from textbook stochastic
 calculus.
 
@@ -716,7 +712,7 @@ martingale property of $\hat M$.
 
 ### Extended generator
 
-There is a qualification to  the limit definition above.
+The limit definition above needs a qualification.
 
 To make the limit $h \downarrow 0$ rigorous, the textbook definition
 requires $(\mathbb M_h\psi - \psi)/h$ to converge to $\mathbb A\psi$ in a
@@ -787,7 +783,7 @@ state.
 * When $M \equiv 1$, the definition reduces to Dynkin's formula for the
   standard Markov generator
   $\mathcal L \psi(x)
-  = \lim_{t \downarrow 0} t^{-1}\bigl[E\psi(X_t) - \psi(x)\bigr]$.
+  = \lim_{t \downarrow 0} t^{-1}\bigl[\mathbb{E}\psi(X_t) - \psi(x)\bigr]$.
 
 * When $X$ is a jump diffusion, Itô's formula applied to $M_t\psi(X_t)$
   produces the closed-form expression for $\mathbb A\psi$ in {eq}`eq:extended-generator` below.
@@ -957,7 +953,7 @@ $\hat M_t$:
 $$
     \widehat{\Pr}(F \mid X_0=x)
     =
-    E[\hat M_t \mathbf 1_F \mid X_0=x],
+    \mathbb{E}[\hat M_t \mathbf 1_F \mid X_0=x],
     \qquad F \in \mathcal F_t.
 $$
 ```
@@ -970,14 +966,14 @@ We close this gap by adopting Assumption 6.1 of
 {cite:t}`HansenScheinkman2009`:
 
 > The local martingale $\hat M$ defined in {eq}`eq:mhat` is a martingale,
-> i.e. $E[\hat M_t \mid X_0 = x] = 1$ for every $t \geq 0$ and $x$.
+> i.e. $\mathbb{E}[\hat M_t \mid X_0 = x] = 1$ for every $t \geq 0$ and $x$.
 
 We carry this assumption from here on.
 
 Even without closing the gap we still get one-sided control.
 
 Since $\hat M$ is nonnegative with $\hat M_0 = 1$, it is a supermartingale,
-so $E[\hat M_t \mid X_0=x] \leq 1$.
+so $\mathbb{E}[\hat M_t \mid X_0=x] \leq 1$.
 
 Taking expectations in {eq}`eq:mhat`,
 
@@ -988,7 +984,7 @@ $$
 so $\rho$ is at least an upper bound on the long-run growth rate of
 $\mathbb M_t \phi$.
 
-When $\hat M$ is in fact a martingale, $E\hat M_t = 1$, the inequality
+When $\hat M$ is in fact a martingale, $\mathbb{E}[\hat M_t] = 1$, the inequality
 becomes an equality, and the local condition $\mathbb A\phi = \rho\phi$ lifts
 to the semigroup eigenvalue equation
 
@@ -1032,9 +1028,7 @@ for every $\psi$ in the $L^\infty$ domain of $\hat{\mathbb A}$.
 
 $\hat\varsigma$ is the candidate long-run distribution.
 
-If
-it doesn't exist, the twisted process has no steady state for $X_t$ to settle
-into, and the long-run limit cannot be expressed as a state-space integral.
+Without it, the twisted process has no steady state, and the long-run limit cannot be expressed as a state-space integral.
 
 *Condition 2: every important region is reachable.*
 
@@ -1078,14 +1072,12 @@ $$
 
 Reachability (Condition 2) is not enough.
 
-A region
-might be reachable but visited only with small probability, so time averages
-fail to converge to $\hat\varsigma$-averages. 
+A set may be reachable but visited with vanishingly small probability, so time averages fail to converge to $\hat\varsigma$-averages.
 
 Harris recurrence is the
 continuous-state counterpart to a "recurrent state" in a finite chain.
 
-Bundling these together:
+Collecting the three conditions:
 
 ```{prf:definition} Stochastically Stable Twisted Process
 :label: lrr-def-stochastic-stability
@@ -1111,9 +1103,7 @@ $$ (eq:long-run-limit)
 
 Read this as follows:
 
-* The factor $\exp(\rho t)$ captures the exponential growth or decay of the
-  semigroup. 
-  - After we divide it, what remains has a finite limit.
+* The factor $\exp(\rho t)$ captures the exponential growth or decay of the semigroup; once we divide it out, what remains has a finite limit.
 * The state dependence in that limit is *entirely* captured by $\phi(x)$.
 * The scalar $\int (\psi/\phi)\, d\hat\varsigma$ is the **long-run intensity**
   of the payoff $\psi$, weighted by $1/\phi$ and averaged against the
@@ -1164,10 +1154,9 @@ explosive twisted process.
 
 ## A finite-state Markov chain
 
-Now we are all set to apply the framework to a concrete example!
+We now apply the framework to a concrete example.
 
-We start with the simplest possible
-case: a finite-state Markov chain.
+We start with the simplest case: a finite-state Markov chain.
 
 For background on finite Markov chains in discrete time, see
 {doc}`finite_markov`. 
@@ -1413,6 +1402,12 @@ The numerical values converge to the limit; the next plot shows the same
 convergence pictorially.
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Convergence of the rescaled semigroup to its long-run limit
+    name: fig-lrr-semigroup-convergence
+---
 t_grid = np.linspace(0.01, 80, 400)
 rescaled = np.array([np.exp(-ρ * t) * expm(t * A) @ ψ for t in t_grid])
 
@@ -1423,7 +1418,6 @@ ax.axhline(limit[0], color="C0", ls="--", lw=1, alpha=0.7)
 ax.axhline(limit[1], color="C1", ls="--", lw=1, alpha=0.7)
 ax.set_xlabel("$t$")
 ax.set_ylabel(r"$e^{-\rho t}\,(\mathbb{M}_t \psi)(x)$")
-ax.set_title("Convergence of the rescaled semigroup to its long-run limit")
 ax.legend()
 plt.show()
 ```
@@ -1485,6 +1479,12 @@ boom-to-recession multiplier fixed and trace out $\rho$ as the
 recession-to-boom multiplier varies.
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Jumps and the long-run growth rate
+    name: fig-lrr-jumps-eigenvalue
+---
 κ_grid = np.linspace(-0.5, 0.5, 100)
 ρ_grid = np.empty_like(κ_grid)
 
@@ -1500,7 +1500,6 @@ ax.axhline(ρ, color="black", ls="--", lw=1)
 ax.axvline(0, color="black", ls=":", lw=1)
 ax.set_xlabel("jump log multiplier for recession to boom")
 ax.set_ylabel("principal eigenvalue")
-ax.set_title("Jumps and the Long-Run Growth Rate")
 plt.show()
 ```
 
@@ -1941,7 +1940,7 @@ print(f"long-run zero-coupon yield = {-ρ_s:.4f}")
 ```
 
 The long-run zero-coupon yield $-\rho_s$ represents the asymptotic decay
-rate in the SDF expectation $E[S_t]$.
+rate in the SDF expectation $\mathbb{E}[S_t]$.
 
 We can also check that the rejected root for $c_f$ would have produced a
 non-stationary twisted process, a clear example of stochastic stability
@@ -2020,7 +2019,7 @@ error below is just floating-point round-off.
 
 The second, substantive, check is whether the eigenpair $(\rho,\phi)$ we
 solved for really makes $\hat M$ a martingale, which we approximate by
-computing $E[\hat M_t]$ across many simulated paths.
+computing $\mathbb{E}[\hat M_t]$ across many simulated paths.
 
 ```{code-cell} ipython3
 def brownian_increments(n, dt, seed=1234):
@@ -2094,7 +2093,7 @@ print(f"algebraic identity error = {identity_error:.2e}")
 
 The error above is up to machine precision, as expected.
 
-Next we estimate $E[\hat M_t \mid X_0 = \bar x]$ over a Monte Carlo sample
+Next we estimate $\mathbb{E}[\hat M_t \mid X_0 = \bar x]$ over a Monte Carlo sample
 of paths.
 
 If $\hat M$ is a martingale, the population mean is exactly $1$ at every
@@ -2135,27 +2134,33 @@ for t_check in [1.0, 5.0, 10.0, 20.0]:
 ```
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Affine state paths and multiplicative factorization
+    name: fig-lrr-factorization-paths
+---
 fig, axes = plt.subplots(2, 2, figsize=(12, 8))
 
 axes[0, 0].plot(t, Xf)
-axes[0, 0].set_title("$X_t^f$")
+axes[0, 0].set_ylabel("$X_t^f$")
 axes[0, 0].set_xlabel("$t$")
 
 axes[0, 1].plot(t, Xo)
-axes[0, 1].set_title("$X_t^o$")
+axes[0, 1].set_ylabel("$X_t^o$")
 axes[0, 1].set_xlabel("$t$")
 
 axes[1, 0].plot(t, M, label="$M_t$")
 axes[1, 0].plot(t, np.exp(ρ_s * t) * M_hat * transient,
                 "--", label="factorization")
-axes[1, 0].set_title("Multiplicative Factorization")
+axes[1, 0].set_ylabel("multiplicative factorization")
 axes[1, 0].set_xlabel("$t$")
 axes[1, 0].legend()
 
 axes[1, 1].plot(t, np.exp(ρ_s * t), label="$\\exp(\\rho t)$")
 axes[1, 1].plot(t, M_hat, label="$\\hat M_t$", alpha=0.8)
 axes[1, 1].plot(t, transient, label="$\\phi(X_0)/\\phi(X_t)$", alpha=0.8)
-axes[1, 1].set_title("Three Components")
+axes[1, 1].set_ylabel("three components")
 axes[1, 1].set_xlabel("$t$")
 axes[1, 1].legend()
 
@@ -2163,19 +2168,19 @@ plt.tight_layout()
 plt.show()
 ```
 
-We can see how the three components evolve over time.
+The figure shows how the three components evolve over time.
 
 ## Long-run risk prices
 
 We can now use the factorization to compute long-run analogues of the
-instantaneous risk prices that come out of standard continuous-time asset
+instantaneous risk prices that arise in standard continuous-time asset
 pricing.
 
 We can ask:
 
 > If an investor takes on a small exposure to a shock today, how much extra
-> expected return do they need, as compensation, when we measure that
-> compensation as a long-horizon rate rather than as an instantaneous one?
+> expected return do they need when that return is measured as a long-horizon
+> rate rather than an instantaneous one?
 
 The two answers, local and long-run, need not agree.
 
@@ -2276,7 +2281,7 @@ assumed away.
 For a long zero-coupon bond,
 
 $$
-    \exp(-\rho t)\, E[S_t \mid X_0=x]
+    \exp(-\rho t)\, \mathbb{E}[S_t \mid X_0=x]
     \to
     \phi(x)
     \int \frac{1}{\phi}\, d\hat\varsigma ,
@@ -2339,6 +2344,12 @@ The next cell illustrates how persistence changes the wedge between local and
 long-run prices.
 
 ```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Persistence and long-run risk prices
+    name: fig-lrr-persistence-risk-prices
+---
 ξ_o_grid = np.array([0.10, 0.20, 0.50, 1.00, 2.00, 5.00])
 local_grid = np.full_like(ξ_o_grid, local_price_o)
 long_grid = -γ_s_o - (β_s_o / ξ_o_grid) * σ_o
@@ -2349,7 +2360,6 @@ ax.plot(ξ_o_grid, long_grid, "o-", lw=2, label="long-run")
 ax.set_xscale("log")
 ax.set_xlabel("mean-reversion speed $\\xi_o$")
 ax.set_ylabel("risk price")
-ax.set_title("Persistence and Long-Run Risk Prices")
 ax.legend()
 plt.show()
 ```
@@ -2469,8 +2479,15 @@ def valuation_eigenvalue_for_exposure(γ_v_o, γ_v_f=0.0):
     p = valuation_params_from_exposure(γ_v_o, γ_v_f)
     _, _, ρ, _ = solve_affine_eigenfunction(p)
     return ρ
+```
 
-
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Changing valuation functionals
+    name: fig-lrr-valuation-frontier
+---
 γ_v_o_grid = np.linspace(-0.5, 0.5, 101)
 ρ_v_grid = np.array([
     valuation_eigenvalue_for_exposure(g) for g in γ_v_o_grid
@@ -2480,7 +2497,6 @@ fig, ax = plt.subplots()
 ax.plot(γ_v_o_grid, ρ_v_grid, lw=2)
 ax.set_xlabel("valuation exposure $\\gamma_o^v$")
 ax.set_ylabel("principal eigenvalue $\\rho^v$")
-ax.set_title("Changing Valuation Functionals")
 plt.show()
 ```
 
@@ -2587,8 +2603,15 @@ def required_return_for_growth_exposure(γ_g_o, γ_g_f=0.0, δ=0.02):
 
     _, _, ρ, _ = solve_affine_eigenfunction(p)
     return -ρ + δ
+```
 
-
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Local and long-run pricing of persistent growth risk
+    name: fig-lrr-cashflow-frontier
+---
 γ_g_o_grid = np.linspace(-0.5, 0.5, 101)
 required_returns = np.array([
     required_return_for_growth_exposure(g) for g in γ_g_o_grid
@@ -2604,7 +2627,6 @@ ax.plot(γ_g_o_grid, local_line, "--", lw=2,
         label="local slope")
 ax.set_xlabel("cash-flow exposure $\\gamma_o^g$")
 ax.set_ylabel("rate of return")
-ax.set_title("Local and Long-Run Pricing of Persistent Growth Risk")
 ax.legend()
 plt.show()
 ```
@@ -2630,15 +2652,13 @@ print(f"formula                 = {long_run_price_o:.6f}")
 
 ## Assumptions behind the scenes
 
-The examples above make the eigenfunction calculation look mechanical.
+The examples above make the eigenfunction calculation look routine.
 
-For finite-state chains and the affine model, it really is mechanical;
-Perron-Frobenius theory and closed-form algebra handle every requirement.
+For finite-state chains and the affine model, it really is; Perron-Frobenius theory and closed-form algebra handle every requirement.
 
-But in a general state space, three things can go wrong, and each
-corresponds to one of the assumptions we have been carrying along.
+But in a general state space, three things can go wrong, each corresponding to one of the assumptions we have been carrying.
 
-This section walks through what they are and why they matter.
+The rest of this section examines each in turn.
 
 ### Issue 1: $\hat M$ might fail to be a martingale
 
@@ -2647,7 +2667,7 @@ from {eq}`eq:mhat`, but $\hat M$ is only a nonnegative
 local martingale, hence a supermartingale.
 
 A supermartingale is not enough to define a probability measure: we need
-$E\hat M_t = 1$, i.e. a genuine martingale, the content of Assumption 6.1
+$\mathbb{E}[\hat M_t] = 1$, i.e. a genuine martingale, the content of Assumption 6.1
 in {cite:t}`HansenScheinkman2009`.
 
 A standard way to verify this is a two-sided **Girsanov construction**:
@@ -2691,7 +2711,7 @@ $$
     =
     \int_0^\infty
     \exp(-\alpha t)\,
-    E\!\left[
+    \mathbb{E}\!\left[
         M_t\, \frac{V(X_t)}{V(x)}\, \psi(X_t)
         \,\Big|\, X_0=x
     \right] dt .
@@ -2758,7 +2778,7 @@ The main steps are:
    multiplicative functional $M$.
 
 2. Build the semigroup
-   $\mathbb M_t\psi(x)=E[M_t\psi(X_t)\mid X_0=x]$.
+   $\mathbb M_t\psi(x)=\mathbb{E}[M_t\psi(X_t)\mid X_0=x]$.
 
 3. When $M = VS$ is the product of a valuation functional and an SDF,
    impose the local pricing restriction that $VS$ is a martingale; for
@@ -2794,10 +2814,10 @@ Consider a two-state Markov chain with intensity matrix
 
 $$
 U =
-\begin{pmatrix}
+\begin{bmatrix}
     -\lambda & \lambda \\
     \mu & -\mu
-\end{pmatrix}.
+\end{bmatrix}.
 $$
 
 Let the multiplicative functional have decay rate $r_1>0$ in state 1, decay
@@ -2823,10 +2843,10 @@ Here is one solution:
 
 $$
 A =
-\begin{pmatrix}
+\begin{bmatrix}
     -\lambda-r_1 & \lambda \\
     \mu & -\mu
-\end{pmatrix}.
+\end{bmatrix}.
 $$
 
 *2.* The characteristic equation is
@@ -2960,11 +2980,11 @@ intensity matrix
 
 $$
 U =
-\begin{pmatrix}
+\begin{bmatrix}
     -0.40 &  0.30 &  0.10 \\
      0.20 & -0.50 &  0.30 \\
      0.10 &  0.20 & -0.30
-\end{pmatrix},
+\end{bmatrix},
 $$
 
 decay-rate vector $r = (0.06, 0.04, 0.01)$, and no jumps in the
@@ -3036,7 +3056,6 @@ fig, ax = plt.subplots()
 ax.semilogy(t_vals, errors, lw=2)
 ax.set_xlabel("$t$")
 ax.set_ylabel("error")
-ax.set_title(f"Convergence to the Principal Eigenfunction, gap = {gap:.4f}")
 plt.show()
 
 print(f"spectral gap = {gap:.6f}")
diff --git a/lectures/ls_learning.md b/lectures/ls_learning.md
index 1b9da990b..590172add 100644
--- a/lectures/ls_learning.md
+++ b/lectures/ls_learning.md
@@ -62,7 +62,7 @@ global almost-sure convergence also requires boundedness and
 domain-of-attraction conditions.
 
 The framework unifies and extends earlier work by {cite:t}`Bray1982` and
-{cite:t}`BraySavin1984` and connects naturally to the distinction between learning
+{cite:t}`BraySavin1986` and connects naturally to the distinction between learning
 *within* a rational expectations equilibrium (Bayesian updating inside a
 correctly specified model) and learning *about* one (adapting an OLS estimator
 whose data-generating process shifts with beliefs) discussed in
@@ -116,7 +116,7 @@ z_t = \begin{bmatrix}
     + \begin{bmatrix} V(\beta_t) \\ B(\beta_t) \end{bmatrix} u_t ,
 $$ (eq:actual_lom)
 
-where $u_t$ is i.i.d. white noise with covariance $\Sigma$.
+where $u_t$ is IID white noise with covariance $\Sigma$.
 
 The mapping $T$ is the key object, as it maps the *perceived* coefficient $\beta$
 to the coefficient that *actually* governs $z_{1t}$ in equilibrium.
@@ -174,7 +174,7 @@ conditions used below also hold, convergence is still governed by the same full
 ODE {eq}`eq:full_ode` and small ODE {eq}`eq:small_ode`.
 
 ```{note}
-As {cite:t}`BraySavin1984` and {cite:t}`BrayKreps1987` emphasize, the RLS algorithm
+As {cite:t}`BraySavin1986` and {cite:t}`BrayKreps1987` emphasize, the RLS algorithm
 cannot be derived from Bayes' rule applied to a correctly specified model, because
 during the learning transition the data-generating process is non-stationary ---
 beliefs shift the equilibrium, which shifts the data.
@@ -228,7 +228,7 @@ $$
 = \begin{bmatrix} R^{-1} M_{z_2}(\beta)\,[T(\beta) - \beta]^\top \\ M_{z_2}(\beta) - R \end{bmatrix} ,
 $$ (eq:full_ode)
 
-where $M_{z_2}(\beta) = E z_{2t}z_{2t}^\top$ is computed at the stationary distribution of $z_{2t}$ that prevails when agents believe the perceived law has constant parameter $\beta$.
+where $M_{z_2}(\beta) = \mathbb{E}[z_{2t}z_{2t}^\top]$ is computed at the stationary distribution of $z_{2t}$ that prevails when agents believe the perceived law has constant parameter $\beta$.
 
 The fixed point of {eq}`eq:full_ode` is $(\beta_f, R_f)$ with $R_f = M_{z_2}(\beta_f)$ — the same $\beta_f$ as the small ODE, paired with the second-moment matrix consistent with it.
 
@@ -295,7 +295,7 @@ estimate $R_t$.
 ```{prf:assumption} Boundedness along a subsequence
 :label: ass-ms-a6
 
-There exist a set $\Omega_0$ with $P(\Omega_0) = 1$, random variables $C_1(\omega)$ and $C_2(\omega)$, and a subsequence $\{t_k(\omega)\}$ such that
+There exist a set $\Omega_0$ with $\mathbb{P}(\Omega_0) = 1$, random variables $C_1(\omega)$ and $C_2(\omega)$, and a subsequence $\{t_k(\omega)\}$ such that
 
 $$
 |z_{2t_k}(\omega)| < C_1(\omega) \quad\text{and}\quad |R_{t_k}(\omega)| < C_2(\omega)
@@ -357,7 +357,7 @@ Let $\mathcal{M}$ be the Jacobian matrix of $T(\beta) - \beta$ evaluated at the
 REE $\beta_f$:
 
 $$
-\mathcal{M} = \frac{d\,\text{col}(T(\beta) - \beta)}{d\,\text{col}(\beta)^\top}\Bigg|_{\beta=\beta_f} ,
+\mathcal{M} = \frac{d\,\operatorname{col}(T(\beta) - \beta)}{d\,\operatorname{col}(\beta)^\top}\Bigg|_{\beta=\beta_f} ,
 $$ (eq:jacobian)
 
 and let $h(\beta, R)$ denote the Jacobian of the right-hand side of the full
@@ -392,10 +392,10 @@ Assume {prf:ref}`ass-ms-a1`, {prf:ref}`ass-ms-a2`,
 1. Let $\hat\beta \neq \beta_f$ and suppose $M_{z_2}(\hat\beta)$ is positive
    definite and $\hat\beta \in \mathrm{int}(D_2)$.
 
-   Then $P(\beta_t \to \hat\beta) = 0$.
+   Then $\mathbb{P}(\beta_t \to \hat\beta) = 0$.
 
 2. If $h(\beta_f, R_f)$ has at least one eigenvalue with strictly positive real
-   part, then $P(\beta_t \to \beta_f) = 0$.
+   part, then $\mathbb{P}(\beta_t \to \beta_f) = 0$.
 ```
 
 The first part says that recursive least squares cannot settle on a
@@ -489,7 +489,7 @@ is open and bounded with $D_1 \subset D_A$.
 Then for some subsequence $\{t_k(\omega)\}$,
 
 $$
-P(\beta_t \to \beta_f) + P\bigl(\beta_{t_k} \to (D_1 \setminus D_2)\bigr) = 1.
+\mathbb{P}(\beta_t \to \beta_f) + \mathbb{P}\bigl(\beta_{t_k} \to (D_1 \setminus D_2)\bigr) = 1.
 $$
 ```
 
@@ -583,8 +583,7 @@ matrix $\Gamma$ and with $V(\beta)=I$.
 
 Economically, this is the control case.
 
-Agents are simply estimating a stable data-generating process that is already
-there.
+Agents are estimating a stable, exogenous data-generating process.
 
 Their beliefs do not feed back into prices, quantities, or future data.
 
@@ -611,7 +610,7 @@ p_t = a + b \beta_t + \tilde{u}_t ,
 $$ (eq:bray_price)
 
 where $\beta_t$ is agents' OLS estimate of the price (their point forecast of
-$p_t$), and $\tilde{u}_t$ is i.i.d. noise with mean zero and variance
+$p_t$), and $\tilde{u}_t$ is IID noise with mean zero and variance
 $\sigma_u^2$.
 
 Here the forecast itself is a state variable for the economy.
@@ -906,7 +905,7 @@ print(f"Jacobian M = b - 1 = {b_unstable - 1:.2f}  (> 0: NOT E-stable)")
 
 ## Example 3: Bray-Savin supply-shifter model
 
-{cite:t}`BraySavin1984` studied a model where
+{cite:t}`BraySavin1986` studied a model where
 
 $$
 p_t = x_t^\top(m + a\beta_{t-1}) + \tilde{u}_t , \quad p_t^e = x_t^\top\beta_{t-1} ,
@@ -1056,12 +1055,12 @@ exogenous variable, not just learning an unconditional mean.
 Consider the present-value asset pricing model
 
 $$
-y_t = \lambda E_t y_{t+1} + x_t , \quad x_t = \rho x_{t-1} + \varepsilon_t ,
+y_t = \lambda \mathbb{E}_t y_{t+1} + x_t , \quad x_t = \rho x_{t-1} + \varepsilon_t ,
 $$ (eq:pv_model)
 
 where $|\lambda| < 1$, $|\rho| < 1$, and agents perceive $y_t = \beta_t x_{t-1}+ v_t$.
 
-Under the perceived law, $E_t y_{t+1} = \beta_t x_t$, so {eq}`eq:pv_model` becomes
+Under the perceived law, $\mathbb{E}_t y_{t+1} = \beta_t x_t$, so {eq}`eq:pv_model` becomes
 
 $$
 y_t = (\lambda \beta_t + 1) x_t = (\lambda \beta_t + 1)\rho x_{t-1} + (\lambda \beta_t + 1)\varepsilon_t.
@@ -1390,7 +1389,7 @@ The framework of {cite:t}`MarcetSargent1989jet` belongs to the program of learni
 *about* a rational expectations equilibrium, as distinct from learning *within*
 one --- a distinction emphasized by {cite:t}`BrayKreps1987`.
 
-*Learning *within* an REE* (the subject of the companion lecture
+Learning *within* an REE (the subject of the companion lecture
 [](rational_learning_re)) refers to Bayesian inference inside a correctly
 specified model.
 
@@ -1410,14 +1409,11 @@ As {cite:t}`MarcetSargent1989jet` put it,
 > because agents operate under the continually falsified assumption that the law of
 > motion is time invariant and known for sure.
 
-It is nonetheless a compelling learning rule.
+Yet it is a compelling rule.
 
-It is consistent and computationally tractable, and it converges to the REE
-despite the misspecification when E-stability is combined with the required
-boundedness and domain-of-attraction conditions.
+It converges to the REE despite the misspecification whenever E-stability combines with the required boundedness and domain-of-attraction conditions.
 
-It does not require the strong assumptions on agents' prior beliefs about the
-statistical structure of the economy that are needed for Bayesian learning.
+Unlike Bayesian learning, it does not require strong assumptions on agents' prior beliefs about the statistical structure of the economy.
 
 
 ## Summary
@@ -1507,7 +1503,7 @@ Convergence still occurs, but it takes longer.
 ```{exercise}
 :label: ls_ex2
 
-{prf:ref}`prop-ms-necessity` states that $P(\beta_t \to \hat\beta) = 0$ for any $\hat\beta \neq \beta_f$ in the interior.
+{prf:ref}`prop-ms-necessity` states that $\mathbb{P}(\beta_t \to \hat\beta) = 0$ for any $\hat\beta \neq \beta_f$ in the interior.
 
 1. Using the Bray model with $a=1$, $b=0.6$, simulate 100 paths of length $T = 600$ starting from $\beta_0 = 6$ (far from $\beta_f = 2.5$) and show that paths still converge to $\beta_f$.
 
diff --git a/lectures/rational_learning_re.md b/lectures/rational_learning_re.md
index ddcf5728a..21075efe9 100644
--- a/lectures/rational_learning_re.md
+++ b/lectures/rational_learning_re.md
@@ -42,7 +42,7 @@ But this raises a question: where does that knowledge come from?
 
 Their **rational learning** approach answers a narrower question.
 
-Agents are uncertain about structural parameters, but the state space is enlarged so that, for each possible parameter value, the associated equilibrium price and allocation maps are already part of the model.
+Agents are uncertain about structural parameters, and the state space is enlarged to include, for each candidate value, the associated equilibrium price and allocation maps.
 
 Agents then use Bayes' rule to update over those possibilities as prices and returns are observed.
 
@@ -50,7 +50,7 @@ This lecture develops that Bayesian formulation through an asset-market model.
 
 The aim is to see what rational learning can explain, and where its limits appear, before turning to the computational illustration.
 
-The discussion also connects to earlier work by {cite:t}`Bray1982`, {cite:t}`BraySavin1984`, and the rational expectations literature of {cite:t}`Radner1979`, {cite:t}`grossman1976`, and {cite:t}`Jordan1982`.
+The discussion also connects to earlier work by {cite:t}`Bray1982`, {cite:t}`BraySavin1986`, and the rational expectations literature of {cite:t}`Radner1979`, {cite:t}`grossman1976`, and {cite:t}`Jordan1982`.
 
 Let's start with the following imports
 
@@ -62,9 +62,7 @@ from scipy.optimize import brentq
 
 ## The economy
 
-Let's start with a simple asset-market model that captures the key features of rational learning.
-
-The example is an infinitely repeated version of the information model in {cite:t}`GrossmanStiglitz1980`.
+The model is a simple asset market: an infinitely repeated version of the information model of {cite:t}`GrossmanStiglitz1980`.
 
 
 ### Agents and assets
@@ -83,9 +81,9 @@ An informed signal $s_t$ satisfies
 $$
 r_t = s_t + \epsilon_t,
 \qquad
-s_t \sim \mathcal N(\mu_s, \tau^2),
+s_t \sim N(\mu_s, \tau^2),
 \qquad
-\epsilon_t \sim \mathcal N(0,\sigma^2),
+\epsilon_t \sim N(0,\sigma^2),
 $$
 
 where $\{s_t\}$ and $\{\epsilon_t\}$ are IID normal sequences and are mutually independent.
@@ -212,14 +210,14 @@ $$
 r_t - p
 =
 \frac{\sigma^2 X^I}{\theta} + \epsilon_t,
-\qquad \epsilon_t \sim \mathcal N(0,\sigma^2).
+\qquad \epsilon_t \sim N(0,\sigma^2).
 $$
 
 Because CARA preferences have no wealth effects, agent $U$'s problem reduces to
 
 $$
 \max_{x^U}\,
-E[u^U(x^U, r_t, p)],
+\mathbb{E}[u^U(x^U, r_t, p)],
 \qquad
 u^U(x^U, r_t, p)
 =
@@ -231,7 +229,7 @@ where the expectation integrates over $\theta^I \sim f_t^{(p, X^I)}$ and $\epsil
 Substituting the conditional excess payoff and using the normal moment-generating formula gives
 
 $$
-E[u^U]
+\mathbb{E}[u^U]
 =
 -\exp\!\left(\frac{(x^U)^2 \sigma^2}{2(\theta^U)^2}\right)
 \int_a^b
@@ -305,7 +303,7 @@ After trading, agent $U$ observes $(p_t, x^U_t, r_t)$.
 
 Market clearing gives $X^I_t = 2 - x^U_t$, and equation {eq}`eq:bk-signal-implied` assigns a candidate $s_t(\theta) = \sigma^2 X^I_t/\theta + p_t$ to each $\theta$.
 
-Since $s_t \sim \mathcal N(\mu_s, \tau^2)$ independently of $\epsilon_t \sim \mathcal N(0,\sigma^2)$, the conditional density of $s_t$ given $r_t$ is Gaussian:
+Since $s_t \sim N(\mu_s, \tau^2)$ independently of $\epsilon_t \sim N(0,\sigma^2)$, the conditional density of $s_t$ given $r_t$ is Gaussian:
 
 $$
 g(s\mid r)
@@ -581,13 +579,13 @@ The left panel shows $X^I_t$ approaching the full-information allocation as beli
 
 The right panel shows the price path, which fluctuates because $p_t$ inherits the variation in $s_t$.
 
-The simulation suggests three empirical facts about this equilibrium.
+The simulation illustrates three features of this equilibrium.
 
 The posterior density on $\theta^I$ concentrates around the true value, the posterior variance vanishes, and the equilibrium informed trade $X^I_t$ converges to its full-information benchmark.
 
-The next sections ask what general theorems guarantee these outcomes and which assumptions they rely on.
+The next sections identify the theorems behind these outcomes and the assumptions they require.
 
-The plan is to first state the two convergence theorems of {cite:t}`BrayKreps1987` for the abstract rational-learning model, then specialize to the two-agent example to identify the hypotheses that imply concentration on the true $\theta^I$, and finally explain when those hypotheses can fail.
+We first state the two convergence results of {cite:t}`BrayKreps1987` for the abstract model, then specialize to the two-agent example to pin down the hypotheses that drive concentration on the true $\theta^I$, and finally explain when those hypotheses can fail.
 
 ## Convergence of posterior assessments
 
@@ -622,13 +620,13 @@ The first result, due to {cite:t}`BrayKreps1987`, states that the conditional pr
 
 Fix an agent $n$ and an event $A \in \mathcal F$.
 
-The process $M_t = E^n[\mathbf 1_A \mid H_t^n(p)]$ is a $P^n$-bounded martingale with respect to $(H_t^n(p))_{t \ge 0}$, and
+The process $M_t = \mathbb{E}^n[\mathbf 1_A \mid H_t^n(p)]$ is a $P^n$-bounded martingale with respect to $(H_t^n(p))_{t \ge 0}$, and
 
 $$
 \lim_{t\to\infty}
-E^n[\mathbf 1_A \mid H_t^n(p)]
+\mathbb{E}^n[\mathbf 1_A \mid H_t^n(p)]
 =
-E^n[\mathbf 1_A \mid H_\infty^n(p)],
+\mathbb{E}^n[\mathbf 1_A \mid H_\infty^n(p)],
 \qquad P^n\text{-a.s.}
 $$
 ```
@@ -661,7 +659,7 @@ P_t^n: \Omega \to \mathcal P(\Theta),
 \omega \mapsto P_t^n(\omega),
 $$
 
-such that for each measurable $A \subseteq \Theta$, $\omega \mapsto P_t^n(\omega)(A)$ is a version of $E^n[\mathbf 1_{A \times \Phi^\infty} \mid H_t^n(p)](\omega)$, and $P_t^n(\omega) \in \mathcal P(\Theta)$ is a probability measure $P^n$-a.s.
+such that for each measurable $A \subseteq \Theta$, $\omega \mapsto P_t^n(\omega)(A)$ is a version of $\mathbb{E}^n[\mathbf 1_{A \times \Phi^\infty} \mid H_t^n(p)](\omega)$, and $P_t^n(\omega) \in \mathcal P(\Theta)$ is a probability measure $P^n$-a.s.
 
 The sharpened convergence result says these regular versions converge weakly almost surely.
 
@@ -754,7 +752,7 @@ Combining with market clearing and the price equation {eq}`eq:bk-price` gives $p
 
 Since the deviation $p_t - p_\infty(s_t; F_\infty, \theta^I, \theta^U) \to 0$ almost surely and $\{s_t\}$ is IID, the empirical distribution of observed prices has the same limit as the empirical distribution of the limiting price functional.
 
-The latter equals the distribution of $p_\infty(s; F_\infty, \theta^I, \theta^U)$ for $s \sim \mathcal N(\mu_s, \tau^2)$, and that limit is $H_\infty^U(p)$-measurable as a long-run frequency of an observable sequence.
+The latter equals the distribution of $p_\infty(s; F_\infty, \theta^I, \theta^U)$ for $s \sim N(\mu_s, \tau^2)$, and that limit is $H_\infty^U(p)$-measurable as a long-run frequency of an observable sequence.
 
 *Step 3: identification.*
 
@@ -824,9 +822,9 @@ Each agent entertains competing conditional likelihoods for other agents' inform
 
 An equilibrium can exist in which agents assign asymptotic probability one to an incorrect model that gives the observed events positive likelihood, while the true stable price relation receives zero posterior probability because that relation was absent from the prior model class.
 
-In strict rational learning the agent's prior is supported on Bayesian-consistent models in the expanded state space, and the truth is assumed to have positive prior support.
+In strict rational learning the prior already covers the truth: it is supported on Bayesian-consistent models in the expanded state space, and positive weight is placed on the true parameter.
 
-Bayes' rule can only reweight that initial model class: a pricing relation assigned zero prior probability remains impossible after any history.
+Bayes' rule can only reweight that initial model class, so a pricing relation with zero prior probability stays at zero no matter what data accumulate.
 
 ## Learning within versus learning about a rational expectations equilibrium
 
@@ -858,7 +856,7 @@ An adaptive learner who treats the price-state relation as something to be estim
 
 Bayesian rational learning can update among equilibrium maps already included in the agent's prior, but it does not explain how agents come to obtain those maps in the first place.
 
-The literature on learning *about* rational expectations equilibria, beginning with {cite:t}`Bray1982` and {cite:t}`BraySavin1984` and extended by {cite:t}`MarcetSargent1989jet`, takes the second side of the trade-off and replaces Bayes' rule with **ordinary least squares** or related recursive estimators.
+The literature on learning *about* rational expectations equilibria, beginning with {cite:t}`Bray1982` and {cite:t}`BraySavin1986` and extended by {cite:t}`MarcetSargent1989jet`, takes the second side of the trade-off and replaces Bayes' rule with **ordinary least squares** or related recursive estimators.
 
 The companion lecture {doc}`ls_learning` develops this least-squares-learning framework in self-referential models and traces the resulting dynamics through the associated ordinary differential equation.
 

From eea4f8554329a17eba48ca77fcf7cb0fc98ac7e0 Mon Sep 17 00:00:00 2001
From: thomassargent30 <ts43@nyu.edu>
Date: Fri, 5 Jun 2026 12:19:04 -0600
Subject: [PATCH 24/25] Tom's June 5 edits of long run risk model

---
 lectures/long_run_risk_operator.md | 32 ++++++++++++++++--------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/lectures/long_run_risk_operator.md b/lectures/long_run_risk_operator.md
index 79e43c1f2..079a5dbfd 100644
--- a/lectures/long_run_risk_operator.md
+++ b/lectures/long_run_risk_operator.md
@@ -28,8 +28,10 @@ kernelspec:
 
 ## Overview
 
-Standard short-horizon asset pricing tells us how investors are compensated
-for tiny, instantaneous exposures to shocks, the *short end* of the term
+Standard short-horizon asset pricing tells us how equilibrium prices compensate investors
+for tiny, instantaneous exposures to shocks.
+
+That is, it tells us about the *short end* of the term
 structure of risk prices.
 
 But many of the most interesting asset pricing questions (e.g., the equity
@@ -77,14 +79,14 @@ a positive matrix; in general state spaces it is a continuous-state counterpart.
 We will refer to {eq}`eq:hs-factorization` as the **multiplicative
 factorization** associated with $(\rho,\phi,\hat M)$.
 
-{cite:t}`AlvarezJermann2005` introduced a related permanent-transitory
-decomposition for stochastic discount factors. 
+{cite:t}`AlvarezJermann2005` applied a related permanent-transitory
+decomposition to stochastic discount factors. 
 
 The operator approach links
 that decomposition to an explicit eigenvalue problem.
 
 ```{seealso}
-This lecture is closely related to the advanced lecture
+This lecture is closely related to the lecture
 {doc}`advanced:additive_functionals`, which studies the same kind of
 permanent-transitory decomposition for additive and multiplicative
 functionals in a discrete-time linear-Gaussian setting.
@@ -116,7 +118,7 @@ The plan of this lecture is to:
 
 A recurring theme is that shocks to persistent state variables drive a wedge between local and long-run risk prices.
 
-That wedge is the mechanism by which long-run risk models like {cite:t}`Bansal_Yaron_2004`
+That wedge is the mechanism through which long-run risk models like {cite:t}`Bansal_Yaron_2004`
 generate large equity premia.
 
 We start with the following imports
@@ -138,7 +140,7 @@ history.
 We will work with a strong Markov process whose sample paths are càdlàg
 (defined below).
 
-For the explicit formulas later we will specialize to a semimartingale that 
+To arrive at the explicit formulas presented later, we will specialize to a semimartingale that
  decomposes into a continuous component $X^c$ and a pure-jump
 component $X^j$:
 
@@ -470,8 +472,7 @@ pick.
 
 ### Local pricing restriction
 
-Before tackling long horizons, it is worth knowing what valuation looks
-like at the *short* end.
+Before tackling long horizons, it is good to understand valuation at the *short* end.
 
 That is the standard instantaneous risk-return relation.
 
@@ -960,7 +961,7 @@ $$
 
 The verification establishes only  that $\hat M$ is a *local* martingale, but
 the definition above (and the change-of-measure interpretation of
-$\hat M$) both require it to be a martingale.
+$\hat M$) require it to be a martingale.
 
 We close this gap by adopting Assumption 6.1 of
 {cite:t}`HansenScheinkman2009`:
@@ -2242,10 +2243,11 @@ slightly different long-run risk price:
    - Set $M = GS$ and compute
    the principal eigenvalue $\rho$ of the cash-flow valuation semigroup.
 
-These two frontiers coincide in simple log-normal examples, but they can
-differ with stochastic volatility, nonlinear dynamics, or jump risk.
+In simple log-normal examples, these two frontiers coincide.
+
+But they can differ with stochastic volatility, nonlinear dynamics, or jump risk.
 
-We will work out both in the affine model below.
+We will work out both frontiers in the affine model below.
 
 ### Stochastic discount factor decomposition
 
@@ -2263,7 +2265,7 @@ $$
 $$
 
 This is the **permanent-transitory decomposition** of
-{cite:t}`AlvarezJermann2005`, now linked to a concrete eigenfunction
+{cite:t}`AlvarezJermann2005`, now linked to a concrete eigenfunction
 construction.
 
 The factor $\exp(\rho t)$ is the deterministic trend in the SDF and the
@@ -2688,7 +2690,7 @@ The affine example illustrates this concretely, since we *rejected* one
 of the two algebraically valid eigenfunctions because it implied an
 explosive twisted square-root process.
 
-### Issue 3: a principal eigenfunction might not exist at all
+### Issue 3: a principal eigenfunction might not exist
 
 In a general state space, even *existence* of a strictly positive
 eigenfunction is not automatic.

From e7736bbb2619d1547c77cd0b1cbc7897fcfdb673 Mon Sep 17 00:00:00 2001
From: HumphreyYang <humzyyang@gmail.com>
Date: Sat, 6 Jun 2026 20:23:50 +1000
Subject: [PATCH 25/25] update

---
 lectures/misspecified_recovery.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lectures/misspecified_recovery.md b/lectures/misspecified_recovery.md
index 71a073a65..7b360a5c1 100644
--- a/lectures/misspecified_recovery.md
+++ b/lectures/misspecified_recovery.md
@@ -20,7 +20,7 @@ kernelspec:
 </div>
 ```
 
-# Misspecified recovery
+# Misspecified Recovery
 
 ```{contents} Contents
 :depth: 2