From bc9ea85dfe0ea3d9f2deb5ce98646d6c16e71e28 Mon Sep 17 00:00:00 2001 From: Joseph <162703152+josephnef@users.noreply.github.com> Date: Wed, 24 Jun 2026 17:35:37 +0300 Subject: [PATCH] Thermal monitoring: surface chip thermal meter + health bucket Add a read-only thermal probe across the whole Jaguar family so an operator can attribute a TX drop to the chip heating up. The thermal meter (RF[A] reg 0x42, bits [15:10]) was already sampled inside the 8812A TX-power-tracking loop, but only there: gated to CHIP_8812, logged at debug, and never surfaced as health. This exposes it independently of the power-track correction. - RadioManagementModule::ReadThermalStatus() reads RF[A][0x42] standalone (no chip gate, no BB-swing write) and pairs it with the EFUSE baseline, returning a ThermalStatus{raw, baseline, delta, valid}. Works on 8812/8811/8814/8821 (path A is readable on all members). - ThermalBucket() classifies the delta into cool/warm/hot/critical. The meter has no calibrated degC transfer function on the AU family (it is an RF/PA-bias tracking index), so the health signal is deliberately bucketed rather than presented as a fake temperature. - RtlJaguarDevice exposes GetThermalStatus() (synchronous, owning-thread safe) plus an opt-in background poller (start_thermal_poller / get_thermal_snapshot) mirroring the queue-depth poller, with the same shared-handle concurrency caveat. - Demos honour DEVOURER_THERMAL_POLL_MS / DEVOURER_THERMAL_WARN_DELTA and emit lines; the TX demo reads inline on its own thread (no USB contention), the RX demo uses the background poller. A one-shot warn fires when delta crosses the threshold and re-arms on cooldown. - tests/thermal_hwcheck.sh: per-chip hardware smoke test. Jaguar-1 has no automatic hard thermal TX shutdown, so a rising delta is an early warning that the PA is heating and TX power is being backed off, not a shutdown-reason readout. 
Verified on hardware (8812/8814/8821): meter reads live, delta tracks PA heat from a cold start through to a warm/hot steady state, and the bucket + warn move correctly with physical state. Build green, ctest green. Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 16 +++++++ demo/main.cpp | 40 ++++++++++++++++ src/RadioManagementModule.cpp | 19 ++++++++ src/RadioManagementModule.h | 36 ++++++++++++++ src/RtlJaguarDevice.cpp | 73 ++++++++++++++++++++++++++++ src/RtlJaguarDevice.h | 27 +++++++++++ tests/thermal_hwcheck.sh | 89 +++++++++++++++++++++++++++++++++++ txdemo/main.cpp | 40 ++++++++++++++++ 8 files changed, 340 insertions(+) create mode 100755 tests/thermal_hwcheck.sh diff --git a/CLAUDE.md b/CLAUDE.md index d0d64b2..4acfad8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -114,6 +114,22 @@ Both `WiFiDriverDemo` and `WiFiDriverTxDemo` honour: DEBUG (produces ~7 MB per 15 s — has filled `/tmp` mid-capture and adds 0.5-0.8 s to init even with stderr discarded). `DEVOURER_USB_QUIET` is accepted as a no-op for backwards compatibility. +- `DEVOURER_THERMAL_POLL_MS=N` — emit periodic `raw=… baseline=… delta=… status=…` lines from + the chip thermal meter (RF[A][0x42][15:10]) paired with the EFUSE baseline: + `raw` (0..63 thermal units, ~1.5-2 °C each, not absolute °C), `baseline`, + `delta = raw − baseline`, and a coarse `status` bucket (cool/warm/hot/critical, + keyed off delta — the meter has no calibrated °C, so this is deliberately + bucketed rather than a fake temperature). Works on every Jaguar chip; read-only (does not + alter TX-power tracking). 0/unset = disabled. In `WiFiDriverDemo` (RX) this + spawns a background poller at the given cadence; in `WiFiDriverTxDemo` it is + read inline on the TX thread (no extra USB contention) every `N/2` frames. + Jaguar-1 has no hard thermal TX shutdown — a rising `delta` is the early + warning that the PA is heating and TX power is being backed off. 
NB: on the + 8814 the EFUSE baseline is read at the 8812 offset, so the absolute `delta` + may be off there; the raw trend is still valid. +- `DEVOURER_THERMAL_WARN_DELTA=N` — thermal-units-above-baseline threshold at + which a one-shot `warn` fires (default `15`); re-arms once the chip cools + back below it. `WiFiDriverTxDemo` additionally honours radiotap-encoding knobs that patch the beacon's MCS info field (or, with `_VHT=1`, replace it with a diff --git a/demo/main.cpp b/demo/main.cpp index cd5116e..914477d 100644 --- a/demo/main.cpp +++ b/demo/main.cpp @@ -59,6 +59,19 @@ static const uint32_t g_qd_poll_ms = []() -> uint32_t { return e ? static_cast<uint32_t>(std::strtoul(e, nullptr, 0)) : 0u; }(); +/* DEVOURER_THERMAL_POLL_MS=N: periodic snapshot of the chip thermal meter + * (RF[A][0x42][15:10]), one `raw=… baseline=… delta=… status=…` line per interval. Works on + * every Jaguar member. 0 = disabled. DEVOURER_THERMAL_WARN_DELTA overrides the + * warn threshold (thermal units above the EFUSE baseline; default 15). */ +static const uint32_t g_thermal_poll_ms = []() -> uint32_t { + const char *e = std::getenv("DEVOURER_THERMAL_POLL_MS"); + return e ? static_cast<uint32_t>(std::strtoul(e, nullptr, 0)) : 0u; +}(); +static const int g_thermal_warn_delta = []() -> int { + const char *e = std::getenv("DEVOURER_THERMAL_WARN_DELTA"); + return e ? std::atoi(e) : 15; +}(); + /* DEVOURER_RX_DUMP_CSI=hex,hex,... (or "0x1a,0x20,0x40"): F2 research * spike. 
On each canonical-SA RX frame (first N frames), read BB * dbgport 0x8FC at each selector and emit @@ -386,6 +399,33 @@ int main() { } }); } + std::atomic<bool> therm_emitter_stop{false}; + std::thread therm_emitter; /* NOTE(review): no stop/join for this thread is visible in this hunk — confirm main() sets therm_emitter_stop and joins it before returning */ + if (g_thermal_poll_ms > 0) { + logger->info("DEVOURER_THERMAL_POLL_MS={} warn_delta={} — starting thermal " + "poller", g_thermal_poll_ms, g_thermal_warn_delta); + rtlDevice->start_thermal_poller(g_thermal_poll_ms, g_thermal_warn_delta); + therm_emitter = std::thread([&therm_emitter_stop]() { + while (!therm_emitter_stop.load()) { + if (g_rtl_device != nullptr) { + auto t = g_rtl_device->get_thermal_snapshot(); + if (t.valid) { + printf("raw=%u baseline=%u delta=%+d status=%s\n", + t.raw, t.baseline, t.delta, ThermalBucket(t)); + } else if (t.baseline == 0xFF) { /* genuine "no EFUSE baseline" sample; a snapshot the poller has not written yet unpacks to baseline 0 and is skipped instead of being printed as a reading */ + printf("raw=%u baseline=none status=%s\n", + t.raw, ThermalBucket(t)); + } + fflush(stdout); + } + for (uint32_t slept = 0; + slept < g_thermal_poll_ms && !therm_emitter_stop.load(); + slept += 50) { + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + } + }); + } /* Default channel 36 (5 GHz) for the 8812 reference. Override with * DEVOURER_CHANNEL=N env var (e.g. DEVOURER_CHANNEL=6 for busy 2.4 GHz). */ int channel = 36; diff --git a/src/RadioManagementModule.cpp b/src/RadioManagementModule.cpp index 9531fb8..9e55ff8 100644 --- a/src/RadioManagementModule.cpp +++ b/src/RadioManagementModule.cpp @@ -70,6 +70,25 @@ void RadioManagementModule::TickPwrTrack() { _pwrTrk.TickThermalMeter(current_band_type, _currentChannel); } +ThermalStatus RadioManagementModule::ReadThermalStatus() { + /* Live thermal meter: RF path A, register 0x42, bits [15:10]. phy_query_rf_reg + * already masks + shifts the bits down, so the result is the raw 6-bit reading. + * Same register the 8812A power-track loop samples; here we read it standalone + * (no chip-type gate, no BB-swing write) so the probe works on every Jaguar. 
*/ + ThermalStatus s; + uint32_t rf = phy_query_rf_reg(RfPath::RF_PATH_A, 0x42, 0xfc00u); + s.raw = static_cast<uint8_t>(rf & 0x3F); + s.baseline = _eepromManager->GetEepromThermalMeter(); + if (s.baseline == 0xFF) { + s.valid = false; + s.delta = 0; + } else { + s.valid = true; + s.delta = static_cast<int>(s.raw) - static_cast<int>(s.baseline); + } + return s; +} + void RadioManagementModule::hw_var_rcr_config(uint32_t rcr) { _device.rtw_write32(REG_RCR, rcr); } diff --git a/src/RadioManagementModule.h b/src/RadioManagementModule.h index b61b0c1..bce3529 100644 --- a/src/RadioManagementModule.h +++ b/src/RadioManagementModule.h @@ -137,6 +137,36 @@ enum MGN_RATE { MGN_UNKNOWN }; +/* Read-only snapshot of the chip's thermal meter. `raw` is the live + * RF[A][0x42][15:10] reading (0..63, Realtek "thermal units" — roughly + * 1.5-2 C each, NOT absolute degrees). `baseline` is the EFUSE + * factory-calibrated reading (0xFF = autoload failed / no baseline). + * `delta = raw - baseline` (signed) is the heat signal — positive means + * the chip is running hotter than calibration. `valid` is false when no + * EFUSE baseline is available, in which case only `raw` is meaningful. */ +struct ThermalStatus { + uint8_t raw = 0; + uint8_t baseline = 0xFF; + int delta = 0; + bool valid = false; +}; + +/* Coarse, honest health label for a thermal reading. The meter is NOT a + * calibrated °C sensor (Realtek publishes no °C transfer function for the AU + * family; the value is an RF/PA-bias tracking index), so we deliberately bucket + * the delta-from-baseline rather than fake a precise temperature — the same + * stance the rtl88x2eu driver takes (cool/warm/hot/...). Thresholds are in + * thermal units above the EFUSE baseline; "hot" aligns with the default + * DEVOURER_THERMAL_WARN_DELTA of 15. Returns "unknown" when no EFUSE baseline + * is available (delta is meaningless without it). 
*/ +inline const char *ThermalBucket(const ThermalStatus &s) { + if (!s.valid) return "unknown"; + if (s.delta < 8) return "cool"; + if (s.delta < 15) return "warm"; + if (s.delta < 25) return "hot"; + return "critical"; +} + class RadioManagementModule { RtlUsbAdapter _device; std::shared_ptr _eepromManager; @@ -170,6 +200,12 @@ class RadioManagementModule { * callback `odm_txpowertracking_callback_thermal_meter` and writes * the resulting BB-swing index to 0xc1c[31:21] / 0xe1c[31:21]. */ void TickPwrTrack(); + /* Read the chip thermal meter (RF[A][0x42][15:10]) and pair it with the + * EFUSE baseline. Read-only — does NOT touch the TX-power-tracking + * BB-swing registers (that correction lives in TickPwrTrack). Works on + * every Jaguar member: path-A RF reads succeed on 8812/8811/8814/8821 + * (on the 8814 only paths C/D are write-only). */ + ThermalStatus ReadThermalStatus(); /* Run a full I/Q calibration. Mirrors upstream * `phy_iq_calibrate_8812a` triggered from the channel-set callback * when `_needIQK` is asserted. Takes ~50-100 ms per invocation. */ diff --git a/src/RtlJaguarDevice.cpp b/src/RtlJaguarDevice.cpp index 2abdc64..9f729d8 100644 --- a/src/RtlJaguarDevice.cpp +++ b/src/RtlJaguarDevice.cpp @@ -365,6 +365,10 @@ RtlJaguarDevice::~RtlJaguarDevice() { if (_qd_thread.joinable()) { _qd_thread.join(); } + _therm_stop.store(true); + if (_therm_thread.joinable()) { + _therm_thread.join(); + } } void RtlJaguarDevice::start_queue_depth_poller(uint32_t interval_ms) { @@ -409,6 +413,75 @@ std::array RtlJaguarDevice::get_queue_depth() const { return out; } +static uint32_t pack_thermal(const ThermalStatus &s) { + int8_t d = static_cast<int8_t>( + s.delta > 127 ? 127 : (s.delta < -128 ? -128 : s.delta)); + return (s.valid ? 
1u : 0u) | (uint32_t(s.raw) << 8) | + (uint32_t(s.baseline) << 16) | + (uint32_t(static_cast<uint8_t>(d)) << 24); +} + +static ThermalStatus unpack_thermal(uint32_t v) { + ThermalStatus s; + s.valid = (v & 1u) != 0; + s.raw = static_cast<uint8_t>((v >> 8) & 0xFF); + s.baseline = static_cast<uint8_t>((v >> 16) & 0xFF); + s.delta = static_cast<int8_t>((v >> 24) & 0xFF); /* via int8_t so a negative (clamped) delta sign-extends back to int */ + return s; +} + +ThermalStatus RtlJaguarDevice::GetThermalStatus() { + return _radioManagement->ReadThermalStatus(); +} + +ThermalStatus RtlJaguarDevice::get_thermal_snapshot() const { + return unpack_thermal(_therm_snap.load(std::memory_order_relaxed)); +} + +void RtlJaguarDevice::start_thermal_poller(uint32_t interval_ms, + int warn_delta) { + if (interval_ms == 0) return; + if (_therm_thread.joinable()) { + _logger->warn("thermal poller already running"); + return; + } + _therm_thread = std::thread([this, interval_ms, warn_delta]() { + bool warned = false; + bool baseline_note = false; + while (!_therm_stop.load()) { + ThermalStatus s = _radioManagement->ReadThermalStatus(); + _therm_snap.store(pack_thermal(s), std::memory_order_relaxed); + if (!s.valid) { + if (!baseline_note) { + _logger->info( + "thermal: no EFUSE baseline (0xFF) — reporting raw only " + "(raw={})", + unsigned(s.raw)); + baseline_note = true; + } + } else if (s.delta >= warn_delta) { + if (!warned) { + _logger->warn( + "thermal: chip running hot ({}) — raw={} baseline={} delta=+{} " + "(>= {}); TX power tracking backing off, sustained TX may " + "degrade the PA", + ThermalBucket(s), unsigned(s.raw), unsigned(s.baseline), s.delta, + warn_delta); + warned = true; + } + } else { + warned = false; /* re-arm once it cools back under the threshold */ + } + /* Sleep in short slices so destruction doesn't block for a full + * interval after _therm_stop is set. 
*/ + for (uint32_t slept = 0; slept < interval_ms && !_therm_stop.load(); + slept += 50) { + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + } + }); +} + uint32_t RtlJaguarDevice::read_bb_dbgport(uint32_t selector) { if (!_bb_dbgport) { _bb_dbgport = std::make_unique(_device, _logger); diff --git a/src/RtlJaguarDevice.h b/src/RtlJaguarDevice.h index a9049c0..0295f84 100644 --- a/src/RtlJaguarDevice.h +++ b/src/RtlJaguarDevice.h @@ -67,6 +67,26 @@ class RtlJaguarDevice { void start_queue_depth_poller(uint32_t interval_ms); std::array get_queue_depth() const; + /* Read the chip thermal meter (RF[A][0x42][15:10]) paired with the EFUSE + * baseline. Read-only — leaves the TX-power-tracking BB-swing registers + * untouched. Works on every Jaguar member. Safe to call from the thread + * that owns the device (e.g. inline in a TX loop) — no USB contention. + * See ThermalStatus in RadioManagementModule.h for field semantics. */ + ThermalStatus GetThermalStatus(); + + /* Spawn a background thread that samples the thermal meter every + * interval_ms and stores a snapshot (queryable via get_thermal_snapshot). + * Emits a logger->warn when delta >= warn_delta. 0 interval = disabled. + * Intended for the RX demo, whose Init() blocks the main thread. + * + * CONCURRENCY: an RF read is a multi-step BB register sequence over the + * shared libusb handle. Background phydm-style polling has wedged the chip + * before (ch100 second-channel-set), so this poller is opt-in and should + * use a conservative cadence (>= 1 s). A TX loop on the owning thread + * should prefer the synchronous GetThermalStatus() instead. */ + void start_thermal_poller(uint32_t interval_ms, int warn_delta); + ThermalStatus get_thermal_snapshot() const; + /* F2 research helper: read a u32 from the BB debug port at `selector`, * with save/restore around register 0x8FC. Lazy-constructs the reader * on first call. Returns 0 if the chip wedged on a prior call. 
See @@ -82,6 +102,13 @@ class RtlJaguarDevice { std::thread _qd_thread; std::atomic _qd_stop{false}; + std::thread _therm_thread; + std::atomic _therm_stop{false}; + /* Packed last thermal snapshot: bit0 = valid, [8:15] = raw, + * [16:23] = baseline, [24:31] = signed delta (clamped to int8). Stored as + * one atomic so a reader sees a consistent tuple without a mutex. */ + std::atomic _therm_snap{0}; + std::unique_ptr _bb_dbgport; }; diff --git a/tests/thermal_hwcheck.sh b/tests/thermal_hwcheck.sh new file mode 100755 index 0000000..00697ba --- /dev/null +++ b/tests/thermal_hwcheck.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# Hardware smoke-test for the thermal monitor probe. +# +# Runs WiFiDriverTxDemo against each plugged Jaguar adapter for a few seconds +# with DEVOURER_THERMAL_POLL_MS enabled, and prints the +# lines it emits. Read-only w.r.t. the probe — this just confirms the thermal +# meter reads back a live, plausible value per chip. +# +# Usage: sudo tests/thermal_hwcheck.sh +set -u + +BUILD_DIR="$(cd "$(dirname "$0")/.." && pwd)/build" +TXDEMO="$BUILD_DIR/WiFiDriverTxDemo" +RUN_SECS=6 +POLL_MS=500 # ~ every 250 TX frames inline +WARN_DELTA=15 + +CHILD_PID="" +cleanup() { + if [[ -n "$CHILD_PID" ]] && kill -0 "$CHILD_PID" 2>/dev/null; then + kill -INT "$CHILD_PID" 2>/dev/null + sleep 0.3 + kill -KILL "$CHILD_PID" 2>/dev/null + fi + # Backstop: reap any stray demo by exact comm name. + pkill -KILL -x WiFiDriverTxDemo 2>/dev/null +} +trap cleanup EXIT INT TERM + +if [[ ! -x "$TXDEMO" ]]; then + echo "ERROR: $TXDEMO not built — run: cmake --build build -j" >&2 + exit 1 +fi + +# pid -> human label +declare -A CHIPS=( + [0x8812]="RTL8812AU" + [0x8813]="RTL8814AU" +) +# 8821AU is OEM-rebadged on the T2U Plus (2357:0120) — needs VID override. 
+declare -A VID_OVERRIDE=( + [0x0120]="0x2357" +) +declare -A CHIPS_OEM=( + [0x0120]="RTL8821AU (T2U Plus)" +) + +run_one() { + local pid="$1" label="$2" vid="${3:-0x0bda}" + echo + echo "===================================================================" + echo " $label (VID=$vid PID=$pid) — ${RUN_SECS}s" + echo "===================================================================" + local log + log="$(mktemp)" + DEVOURER_VID="$vid" DEVOURER_PID="$pid" \ + DEVOURER_THERMAL_POLL_MS="$POLL_MS" \ + DEVOURER_THERMAL_WARN_DELTA="$WARN_DELTA" \ + "$TXDEMO" >"$log" 2>&1 & + CHILD_PID=$! + sleep "$RUN_SECS" + if kill -0 "$CHILD_PID" 2>/dev/null; then + kill -INT "$CHILD_PID" 2>/dev/null; sleep 0.3 + kill -KILL "$CHILD_PID" 2>/dev/null + fi + wait "$CHILD_PID" 2>/dev/null + CHILD_PID="" + + echo "--- thermal monitor lines ---" + grep -E "raw=[0-9]+ baseline=|thermal:|ThermalMeter|thermal monitor on" "$log" | head -20 + local n # counts sample lines only: they are the ones carrying both raw= and status= + n="$(grep -cE "raw=[0-9]+ baseline=.* status=" "$log")" + echo "--- ($n lines total) ---" + if [[ "$n" -eq 0 ]]; then + echo " no thermal lines — tail of log for context:" + tail -15 "$log" | sed 's/^/ /' + fi + rm -f "$log" +} + +for pid in "${!CHIPS[@]}"; do + run_one "$pid" "${CHIPS[$pid]}" +done +for pid in "${!CHIPS_OEM[@]}"; do + run_one "$pid" "${CHIPS_OEM[$pid]}" "${VID_OVERRIDE[$pid]}" +done + +echo +echo "done." diff --git a/txdemo/main.cpp b/txdemo/main.cpp index d9fe750..53d152f 100644 --- a/txdemo/main.cpp +++ b/txdemo/main.cpp @@ -430,6 +430,26 @@ int main(int argc, char **argv) { tx_buf.assign(beacon_frame, beacon_frame + sizeof(beacon_frame)); } + /* Thermal monitoring — read inline on the TX (owning) thread, so no + * background thread shares the libusb handle (no USB contention). Cadence is + * derived from DEVOURER_THERMAL_POLL_MS over the ~2 ms/packet loop; 0 = + * disabled. DEVOURER_THERMAL_WARN_DELTA overrides the warn threshold (thermal + * units above the EFUSE baseline; default 15). 
*/ + long thermal_every = 0; + if (const char *e = std::getenv("DEVOURER_THERMAL_POLL_MS")) { + long ms = std::strtol(e, nullptr, 0); + if (ms > 0) thermal_every = ms / 2 < 1 ? 1 : ms / 2; + } + int thermal_warn_delta = 15; + if (const char *e = std::getenv("DEVOURER_THERMAL_WARN_DELTA")) { + thermal_warn_delta = std::atoi(e); + } + if (thermal_every > 0) { + logger->info("DEVOURER_THERMAL_POLL_MS — thermal monitor on, every {} TX " + "frames, warn_delta={}", thermal_every, thermal_warn_delta); + } + bool thermal_warned = false; + long tx_count = 0; while (true) { if (tx_count == 0) { @@ -442,6 +462,26 @@ int main(int argc, char **argv) { printf("TX #%ld rc=%d\n", tx_count, rc); fflush(stdout); } + if (thermal_every > 0 && tx_count % thermal_every == 0) { + auto t = rtlDevice->GetThermalStatus(); + if (t.valid) { + printf("raw=%u baseline=%u delta=%+d status=%s\n", + t.raw, t.baseline, t.delta, ThermalBucket(t)); + if (t.delta >= thermal_warn_delta && !thermal_warned) { + logger->warn("thermal: chip running hot — raw={} baseline={} " + "delta=+{} (>= {}); TX power tracking backing off, " + "sustained TX may degrade the PA", + t.raw, t.baseline, t.delta, thermal_warn_delta); + thermal_warned = true; + } else if (t.delta < thermal_warn_delta) { + thermal_warned = false; + } + } else { + printf("raw=%u baseline=none status=%s\n", + t.raw, ThermalBucket(t)); + } + fflush(stdout); + } std::this_thread::sleep_for(std::chrono::milliseconds(2)); /* ~500 fps, gentle on USB bulk EP */ } rc = libusb_release_interface(handle, 0);