diff --git a/.github/workflows/scan.yaml b/.github/workflows/scan.yaml
index c5ea763..f40714c 100644
--- a/.github/workflows/scan.yaml
+++ b/.github/workflows/scan.yaml
@@ -88,22 +88,43 @@ jobs:
             echo "drift=true" >> "$GITHUB_OUTPUT"
             echo "Skill path source/${{ matrix.skill_path }} not present upstream; will report catalogue drift." >&2
           fi
+      - name: Determine LLM mode
+        id: llm  # the scan steps below read steps.llm.outputs.extra_flags
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: |
+          set -euo pipefail
+          if [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then  # secret is configured and non-empty
+            echo "extra_flags=" >> "$GITHUB_OUTPUT"  # empty output: no flag is added to the scan command
+            echo "SkillSpector LLM mode: enabled (anthropic provider, api.anthropic.com)." >&2
+          else  # secret missing or empty: fall back to the static-only scan
+            echo "extra_flags=--no-llm" >> "$GITHUB_OUTPUT"
+            echo "::warning::ANTHROPIC_API_KEY secret not set; SkillSpector will run with --no-llm. Set the secret on this repo to enable the LLM semantic pass."
+          fi
       - name: SkillSpector (JSON)
         if: steps.path_check.outputs.drift == 'false'
         continue-on-error: true
+        env:  # provider/model repeat scanners.skillspector.llm in config.yaml; keep both in sync
+          SKILLSPECTOR_PROVIDER: anthropic
+          SKILLSPECTOR_MODEL: claude-sonnet-4-6
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # empty when the secret is unset
         run: |
           mkdir -p out
           skillspector scan "source/${{ matrix.skill_path }}" \
-            --no-llm \
+            ${{ steps.llm.outputs.extra_flags }} \
             --format json \
             --output "out/skillspector.json" || true
       - name: SkillSpector (SARIF)
         if: steps.path_check.outputs.drift == 'false'
         continue-on-error: true
+        env:  # must stay identical to the env of the SkillSpector (JSON) step
+          SKILLSPECTOR_PROVIDER: anthropic
+          SKILLSPECTOR_MODEL: claude-sonnet-4-6
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # empty when the secret is unset
         run: |
           mkdir -p out
           skillspector scan "source/${{ matrix.skill_path }}" \
-            --no-llm \
+            ${{ steps.llm.outputs.extra_flags }} \
             --format sarif \
             --output "out/skillspector.sarif" || true
       - name: Combine
diff --git a/README.md b/README.md
index f662a82..99d198e 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,10 @@ Every 6 hours, the scheduled workflow in this repo:
 1. Enumerates every skill in `coder/registry` (both the in-tree
    `.agents/skills/` format and the future external-sources format).
 2. Shallow-clones each source repo.
-3. Runs [NVIDIA SkillSpector](https://github.com/NVIDIA/SkillSpector) in
-   `--no-llm` static mode over the upstream content.
+3. Runs [NVIDIA SkillSpector](https://github.com/NVIDIA/SkillSpector) over
+   the upstream content. The scheduled scan runs SkillSpector's LLM
+   semantic pass when the workflow's LLM credential secret is
+   configured, and falls back to `--no-llm` static-only mode otherwise.
 4. Builds a per-skill verdict (`clean`, `suspicious`, `malicious`,
    `unknown`) from `risk_score` plus the thresholds in `config.yaml`.
 5. Builds the React SPA in `site/` and ships it together with
@@ -97,7 +99,13 @@ This scanner is data-driven. To run it against a different registry:
    "GitHub Actions").
 4. Set Actions workflow permissions to "Read and write" so the
    publish-release job can create releases.
-5. Enable Actions.
+5. To enable the LLM semantic pass, add the credential secret for the
+   provider set in `config.yaml` (`scanners.skillspector.llm.provider`;
+   `ANTHROPIC_API_KEY` for the default `anthropic`) to your fork, and
+   confirm `.github/workflows/scan.yaml` passes that secret and the same
+   provider to the SkillSpector steps. Without the secret the scan runs
+   in static-only `--no-llm` mode, which works out of the box.
+6. Enable Actions.
 
 No source changes required for catalogue changes.
 
@@ -112,10 +120,7 @@ verdict:
 ```
 
 SkillSpector's `risk_score` (0-100) is the only input. The thresholds
-are aligned to SkillSpector's own `HIGH` and `CRITICAL` bands;
-[`docs/CALIBRATION.md`](./docs/CALIBRATION.md) walks through the
-evidence (SkillSpector source, the ClawHub paper, our in-tree
-catalogue) behind the chosen numbers.
+are aligned to SkillSpector's own `HIGH` and `CRITICAL` bands.
 
 The architecture keeps room for additional scanners (gitleaks, Semgrep,
 VirusTotal Premium, etc.); adding one is a new module under `scanner/`,
diff --git a/config.yaml b/config.yaml
index b20462a..18a9019 100644
--- a/config.yaml
+++ b/config.yaml
@@ -6,11 +6,13 @@
 config_version: 1
 
 catalogue:
-  # Where to enumerate skills from. Both the current production format
-  # (in-tree under .agents/skills/) and the future external-sources
-  # format (registry/<ns>/skills/README.md with sources[].repo) are
-  # supported. When both name the same slug, the external-sources entry
-  # wins.
+  # Skills are declared by per-namespace README.md files under
+  # registry/<ns>/skills/ in the catalogue repo. Each README's
+  # frontmatter lists sources[].repo plus per-skill overrides. This is
+  # the canonical declaration; the in-tree .agents/skills/ format is
+  # supported in scanner/enumerate.py for forks that need it but is
+  # not enabled here because coder/registry duplicates the same
+  # upstream skills across both layouts under different slugs.
   registry_repo:
     owner: coder
     repo: registry
@@ -22,13 +24,6 @@ catalogue:
       # has its frontmatter parsed for sources[].repo plus per-skill
       # overrides keyed by slug.
       readme_glob: registry/*/skills/README.md
-    in_tree:
-      enabled: true
-      # The namespace is fixed for in-tree skills today (coder).
-      namespace: coder
-      # Path glob inside the catalogue repo. Each <slug>/SKILL.md is one
-      # skill row in the matrix.
-      base_path: .agents/skills
 
 scanners:
   skillspector:
@@ -39,8 +34,12 @@ scanners:
     # so a bumper bot lives outside the loop until the upstream
     # publishes to PyPI and the pin can move into pyproject.toml.
     pin: "skillspector @ git+https://github.com/NVIDIA/SkillSpector.git@2eb844780ab163f01468ecf142c40a2ec0fcaec0"
-    flags:
-      - "--no-llm"
+    # Empty so .github/workflows/scan.yaml can append --no-llm
+    # dynamically based on whether the LLM credential secret is set.
+    flags: []
+    llm:
+      provider: anthropic
+      model: "claude-sonnet-4-6"
 
 # Per-skill verdict policy. v1 has one input (SkillSpector risk_score).
 # When more scanners join the pipeline we add new threshold fields here
@@ -54,13 +53,12 @@ scanners:
 #   51-80  HIGH      DO_NOT_INSTALL   -> verdict: suspicious
 #   81-100 CRITICAL  DO_NOT_INSTALL   -> verdict: malicious
 #
-# Rationale and source links live in docs/CALIBRATION.md. Short version:
-# SkillSpector's static-analysis layer is loud on real catalogues (the
-# ClawHub paper measured a ~49% positive rate on 67k skills) and is
-# advisory rather than authoritative, so we only escalate above its
-# HIGH cutoff. CAUTION-band findings still appear in the per-skill page
-# so reviewers can see them; we just do not flag the skill as suspicious
-# at the catalogue level.
+# Rationale: SkillSpector's static-analysis layer is loud on real
+# catalogues (the ClawHub paper measured a ~49% positive rate on 67k
+# skills) and is advisory rather than authoritative, so we only
+# escalate above its HIGH cutoff. CAUTION-band findings still appear
+# on the per-skill page so reviewers can see them; we just do not
+# flag the skill as suspicious at the catalogue level.
 verdict:
   malicious_risk_score: 81
   suspicious_risk_score: 51
diff --git a/docs/CALIBRATION.md b/docs/CALIBRATION.md
deleted file mode 100644
index e4cf5a0..0000000
--- a/docs/CALIBRATION.md
+++ /dev/null
@@ -1,132 +0,0 @@
-# Verdict threshold calibration
-
-This document records how the verdict thresholds in `config.yaml` were
-chosen. The thresholds are not arbitrary: they are aligned to
-SkillSpector's own internal severity bands and informed by the published
-evaluation of SkillSpector against a large real-world skill catalogue.
-
-If you bump the thresholds, update this doc in the same PR. Numbers that
-nobody can defend later are how scanners drift into either uselessness
-or boy-who-cried-wolf territory.
-
-## Inputs we are calibrating against
-
-### 1. SkillSpector's published severity bands
-
-NVIDIA's SkillSpector computes `risk_assessment.score` on a 0-100 scale
-from rule hits weighted by severity, plus a 1.3x multiplier when the
-skill carries executable scripts. The score is then bucketed into a
-named severity and a `recommendation` field:
-
-| Score range | `severity` | `recommendation`    |
-|-------------|------------|---------------------|
-| 0-20        | `LOW`      | `SAFE`              |
-| 21-50       | `MEDIUM`   | `CAUTION`           |
-| 51-80       | `HIGH`     | `DO_NOT_INSTALL`    |
-| 81-100      | `CRITICAL` | `DO_NOT_INSTALL`    |
-
-Source: [`skillspector/nodes/report.py`](https://github.com/NVIDIA/SkillSpector/blob/main/skillspector/nodes/report.py)
-(`_compute_risk_score` for the weighting, `_severity_from_score` for the
-bucketing). The SkillSpector CLI exits non-zero when `risk_score > 50`,
-which is the same boundary as the `HIGH` band.
-
-### 2. The ClawHub evaluation
-
-Two NVIDIA-affiliated artifacts describe how SkillSpector performs in
-the wild:
-
-- ClawHub paper, "ClawHub: A large-scale safety analysis of Claude
-  Skills" (arxiv.org/html/2606.01494v1).
-- OpenClaw blog, "SkillSpector at scale on ClawHub"
-  (openclaw.ai/blog/openclaw-nvidia-skill-security).
-- Hugging Face dataset of per-skill signals
-  (huggingface.co/datasets/OpenClaw/clawhub-security-signals).
-
-Two numbers from those sources drive our calibration:
-
-- On 67,453 real Claude skills, SkillSpector returned at least one
-  finding on roughly 49% of them. That is the population our verdict
-  policy will see most of, so a threshold at SkillSpector's MEDIUM band
-  would flag close to half the catalogue as "suspicious," which is not
-  useful.
-- On a labelled subset of known-malicious skills, SkillSpector alone
-  caught about 6.8% (recall), while VirusTotal Premium caught about
-  72.8%. SkillSpector is good for surfacing risky behaviour patterns;
-  it is not a reliable malicious-classifier on its own.
-
-The paper's own pipeline (`ClawScan`) treats SkillSpector as one of
-several signals fed into an LLM-as-judge. That tells us SkillSpector's
-output is best read as advisory until we add more scanners.
-
-### 3. Our existing in-tree results
-
-The current `coder/registry` in-tree catalogue contains five skills:
-`coder/coder-modules`, `coder/coder-templates`, `coder/modules`,
-`coder/templates`, and `coder/setup`. Under the chosen thresholds:
-
-| Skill                  | SkillSpector score | Verdict     |
-|------------------------|-------------------:|-------------|
-| `coder/coder-modules`  | 0                  | `clean`     |
-| `coder/coder-templates`| 0                  | `clean`     |
-| `coder/modules`        | 0                  | `clean`     |
-| `coder/templates`      | 10                 | `clean`     |
-| `coder/setup`          | 100                | `malicious` |
-
-The previous thresholds (40/75) produced the same outcome for these
-five inputs. The change does not silence any signal that was firing
-today; it raises the bar that future skills must clear before being
-called out.
-
-## Threshold choices
-
-```yaml
-verdict:
-  malicious_risk_score: 81
-  suspicious_risk_score: 51
-```
-
-- `malicious_risk_score: 81` matches SkillSpector's `CRITICAL` band.
-  Anything SkillSpector itself describes as `CRITICAL` /
-  `DO_NOT_INSTALL` (top decile) becomes our `malicious` verdict.
-- `suspicious_risk_score: 51` matches the `HIGH` band, which is also
-  the score at which the SkillSpector CLI starts exiting non-zero. A
-  skill that SkillSpector says is `HIGH` / `DO_NOT_INSTALL` becomes
-  our `suspicious` verdict (the registry-server badge surfaces this as
-  "Review before installing").
-- Skills in the `MEDIUM` / `CAUTION` band (21-50) stay `clean` at the
-  catalogue level. Their findings are still rendered on the per-skill
-  page so reviewers can drill in, but they do not trigger a badge.
-  This avoids broadcasting the ~half-of-catalogue base rate that
-  ClawHub measured.
-
-## What we did not change (and why)
-
-- We did not raise `suspicious_risk_score` above `51`. SkillSpector
-  itself escalates at that boundary; staying in sync keeps the
-  recommendation field on the per-skill page consistent with the
-  badge on the catalogue page.
-- We did not add a separate "low confidence" verdict. A fourth tier
-  buys us little until we have a second scanner to combine signals
-  with. The schema's `unknown` verdict already covers the
-  "could not assess" case, which is the only failure mode v1 cares
-  about.
-- We did not move thresholds into the published `latest.json`. The
-  SPA uses defaults that match `config.yaml`. If a future change makes
-  the artifact policy-aware, plumb the values through and drop the
-  defaults from `VerdictExplanation.tsx`.
-
-## When to revisit
-
-Re-run this analysis when any of:
-
-- A new scanner (gitleaks, Semgrep, VirusTotal Premium, ClawScan, etc.)
-  joins the pipeline. The combined verdict logic in
-  `scanner/verdict.py` will need a new branch and most likely
-  different thresholds per signal.
-- SkillSpector bumps its scoring weights or rule catalogue in a way
-  that shifts where its bands sit. The pinned commit in `config.yaml`
-  protects us from drifting silently; a deliberate bump should walk
-  through this doc.
-- We observe a real-world skill that lands in an obviously wrong
-  bucket (false positive or false negative). Open a tracking issue,
-  link it from this doc, and adjust with evidence in the next PR.
diff --git a/scanner/verdict.py b/scanner/verdict.py
index 7e6d46a..8e77360 100644
--- a/scanner/verdict.py
+++ b/scanner/verdict.py
@@ -48,7 +48,7 @@ def evaluate(
 
     thresholds = config.get("verdict") or {}
     # Defaults match config.yaml. Keep these in sync with
-    # docs/CALIBRATION.md and VerdictExplanation.tsx's defaults.
+    # VerdictExplanation.tsx's defaults.
     malicious_at = int(thresholds.get("malicious_risk_score", 81))
     suspicious_at = int(thresholds.get("suspicious_risk_score", 51))
 
diff --git a/site/src/components/RiskBar/RiskBar.tsx b/site/src/components/RiskBar/RiskBar.tsx
index f515c98..94bdb49 100644
--- a/site/src/components/RiskBar/RiskBar.tsx
+++ b/site/src/components/RiskBar/RiskBar.tsx
@@ -19,7 +19,7 @@ interface RiskBarProps {
    * Optional cutoffs (0..100) for the suspicious and malicious bands.
    * When supplied, the bar renders thin tick marks at those positions so
    * the user can see how close a score is to escalating. Defaults match
-   * the policy in config.yaml and docs/CALIBRATION.md.
+   * the policy in config.yaml.
    */
   suspicious_at?: number;
   malicious_at?: number;
diff --git a/site/src/components/VerdictExplanation/VerdictExplanation.tsx b/site/src/components/VerdictExplanation/VerdictExplanation.tsx
index 4db9f35..9d35217 100644
--- a/site/src/components/VerdictExplanation/VerdictExplanation.tsx
+++ b/site/src/components/VerdictExplanation/VerdictExplanation.tsx
@@ -374,8 +374,7 @@ const CategoryCard: FC<CategoryCardProps> = ({ group }) => {
 export const VerdictExplanation: FC<VerdictExplanationProps> = ({
   skill,
   // Defaults match config.yaml and scanner/verdict.py. They are also
-  // SkillSpector's own HIGH and CRITICAL band edges; see
-  // docs/CALIBRATION.md for the calibration write-up.
+  // SkillSpector's own HIGH and CRITICAL band edges.
   malicious_at = 81,
   suspicious_at = 51,
   className,