diff --git a/.github/workflows/scan.yaml b/.github/workflows/scan.yaml index c5ea763..f40714c 100644 --- a/.github/workflows/scan.yaml +++ b/.github/workflows/scan.yaml @@ -88,22 +88,43 @@ jobs: echo "drift=true" >> "$GITHUB_OUTPUT" echo "Skill path source/${{ matrix.skill_path }} not present upstream; will report catalogue drift." >&2 fi + - name: Determine LLM mode + id: llm + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + set -euo pipefail + if [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then + echo "extra_flags=" >> "$GITHUB_OUTPUT" + echo "SkillSpector LLM mode: enabled (anthropic provider, api.anthropic.com)." >&2 + else + echo "extra_flags=--no-llm" >> "$GITHUB_OUTPUT" + echo "::warning::ANTHROPIC_API_KEY secret not set; SkillSpector will run with --no-llm. Set the secret on this repo to enable the LLM semantic pass." + fi - name: SkillSpector (JSON) if: steps.path_check.outputs.drift == 'false' continue-on-error: true + env: + SKILLSPECTOR_PROVIDER: anthropic + SKILLSPECTOR_MODEL: claude-sonnet-4-6 + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} run: | mkdir -p out skillspector scan "source/${{ matrix.skill_path }}" \ - --no-llm \ + ${{ steps.llm.outputs.extra_flags }} \ --format json \ --output "out/skillspector.json" || true - name: SkillSpector (SARIF) if: steps.path_check.outputs.drift == 'false' continue-on-error: true + env: + SKILLSPECTOR_PROVIDER: anthropic + SKILLSPECTOR_MODEL: claude-sonnet-4-6 + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} run: | mkdir -p out skillspector scan "source/${{ matrix.skill_path }}" \ - --no-llm \ + ${{ steps.llm.outputs.extra_flags }} \ --format sarif \ --output "out/skillspector.sarif" || true - name: Combine diff --git a/README.md b/README.md index f662a82..99d198e 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,10 @@ Every 6 hours, the scheduled workflow in this repo: 1. 
Enumerates every skill in `coder/registry` (both the in-tree `.agents/skills/` format and the future external-sources format). 2. Shallow-clones each source repo. -3. Runs [NVIDIA SkillSpector](https://github.com/NVIDIA/SkillSpector) in - `--no-llm` static mode over the upstream content. +3. Runs [NVIDIA SkillSpector](https://github.com/NVIDIA/SkillSpector) over + the upstream content. The scheduled scan runs SkillSpector's LLM + semantic pass when the workflow's LLM credential secret is + configured, and falls back to `--no-llm` static-only mode otherwise. 4. Builds a per-skill verdict (`clean`, `suspicious`, `malicious`, `unknown`) from `risk_score` plus the thresholds in `config.yaml`. 5. Builds the React SPA in `site/` and ships it together with @@ -97,7 +99,13 @@ This scanner is data-driven. To run it against a different registry: "GitHub Actions"). 4. Set Actions workflow permissions to "Read and write" so the publish-release job can create releases. -5. Enable Actions. +5. To enable the LLM semantic pass, set the credential secret matching + `config.yaml`'s `scanners.skillspector.llm.provider` on your fork + (for the default `anthropic` provider, `ANTHROPIC_API_KEY`), AND + confirm `.github/workflows/scan.yaml` exports that secret into the + SkillSpector step. Static-only mode (without the secret) is the + default and works out of the box. +6. Enable Actions. No source changes required for catalogue changes. @@ -112,10 +120,7 @@ verdict: ``` SkillSpector's `risk_score` (0-100) is the only input. The thresholds -are aligned to SkillSpector's own `HIGH` and `CRITICAL` bands; -[`docs/CALIBRATION.md`](./docs/CALIBRATION.md) walks through the -evidence (SkillSpector source, the ClawHub paper, our in-tree -catalogue) behind the chosen numbers. +are aligned to SkillSpector's own `HIGH` and `CRITICAL` bands. 
The architecture keeps room for additional scanners (gitleaks, Semgrep, VirusTotal Premium, etc.); adding one is a new module under `scanner/`, diff --git a/config.yaml b/config.yaml index b20462a..18a9019 100644 --- a/config.yaml +++ b/config.yaml @@ -6,11 +6,13 @@ config_version: 1 catalogue: - # Where to enumerate skills from. Both the current production format - # (in-tree under .agents/skills/) and the future external-sources - # format (registry/<namespace>/skills/README.md with sources[].repo) are - # supported. When both name the same slug, the external-sources entry - # wins. + # Skills are declared by per-namespace README.md files under + # registry/<namespace>/skills/ in the catalogue repo. Each README's + # frontmatter lists sources[].repo plus per-skill overrides. This is + # the canonical declaration; the in-tree .agents/skills/ format is + # supported in scanner/enumerate.py for forks that need it but is + # not enabled here because coder/registry duplicates the same + # upstream skills across both layouts under different slugs. registry_repo: owner: coder repo: registry @@ -22,13 +24,6 @@ catalogue: # has its frontmatter parsed for sources[].repo plus per-skill # overrides keyed by slug. readme_glob: registry/*/skills/README.md - in_tree: - enabled: true - # The namespace is fixed for in-tree skills today (coder). - namespace: coder - # Path glob inside the catalogue repo. Each /SKILL.md is one - # skill row in the matrix. - base_path: .agents/skills scanners: skillspector: @@ -39,8 +34,12 @@ scanners: # so a bumper bot lives outside the loop until the upstream # publishes to PyPI and the pin can move into pyproject.toml. pin: "skillspector @ git+https://github.com/NVIDIA/SkillSpector.git@2eb844780ab163f01468ecf142c40a2ec0fcaec0" - flags: - - "--no-llm" + # Empty so .github/workflows/scan.yaml can append --no-llm + # dynamically based on whether the LLM credential secret is set. 
+ flags: [] + llm: + provider: anthropic + model: "claude-sonnet-4-6" # Per-skill verdict policy. v1 has one input (SkillSpector risk_score). # When more scanners join the pipeline we add new threshold fields here @@ -54,13 +53,12 @@ scanners: # 51-80 HIGH DO_NOT_INSTALL -> verdict: suspicious # 81-100 CRITICAL DO_NOT_INSTALL -> verdict: malicious # -# Rationale and source links live in docs/CALIBRATION.md. Short version: -# SkillSpector's static-analysis layer is loud on real catalogues (the -# ClawHub paper measured a ~49% positive rate on 67k skills) and is -# advisory rather than authoritative, so we only escalate above its -# HIGH cutoff. CAUTION-band findings still appear in the per-skill page -# so reviewers can see them; we just do not flag the skill as suspicious -# at the catalogue level. +# Rationale: SkillSpector's static-analysis layer is loud on real +# catalogues (the ClawHub paper measured a ~49% positive rate on 67k +# skills) and is advisory rather than authoritative, so we only +# escalate above its HIGH cutoff. CAUTION-band findings still appear +# on the per-skill page so reviewers can see them; we just do not +# flag the skill as suspicious at the catalogue level. verdict: malicious_risk_score: 81 suspicious_risk_score: 51 diff --git a/docs/CALIBRATION.md b/docs/CALIBRATION.md deleted file mode 100644 index e4cf5a0..0000000 --- a/docs/CALIBRATION.md +++ /dev/null @@ -1,132 +0,0 @@ -# Verdict threshold calibration - -This document records how the verdict thresholds in `config.yaml` were -chosen. The thresholds are not arbitrary: they are aligned to -SkillSpector's own internal severity bands and informed by the published -evaluation of SkillSpector against a large real-world skill catalogue. - -If you bump the thresholds, update this doc in the same PR. Numbers that -nobody can defend later are how scanners drift into either uselessness -or boy-who-cried-wolf territory. - -## Inputs we are calibrating against - -### 1. 
SkillSpector's published severity bands - -NVIDIA's SkillSpector computes `risk_assessment.score` on a 0-100 scale -from rule hits weighted by severity, plus a 1.3x multiplier when the -skill carries executable scripts. The score is then bucketed into a -named severity and a `recommendation` field: - -| Score range | `severity` | `recommendation` | -|-------------|------------|---------------------| -| 0-20 | `LOW` | `SAFE` | -| 21-50 | `MEDIUM` | `CAUTION` | -| 51-80 | `HIGH` | `DO_NOT_INSTALL` | -| 81-100 | `CRITICAL` | `DO_NOT_INSTALL` | - -Source: [`skillspector/nodes/report.py`](https://github.com/NVIDIA/SkillSpector/blob/main/skillspector/nodes/report.py) -(`_compute_risk_score` for the weighting, `_severity_from_score` for the -bucketing). The SkillSpector CLI exits non-zero when `risk_score > 50`, -which is the same boundary as the `HIGH` band. - -### 2. The ClawHub evaluation - -Two NVIDIA-affiliated artifacts describe how SkillSpector performs in -the wild: - -- ClawHub paper, "ClawHub: A large-scale safety analysis of Claude - Skills" (arxiv.org/html/2606.01494v1). -- OpenClaw blog, "SkillSpector at scale on ClawHub" - (openclaw.ai/blog/openclaw-nvidia-skill-security). -- Hugging Face dataset of per-skill signals - (huggingface.co/datasets/OpenClaw/clawhub-security-signals). - -Two numbers from those sources drive our calibration: - -- On 67,453 real Claude skills, SkillSpector returned at least one - finding on roughly 49% of them. That is the population our verdict - policy will see most of, so a threshold at SkillSpector's MEDIUM band - would flag close to half the catalogue as "suspicious," which is not - useful. -- On a labelled subset of known-malicious skills, SkillSpector alone - caught about 6.8% (recall), while VirusTotal Premium caught about - 72.8%. SkillSpector is good for surfacing risky behaviour patterns; - it is not a reliable malicious-classifier on its own. 
- -The paper's own pipeline (`ClawScan`) treats SkillSpector as one of -several signals fed into an LLM-as-judge. That tells us SkillSpector's -output is best read as advisory until we add more scanners. - -### 3. Our existing in-tree results - -The current `coder/registry` in-tree catalogue contains five skills: -`coder/coder-modules`, `coder/coder-templates`, `coder/modules`, -`coder/templates`, and `coder/setup`. Under the chosen thresholds: - -| Skill | SkillSpector score | Verdict | -|------------------------|-------------------:|-------------| -| `coder/coder-modules` | 0 | `clean` | -| `coder/coder-templates`| 0 | `clean` | -| `coder/modules` | 0 | `clean` | -| `coder/templates` | 10 | `clean` | -| `coder/setup` | 100 | `malicious` | - -The previous thresholds (40/75) produced the same outcome for these -five inputs. The change does not silence any signal that was firing -today; it raises the bar that future skills must clear before being -called out. - -## Threshold choices - -```yaml -verdict: - malicious_risk_score: 81 - suspicious_risk_score: 51 -``` - -- `malicious_risk_score: 81` matches SkillSpector's `CRITICAL` band. - Anything SkillSpector itself describes as `CRITICAL` / - `DO_NOT_INSTALL` (top decile) becomes our `malicious` verdict. -- `suspicious_risk_score: 51` matches the `HIGH` band, which is also - the score at which the SkillSpector CLI starts exiting non-zero. A - skill that SkillSpector says is `HIGH` / `DO_NOT_INSTALL` becomes - our `suspicious` verdict (the registry-server badge surfaces this as - "Review before installing"). -- Skills in the `MEDIUM` / `CAUTION` band (21-50) stay `clean` at the - catalogue level. Their findings are still rendered on the per-skill - page so reviewers can drill in, but they do not trigger a badge. - This avoids broadcasting the ~half-of-catalogue base rate that - ClawHub measured. - -## What we did not change (and why) - -- We did not raise `suspicious_risk_score` above `51`. 
SkillSpector - itself escalates at that boundary; staying in sync keeps the - recommendation field on the per-skill page consistent with the - badge on the catalogue page. -- We did not add a separate "low confidence" verdict. A fourth tier - buys us little until we have a second scanner to combine signals - with. The schema's `unknown` verdict already covers the - "could not assess" case, which is the only failure mode v1 cares - about. -- We did not move thresholds into the published `latest.json`. The - SPA uses defaults that match `config.yaml`. If a future change makes - the artifact policy-aware, plumb the values through and drop the - defaults from `VerdictExplanation.tsx`. - -## When to revisit - -Re-run this analysis when any of: - -- A new scanner (gitleaks, Semgrep, VirusTotal Premium, ClawScan, etc.) - joins the pipeline. The combined verdict logic in - `scanner/verdict.py` will need a new branch and most likely - different thresholds per signal. -- SkillSpector bumps its scoring weights or rule catalogue in a way - that shifts where its bands sit. The pinned commit in `config.yaml` - protects us from drifting silently; a deliberate bump should walk - through this doc. -- We observe a real-world skill that lands in an obviously wrong - bucket (false positive or false negative). Open a tracking issue, - link it from this doc, and adjust with evidence in the next PR. diff --git a/scanner/verdict.py b/scanner/verdict.py index 7e6d46a..8e77360 100644 --- a/scanner/verdict.py +++ b/scanner/verdict.py @@ -48,7 +48,7 @@ def evaluate( thresholds = config.get("verdict") or {} # Defaults match config.yaml. Keep these in sync with - # docs/CALIBRATION.md and VerdictExplanation.tsx's defaults. + # VerdictExplanation.tsx's defaults. 
malicious_at = int(thresholds.get("malicious_risk_score", 81)) suspicious_at = int(thresholds.get("suspicious_risk_score", 51)) diff --git a/site/src/components/RiskBar/RiskBar.tsx b/site/src/components/RiskBar/RiskBar.tsx index f515c98..94bdb49 100644 --- a/site/src/components/RiskBar/RiskBar.tsx +++ b/site/src/components/RiskBar/RiskBar.tsx @@ -19,7 +19,7 @@ interface RiskBarProps { * Optional cutoffs (0..100) for the suspicious and malicious bands. * When supplied, the bar renders thin tick marks at those positions so * the user can see how close a score is to escalating. Defaults match - * the policy in config.yaml and docs/CALIBRATION.md. + * the policy in config.yaml. */ suspicious_at?: number; malicious_at?: number; diff --git a/site/src/components/VerdictExplanation/VerdictExplanation.tsx b/site/src/components/VerdictExplanation/VerdictExplanation.tsx index 4db9f35..9d35217 100644 --- a/site/src/components/VerdictExplanation/VerdictExplanation.tsx +++ b/site/src/components/VerdictExplanation/VerdictExplanation.tsx @@ -374,8 +374,7 @@ const CategoryCard: FC = ({ group }) => { export const VerdictExplanation: FC = ({ skill, // Defaults match config.yaml and scanner/verdict.py. They are also - // SkillSpector's own HIGH and CRITICAL band edges; see - // docs/CALIBRATION.md for the calibration write-up. + // SkillSpector's own HIGH and CRITICAL band edges. malicious_at = 81, suspicious_at = 51, className,