Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
a21fe17
feat(config.yaml): add scanners.skillspector.llm block
DevelopmentCats Jun 22, 2026
c3f42db
docs(CALIBRATION.md): add LLM semantic pass section
DevelopmentCats Jun 22, 2026
c10bf52
docs(README.md): document LLM mode and the one-time secret setup
DevelopmentCats Jun 22, 2026
8c57f9a
feat(config.yaml): switch LLM provider to Anthropic Sonnet 4.6
DevelopmentCats Jun 22, 2026
3333b81
docs(README.md): point setup at Anthropic provider, not nv_build
DevelopmentCats Jun 22, 2026
28c05fe
fix(config.yaml): reframe LLM block as contract, swap to openai+aibridge
DevelopmentCats Jun 23, 2026
6bbb6bc
fix(docs/CALIBRATION.md): real measured numbers + workflow-gap note
DevelopmentCats Jun 23, 2026
b301daa
fix(README.md): clarify the workflow-file dependency, point setup at …
DevelopmentCats Jun 23, 2026
0fea10e
fix(config.yaml): swap LLM provider to Anthropic direct, not aibridge
DevelopmentCats Jun 23, 2026
f10bfad
docs(README.md): point setup at ANTHROPIC_API_KEY, drop OPENAI_BASE_U…
DevelopmentCats Jun 23, 2026
d6c9c5d
docs(CALIBRATION.md): record Anthropic-direct decision and model-swap…
DevelopmentCats Jun 23, 2026
9f40801
feat(scanner): bump LLM model from claude-sonnet-4-5 to claude-sonnet…
DevelopmentCats Jun 23, 2026
3170357
chore: strip explanatory cruft from config.yaml and PR-state callouts
DevelopmentCats Jun 23, 2026
82d9a32
docs(README.md): drop reference to LLM-on-vs-off measurement
DevelopmentCats Jun 24, 2026
64414e9
chore: delete docs/CALIBRATION.md, drop all references to it
DevelopmentCats Jun 24, 2026
85911ec
docs(README.md): drop the 'One-time setup on the repo' section
DevelopmentCats Jun 24, 2026
0f10e71
feat(scan.yaml): wire ANTHROPIC_API_KEY + SKILLSPECTOR env into Skill…
DevelopmentCats Jun 24, 2026
2848e3c
fix(config.yaml): disable in_tree enumeration to stop duplicate skills
DevelopmentCats Jun 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions .github/workflows/scan.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,22 +88,43 @@ jobs:
echo "drift=true" >> "$GITHUB_OUTPUT"
echo "Skill path source/${{ matrix.skill_path }} not present upstream; will report catalogue drift." >&2
fi
- name: Determine LLM mode
id: llm
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
set -euo pipefail
if [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then
echo "extra_flags=" >> "$GITHUB_OUTPUT"
echo "SkillSpector LLM mode: enabled (anthropic provider, api.anthropic.com)." >&2
else
echo "extra_flags=--no-llm" >> "$GITHUB_OUTPUT"
echo "::warning::ANTHROPIC_API_KEY secret not set; SkillSpector will run with --no-llm. Set the secret on this repo to enable the LLM semantic pass."
fi
- name: SkillSpector (JSON)
if: steps.path_check.outputs.drift == 'false'
continue-on-error: true
env:
SKILLSPECTOR_PROVIDER: anthropic
SKILLSPECTOR_MODEL: claude-sonnet-4-6
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
mkdir -p out
skillspector scan "source/${{ matrix.skill_path }}" \
--no-llm \
${{ steps.llm.outputs.extra_flags }} \
--format json \
--output "out/skillspector.json" || true
- name: SkillSpector (SARIF)
if: steps.path_check.outputs.drift == 'false'
continue-on-error: true
env:
SKILLSPECTOR_PROVIDER: anthropic
SKILLSPECTOR_MODEL: claude-sonnet-4-6
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
mkdir -p out
skillspector scan "source/${{ matrix.skill_path }}" \
--no-llm \
${{ steps.llm.outputs.extra_flags }} \
--format sarif \
--output "out/skillspector.sarif" || true
- name: Combine
Expand Down
19 changes: 12 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ Every 6 hours, the scheduled workflow in this repo:
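1. Enumerates every skill declared in `coder/registry` through the
external-sources format (per-namespace `registry/<ns>/skills/README.md` files). The in-tree `.agents/skills/` format remains supported by the scanner but is disabled in `config.yaml`.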
2. Shallow-clones each source repo.
3. Runs [NVIDIA SkillSpector](https://github.com/NVIDIA/SkillSpector) in
`--no-llm` static mode over the upstream content.
3. Runs [NVIDIA SkillSpector](https://github.com/NVIDIA/SkillSpector) over
the upstream content. When the `ANTHROPIC_API_KEY` secret is configured
on the repo, the scheduled scan runs SkillSpector with its LLM semantic
pass enabled; otherwise it falls back to `--no-llm` static-only mode.
4. Builds a per-skill verdict (`clean`, `suspicious`, `malicious`,
`unknown`) from `risk_score` plus the thresholds in `config.yaml`.
5. Builds the React SPA in `site/` and ships it together with
Expand Down Expand Up @@ -97,7 +99,13 @@ This scanner is data-driven. To run it against a different registry:
"GitHub Actions").
4. Set Actions workflow permissions to "Read and write" so the
publish-release job can create releases.
5. Enable Actions.
5. To enable the LLM semantic pass, add the credential secret for the
provider named in `config.yaml`'s `scanners.skillspector.llm.provider`
to your fork (`ANTHROPIC_API_KEY` for the default `anthropic` provider),
and confirm that `.github/workflows/scan.yaml` passes that secret to the
SkillSpector steps. Without the secret the scan runs in static-only
mode, which is the default and works out of the box.
6. Enable Actions.

No source changes required for catalogue changes.

Expand All @@ -112,10 +120,7 @@ verdict:
```

SkillSpector's `risk_score` (0-100) is the only input. The thresholds
are aligned to SkillSpector's own `HIGH` and `CRITICAL` bands;
[`docs/CALIBRATION.md`](./docs/CALIBRATION.md) walks through the
evidence (SkillSpector source, the ClawHub paper, our in-tree
catalogue) behind the chosen numbers.
are aligned to SkillSpector's own `HIGH` and `CRITICAL` bands.

The architecture keeps room for additional scanners (gitleaks, Semgrep,
VirusTotal Premium, etc.); adding one is a new module under `scanner/`,
Expand Down
40 changes: 19 additions & 21 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
config_version: 1

catalogue:
# Where to enumerate skills from. Both the current production format
# (in-tree under .agents/skills/) and the future external-sources
# format (registry/<ns>/skills/README.md with sources[].repo) are
# supported. When both name the same slug, the external-sources entry
# wins.
# Skills are declared by per-namespace README.md files under
# registry/<ns>/skills/ in the catalogue repo. Each README's
# frontmatter lists sources[].repo plus per-skill overrides. This is
# the canonical declaration; the in-tree .agents/skills/ format is
# supported in scanner/enumerate.py for forks that need it but is
# not enabled here because coder/registry duplicates the same
# upstream skills across both layouts under different slugs.
registry_repo:
owner: coder
repo: registry
Expand All @@ -22,13 +24,6 @@ catalogue:
# has its frontmatter parsed for sources[].repo plus per-skill
# overrides keyed by slug.
readme_glob: registry/*/skills/README.md
in_tree:
enabled: true
# The namespace is fixed for in-tree skills today (coder).
namespace: coder
# Path glob inside the catalogue repo. Each <slug>/SKILL.md is one
# skill row in the matrix.
base_path: .agents/skills

scanners:
skillspector:
Expand All @@ -39,8 +34,12 @@ scanners:
# so a bumper bot lives outside the loop until the upstream
# publishes to PyPI and the pin can move into pyproject.toml.
pin: "skillspector @ git+https://github.com/NVIDIA/SkillSpector.git@2eb844780ab163f01468ecf142c40a2ec0fcaec0"
flags:
- "--no-llm"
# Left empty on purpose: .github/workflows/scan.yaml appends --no-llm
# at run time when the LLM credential secret is not set.
flags: []
llm:
provider: anthropic
model: "claude-sonnet-4-6"

# Per-skill verdict policy. v1 has one input (SkillSpector risk_score).
# When more scanners join the pipeline we add new threshold fields here
Expand All @@ -54,13 +53,12 @@ scanners:
# 51-80 HIGH DO_NOT_INSTALL -> verdict: suspicious
# 81-100 CRITICAL DO_NOT_INSTALL -> verdict: malicious
#
# Rationale and source links live in docs/CALIBRATION.md. Short version:
# SkillSpector's static-analysis layer is loud on real catalogues (the
# ClawHub paper measured a ~49% positive rate on 67k skills) and is
# advisory rather than authoritative, so we only escalate above its
# HIGH cutoff. CAUTION-band findings still appear in the per-skill page
# so reviewers can see them; we just do not flag the skill as suspicious
# at the catalogue level.
# Rationale: SkillSpector's static-analysis layer is loud on real
# catalogues (the ClawHub paper measured a ~49% positive rate on 67k
# skills) and is advisory rather than authoritative, so we only
# escalate above its HIGH cutoff. CAUTION-band findings still appear
# on the per-skill page so reviewers can see them; we just do not
# flag the skill as suspicious at the catalogue level.
verdict:
malicious_risk_score: 81
suspicious_risk_score: 51
Expand Down
132 changes: 0 additions & 132 deletions docs/CALIBRATION.md

This file was deleted.

2 changes: 1 addition & 1 deletion scanner/verdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def evaluate(

thresholds = config.get("verdict") or {}
# Defaults match config.yaml. Keep these in sync with
# docs/CALIBRATION.md and VerdictExplanation.tsx's defaults.
# VerdictExplanation.tsx's defaults.
malicious_at = int(thresholds.get("malicious_risk_score", 81))
suspicious_at = int(thresholds.get("suspicious_risk_score", 51))

Expand Down
2 changes: 1 addition & 1 deletion site/src/components/RiskBar/RiskBar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ interface RiskBarProps {
* Optional cutoffs (0..100) for the suspicious and malicious bands.
* When supplied, the bar renders thin tick marks at those positions so
* the user can see how close a score is to escalating. Defaults match
* the policy in config.yaml and docs/CALIBRATION.md.
* the policy in config.yaml.
*/
suspicious_at?: number;
malicious_at?: number;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,8 +374,7 @@ const CategoryCard: FC<CategoryCardProps> = ({ group }) => {
export const VerdictExplanation: FC<VerdictExplanationProps> = ({
skill,
// Defaults match config.yaml and scanner/verdict.py. They are also
// SkillSpector's own HIGH and CRITICAL band edges; see
// docs/CALIBRATION.md for the calibration write-up.
// SkillSpector's own HIGH and CRITICAL band edges.
malicious_at = 81,
suspicious_at = 51,
className,
Expand Down