epiforecasts · kathsherratt · May 18, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/report/_quarto/.gitignore → R/.gitignore b/report/_quarto/.gitignore → R/.gitignore
diff --git a/R/README.md b/R/README.md
@@ -0,0 +1,69 @@
+# R/
+
+Analysis scripts for evaluating COVID-19 forecast accuracy across models. Scripts fall into two categories: pipeline scripts run in sequence to produce results, and utility scripts called by the pipeline.
+
+## Pipeline scripts
+
+Run in this order to reproduce the analysis:
+
+| Script | Purpose | Inputs | Outputs |
+|---|---|---|---|
+| `process-score.R` | Score forecasts using WIS on natural and log scales; normalises predictions and observations to per-100,000 population | `data/covid19-forecast-hub-europe.parquet`, `data/observed-*.csv`, `data/populations.csv` | `data/scores-raw-case.csv`, `data/scores-raw-death.csv` |
+| `process-data.R` | Join scores with covariates (model classification, variant phase, observed incidence, trend) into a single analysis-ready dataframe | `data/scores-raw-*.csv`, `data/observed-*.csv`, `data/model-classification.csv` | In-memory dataframe via `process_data()` |
+| `analysis-descriptive.R` | Descriptive summaries: bootstrap confidence intervals, publication-ready Table 1, score distribution ridge plots, time-series plots | `process_data()` output | Plot and table objects |
+| `analysis-model.R` | Fit GAMM to WIS isolating effects of model structure (`Method`) and geographic scope (`CountryTargets`) after adjusting for confounders | `process_data()` output | `output/results.rds`, `output/plots/check_*.pdf` |
+| `dag-check.R` | Define and visualise causal DAG; derive minimal adjustment set for the exposure–outcome relationship | — | Plot objects |
+| `plot-model-flow.R` | Generate model inclusion/exclusion flowchart applying sequential eligibility criteria | `data/covid19-forecast-hub-europe.parquet` | `output/plots/flowchart.png` |
+| `plot-model-results.R` | Forest plots of GAMM random effects (unadjusted and adjusted) by model, method, and target scope | `output/results.rds`, `process_data()` output | ggplot objects |
+
+## Utility scripts
+
+Called by pipeline scripts; not intended to be run directly.
+
+| Script | Key functions | Purpose |
+|---|---|---|
+| `utils-data.R` | `get_forecasts()`, `download_obs()`, `download_pop()` | Read forecast parquet, download observed incidence and population data from JHU/ECDC sources |
+| `utils-metadata.R` | `get_metadata_processed()`, `write_metadata()` | Fetch model metadata from GitHub; read and write qualitative model classifications to/from Google Sheets |
+| `utils-variants.R` | `classify_variant_phases()` | Classify dominant SARS-CoV-2 variant per country-week using ECDC, UK UKHSA, and Swiss surveillance data |
+
+## Data flow
+
+```
+covid19-forecast-hub-europe.parquet
+        │
+        ▼
+utils-data.R ──────────────────────────────► data/observed-*.csv
+        │                                    data/populations.csv
+        │
+        ▼
+process-score.R ───────────────────────────► data/scores-raw-case.csv
+        │                                    data/scores-raw-death.csv
+        │
+        ├── data/model-classification.csv  ◄── utils-metadata.R (Google Sheets)
+        ├── data/observed-*.csv
+        └── utils-variants.R (ECDC/UKHSA/CH)
+                │
+                ▼
+        process-data.R ── process_data() ──► in-memory analysis dataframe
+                │
+                ├──► analysis-descriptive.R ── tables, plots
+                │
+                └──► analysis-model.R ────────► output/results.rds
+                                                output/plots/check_*.pdf
+                                                        │
+                                                        ▼
+                                              plot-model-results.R ── forest plots
+```
+
+## Model specification
+
+The main model (`analysis-model.R`) fits a GAMM via `mgcv::bam()`:
+
+- **Family:** Gaussian with log link
+- **Fitting:** fREML with `discrete = TRUE`
+- **Fixed effects of interest:** `Method` (model structure), `CountryTargets` (geographic scope)
+- **Adjusting for:** `Incidence` (cubic spline), `Trend`, `Horizon`, `VariantPhase`, `Location`, `Model`
+- **Random effects basis:** `re` for Location, VariantPhase, and Model
+- **Outcomes:** cases and deaths scored separately on natural and log scales
+
+Univariate models are fitted first (one term each) to obtain unadjusted estimates; a joint model containing all terms yields adjusted estimates. Both are saved to `output/results.rds`.
diff --git a/R/analysis-model.R b/R/analysis-model.R
@@ -47,7 +47,7 @@ model_wis <- function(scoring_scale = "log", output_dir = "output") {
     model = wis ~ s(Model, bs = "re")
   )
 
-  # Full joint model
+  # Full model
   m.formula_joint <- wis ~
     s(Method, bs = "re") +
     s(CountryTargets, bs = "re") +

diff --git a/R/dag-check.R b/R/dag-check.R
@@ -1,67 +1,45 @@
-# Define DAG for model structure and WIS
-# Variables:
-# model_structure: model structure (exposure)
-# wis: weighted interval score (outcome)
-# team_resources: resources available to the team (latent)
-# transmission_dynamics: underlying transmission dynamics (latent)
-# country_targets: whether the model forecasts for one or multiple countries
-# variant: dominant variant phase
-# country: country target location
-# trend: increasing, decreasing, or stable epidemic trend
-# horizon: forecast horizon
-
+# Define DAG: methods -> score
 library(dagitty)
 library(ggdag)
 library(ggplot2)
 
-dag <- dagitty('dag {
-  model_structure[exposure]
-  wis [outcome]
-  model_team_resources[latent]
-  transmission_dynamics[latent]
-  
-  model_team_resources -> model_structure
-  model_team_resources -> wis
-  model_team_resources -> country_targets
-  
-  model_country_targets -> model_structure
-  model_country_targets -> wis
-
-  country -> variant
-  variant -> transmission_dynamics
-  transmission_dynamics -> country
-  transmission_dynamics -> trend
-
-  transmission_dynamics -> wis
-  variant -> wis
-  trend -> wis
-  country -> wis
-  horizon -> wis
-
-  model_structure -> wis
-}')
-
-cat("\n--- Minimal adjustment sets for TOTAL effect of model_structure on wis ---\n")
-print(adjustmentSets(dag, effect = "total"))
-
-cat("\n--- Minimal adjustment sets for DIRECT effect of model_structure on wis ---\n")
-print(adjustmentSets(dag, effect = "direct"))
+dag <- dagitty('
+dag {
+bb="-6.421,-7.624,6.826,6.874"
+"Accuracy score" [outcome,pos="5.506,2.291"]
+"Epidemic dynamics" [latent,pos="1.743,-0.912"]
+"Model structure" [exposure,pos="-2.105,2.885"]
+"Modeller strategy" [latent,pos="-2.133,-1.027"]
+"Observed incidence" [adjusted,pos="3.234,0.948"]
+"Single/multi-country" [pos="0.210,1.926"]
+Country [adjusted,pos="-0.231,-0.950"]
+Horizon [adjusted,pos="2.155,1.907"]
+Model [adjusted,pos="1.658,3.000"]
+Prediction [pos="3.418,3.058"]
+Trend [adjusted,pos="3.262,-0.164"]
+Variant [adjusted,pos="0.210,-1.794"]
+"Epidemic dynamics" -> "Observed incidence"
+"Epidemic dynamics" -> Trend
+"Model structure" -> "Single/multi-country"
+"Model structure" -> Model
+"Modeller strategy" -> "Model structure"
+"Modeller strategy" -> "Single/multi-country"
+"Observed incidence" -> "Accuracy score"
+"Observed incidence" <-> Trend
+"Single/multi-country" -> Prediction
+Country -> "Epidemic dynamics"
+Country -> "Observed incidence"
+Horizon -> Prediction
+Model -> Prediction
+Prediction -> "Accuracy score"
+Variant -> "Epidemic dynamics"
+}
+')
 
-cat("\n--- Implied conditional independencies ---\n")
-print(impliedConditionalIndependencies(dag))
+tidy_ggdag <- tidy_dagitty(dag)
 
-cat("\n--- All paths from model_structure to wis (open/closed unconditional) ---\n")
-print(paths(dag, from = "model_structure", to = "wis"))
-
-cat("\n--- Paths given the total-effect adjustment set ---\n")
-adj <- adjustmentSets(dag, effect = "total")
-if (length(adj) > 0) {
-  print(paths(dag, from = "model_structure", to = "wis", Z = adj[[1]]))
-}
+p_adj <- ggdag_adjustment_set(tidy_ggdag, node_size = 14) +
+  theme(legend.position = "bottom")
 
 p <- ggdag_status(dag, text = FALSE, use_labels = "name") +
-  theme_dag()
-
-dir.create("report/supplement", recursive = TRUE, showWarnings = FALSE)
-ggsave("report/supplement/dag.pdf", p, width = 8, height = 6)
-cat("\nDAG figure saved to report/supplement/dag.pdf\n")
+  theme_dag_blank()
diff --git a/report/_quarto/_background.qmd b/report/_quarto/_background.qmd