diff --git a/Cargo.lock b/Cargo.lock index 29919124a..8689366ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3414,6 +3414,7 @@ dependencies = [ "bytes", "futures", "openshell-core", + "serde", "tar", "tempfile", "tokio", @@ -3436,6 +3437,7 @@ dependencies = [ "openshell-core", "prost", "prost-types", + "serde", "serde_json", "thiserror 2.0.18", "tokio", @@ -3667,6 +3669,7 @@ dependencies = [ "tokio-rustls", "tokio-stream", "tokio-tungstenite 0.26.2", + "toml", "tonic", "tower 0.5.3", "tower-http 0.6.8", @@ -5129,6 +5132,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -6066,6 +6078,47 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap 2.14.0", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tonic" version = "0.12.3" @@ 
-7155,6 +7208,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + [[package]] name = "wiremock" version = "0.6.5" diff --git a/Cargo.toml b/Cargo.toml index 195544431..3fea379a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ nix = { version = "0.29", features = ["signal", "process", "user", "fs", "term"] serde = { version = "1", features = ["derive"] } serde_json = "1" serde_yml = "0.0.12" +toml = "0.8" apollo-parser = "0.8.5" # HTTP client diff --git a/architecture/gateway.md b/architecture/gateway.md index 68832d0cf..5840cb511 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -193,6 +193,43 @@ pre-created Secrets) disable the Helm hook via `pkiInitJob.enabled=false`. The chart also ships a `certManager.*` path that produces equivalent Secrets through cert-manager `Issuer`/`Certificate` resources. +## Configuration + +The gateway reads its configuration from three sources, merged in this +precedence (highest first): + +``` +CLI flag > OPENSHELL_* env var > TOML file > built-in default +``` + +The TOML file is opt-in via `--config <path>` / `OPENSHELL_GATEWAY_CONFIG`. +When unset, the gateway behaves exactly as before — CLI flags and env vars +drive every setting. See `docs/reference/gateway-config.mdx` for worked +per-driver examples and RFC 0003 for the full schema. + +`database_url` is env-only and rejected when present in the file +(`OPENSHELL_DB_URL` / `--db-url`). 
+ +### Driver inheritance + +`[openshell.gateway]` carries a small set of values (`default_image`, +`supervisor_image`, `guest_tls_ca/cert/key`, `client_tls_secret_name`, +`host_gateway_ip`, `enable_user_namespaces`) that are inherited into each +driver's `[openshell.drivers.<name>]` table when the driver-specific table +does not override them. The allowlist is per-driver so a gateway-wide +default cannot land in a driver that does not understand it (e.g. +`client_tls_secret_name` is K8s-only). + +`image_pull_policy` is intentionally **not** inheritable: Kubernetes uses +`Always | IfNotPresent | Never` (passed verbatim to the K8s API) while +Podman uses the lowercase enum `always | missing | never | newer`. No single +value means the same thing in both, so the key lives only under each +driver's own table. + +Driver-specific values that are not part of the inheritance allowlist +(e.g. K8s `namespace`, Podman `socket_path`, VM `vcpus`) only come from +the driver's own table. + ## Operational Constraints - Gateway TLS and client certificate distribution are deployment concerns owned diff --git a/crates/openshell-driver-docker/Cargo.toml b/crates/openshell-driver-docker/Cargo.toml index 79d4fb37d..e2c97532a 100644 --- a/crates/openshell-driver-docker/Cargo.toml +++ b/crates/openshell-driver-docker/Cargo.toml @@ -19,6 +19,7 @@ futures = { workspace = true } tokio-stream = { workspace = true } tracing = { workspace = true } bytes = { workspace = true } +serde = { workspace = true } bollard = { version = "0.20" } tar = "0.4" tempfile = "3" diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 6059596ab..785cc4cd3 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -127,7 +127,8 @@ pub trait SupervisorReadiness: Send + Sync + 'static { } /// Gateway-local configuration for the Docker compute driver. 
-#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct DockerComputeConfig { /// Optional override for the Linux `openshell-sandbox` binary mounted into containers. pub supervisor_bin: Option, @@ -151,6 +152,19 @@ pub struct DockerComputeConfig { pub network_name: String, } +impl Default for DockerComputeConfig { + fn default() -> Self { + Self { + supervisor_bin: None, + supervisor_image: None, + guest_tls_ca: None, + guest_tls_cert: None, + guest_tls_key: None, + network_name: DEFAULT_DOCKER_NETWORK_NAME.to_string(), + } + } +} + #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct DockerGuestTlsPaths { pub(crate) ca: PathBuf, diff --git a/crates/openshell-driver-kubernetes/Cargo.toml b/crates/openshell-driver-kubernetes/Cargo.toml index 5e247dc77..c222c9c31 100644 --- a/crates/openshell-driver-kubernetes/Cargo.toml +++ b/crates/openshell-driver-kubernetes/Cargo.toml @@ -26,6 +26,7 @@ tokio-stream = { workspace = true } kube = { workspace = true } kube-runtime = { workspace = true } k8s-openapi = { workspace = true } +serde = { workspace = true } serde_json = { workspace = true } clap = { workspace = true } tracing = { workspace = true } diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index e9baf0e5c..a6d133a6d 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -1,8 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use openshell_core::config::{ + DEFAULT_K8S_NAMESPACE, DEFAULT_SSH_HANDSHAKE_SKEW_SECS, DEFAULT_SUPERVISOR_IMAGE, +}; +use serde::{Deserialize, Serialize}; + /// How the supervisor binary is delivered into sandbox pods. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] pub enum SupervisorSideloadMethod { /// Mount the supervisor OCI image directly as a read-only volume /// (requires Kubernetes >= v1.33 with the `ImageVolume` feature gate, @@ -37,7 +43,8 @@ impl std::str::FromStr for SupervisorSideloadMethod { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct KubernetesComputeConfig { pub namespace: String, pub default_image: String, @@ -59,3 +66,27 @@ pub struct KubernetesComputeConfig { pub host_gateway_ip: String, pub enable_user_namespaces: bool, } + +impl Default for KubernetesComputeConfig { + fn default() -> Self { + Self { + namespace: DEFAULT_K8S_NAMESPACE.to_string(), + default_image: String::new(), + // Default empty so the gateway omits `imagePullPolicy` from pod + // specs and Kubernetes applies its own default (Always for `latest`, + // IfNotPresent otherwise). `DEFAULT_IMAGE_PULL_POLICY` ("missing") + // is Podman vocabulary and is not a valid Kubernetes value. 
+ image_pull_policy: String::new(), + supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(), + supervisor_image_pull_policy: String::new(), + supervisor_sideload_method: SupervisorSideloadMethod::default(), + grpc_endpoint: String::new(), + ssh_socket_path: "/run/openshell/ssh.sock".to_string(), + ssh_handshake_secret: String::new(), + ssh_handshake_skew_secs: DEFAULT_SSH_HANDSHAKE_SKEW_SECS, + client_tls_secret_name: String::new(), + host_gateway_ip: String::new(), + enable_user_namespaces: false, + } + } +} diff --git a/crates/openshell-driver-podman/src/config.rs b/crates/openshell-driver-podman/src/config.rs index d82b8d0b0..cfb58751e 100644 --- a/crates/openshell-driver-podman/src/config.rs +++ b/crates/openshell-driver-podman/src/config.rs @@ -61,7 +61,8 @@ impl FromStr for ImagePullPolicy { } } -#[derive(Clone)] +#[derive(Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct PodmanComputeConfig { /// Path to the Podman API Unix socket. /// Default: `$XDG_RUNTIME_DIR/podman/podman.sock` (Linux), diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 9cba99045..f03f6f3cc 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -74,6 +74,7 @@ bytes = { workspace = true } pin-project-lite = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +toml = { workspace = true } tokio-stream = { workspace = true } sqlx = { workspace = true } reqwest = { workspace = true } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 3cd9e8c79..8d65a40b9 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -3,7 +3,8 @@ //! Shared CLI entrypoint for the gateway binaries. 
-use clap::{ArgAction, Command, CommandFactory, FromArgMatches, Parser}; +use clap::parser::ValueSource; +use clap::{ArgAction, ArgMatches, Command, CommandFactory, FromArgMatches, Parser}; use miette::{IntoDiagnostic, Result}; use openshell_core::ComputeDriverKind; use openshell_core::config::{ @@ -17,6 +18,7 @@ use tracing_subscriber::EnvFilter; use crate::certgen; use crate::compute::{DockerComputeConfig, VmComputeConfig}; +use crate::config_file::{self, ConfigFile, GatewayFileSection}; use crate::{run_server, tracing_bus::TracingLogBus}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. @@ -44,6 +46,14 @@ enum Commands { #[derive(clap::Args, Debug)] #[allow(clippy::struct_excessive_bools)] struct RunArgs { + /// Path to a TOML configuration file (see RFC 0003). + /// + /// When set, gateway-wide settings and per-driver tables are read from + /// the file. Command-line flags and `OPENSHELL_*` environment variables + /// continue to take precedence over file values. + #[arg(long, env = "OPENSHELL_GATEWAY_CONFIG")] + config: Option, + /// IP address to bind the server, health, and metrics listeners to. 
#[arg(long, default_value = "127.0.0.1", env = "OPENSHELL_BIND_ADDRESS")] bind_address: IpAddr, @@ -329,15 +339,28 @@ pub async fn run_cli() -> Result<()> { .install_default() .map_err(|e| miette::miette!("failed to install rustls crypto provider: {e:?}"))?; - let cli = Cli::from_arg_matches(&command().get_matches()).expect("clap validated args"); + let matches = command().get_matches(); + let cli = Cli::from_arg_matches(&matches).expect("clap validated args"); match cli.command { Some(Commands::GenerateCerts(args)) => certgen::run(args).await, - None => Box::pin(run_from_args(cli.run)).await, + None => Box::pin(run_from_args(cli.run, matches)).await, } } -async fn run_from_args(args: RunArgs) -> Result<()> { +async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { + // Load TOML file when --config / OPENSHELL_GATEWAY_CONFIG is set. + // File values are applied below for any argument that is still at its + // built-in default — CLI flags and OPENSHELL_* env vars always win. + let file: Option = if let Some(path) = args.config.clone() { + Some(config_file::load(&path).map_err(|e| miette::miette!("{e}"))?) 
+ } else { + None + }; + if let Some(file) = file.as_ref() { + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + } + let tracing_log_bus = TracingLogBus::new(); tracing_log_bus.install_subscriber( EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), @@ -348,15 +371,15 @@ async fn run_from_args(args: RunArgs) -> Result<()> { let tls = if args.disable_tls { None } else { - let cert_path = args.tls_cert.ok_or_else(|| { + let cert_path = args.tls_cert.clone().ok_or_else(|| { miette::miette!( "--tls-cert is required when TLS is enabled (use --disable-tls to skip)" ) })?; - let key_path = args.tls_key.ok_or_else(|| { + let key_path = args.tls_key.clone().ok_or_else(|| { miette::miette!("--tls-key is required when TLS is enabled (use --disable-tls to skip)") })?; - let client_ca_path = args.tls_client_ca.ok_or_else(|| { + let client_ca_path = args.tls_client_ca.clone().ok_or_else(|| { miette::miette!( "--tls-client-ca is required when TLS is enabled (use --disable-tls to skip)" ) @@ -371,109 +394,121 @@ async fn run_from_args(args: RunArgs) -> Result<()> { let db_url = args .db_url + .clone() .ok_or_else(|| miette::miette!("--db-url is required (or set OPENSHELL_DB_URL)"))?; let mut config = openshell_core::Config::new(tls) .with_bind_address(bind) .with_log_level(&args.log_level); - if args.health_port != 0 { - if args.port == args.health_port { + // Listener addresses for the health and metrics endpoints. The file may + // pin a different interface than the main listener (e.g. health on + // 127.0.0.1 while gRPC binds 0.0.0.0); the full `SocketAddr` from the + // file is preserved unless CLI/env supplied an explicit `--health-port` / + // `--metrics-port`, in which case the port overrides the file value + // while the IP defaults to `args.bind_address`. 
+ let file_gateway = file.as_ref().map(|f| &f.openshell.gateway); + let health_bind = resolve_aux_listener( + args.bind_address, + args.health_port, + &matches, + "health_port", + || file_gateway.and_then(|g| g.health_bind_address), + ); + let metrics_bind = resolve_aux_listener( + args.bind_address, + args.metrics_port, + &matches, + "metrics_port", + || file_gateway.and_then(|g| g.metrics_bind_address), + ); + + if let Some(addr) = health_bind { + if args.port == addr.port() { return Err(miette::miette!( "--port and --health-port must be different (both set to {})", args.port )); } - let health_bind = SocketAddr::new(args.bind_address, args.health_port); - config = config.with_health_bind_address(health_bind); + config = config.with_health_bind_address(addr); } - if args.metrics_port != 0 { - if args.port == args.metrics_port { + if let Some(addr) = metrics_bind { + if args.port == addr.port() { return Err(miette::miette!( "--port and --metrics-port must be different (both set to {})", args.port )); } - if args.health_port != 0 && args.health_port == args.metrics_port { + if let Some(health) = health_bind + && health.port() == addr.port() + { return Err(miette::miette!( "--health-port and --metrics-port must be different (both set to {})", - args.health_port + health.port() )); } - let metrics_bind = SocketAddr::new(args.bind_address, args.metrics_port); - config = config.with_metrics_bind_address(metrics_bind); + config = config.with_metrics_bind_address(addr); } config = config .with_database_url(db_url) - .with_compute_drivers(args.drivers) - .with_sandbox_namespace(args.sandbox_namespace) - .with_ssh_gateway_host(args.ssh_gateway_host) + .with_compute_drivers(args.drivers.clone()) + .with_sandbox_namespace(args.sandbox_namespace.clone()) + .with_ssh_gateway_host(args.ssh_gateway_host.clone()) .with_ssh_gateway_port(args.ssh_gateway_port) .with_sandbox_ssh_port(args.sandbox_ssh_port) .with_ssh_handshake_skew_secs(args.ssh_handshake_skew_secs) - 
.with_server_sans(args.server_sans) + .with_server_sans(args.server_sans.clone()) .with_loopback_service_http(args.enable_loopback_service_http); - if let Some(image) = args.sandbox_image { + if let Some(ttl) = file + .as_ref() + .and_then(|f| f.openshell.gateway.ssh_session_ttl_secs) + { + config = config.with_ssh_session_ttl_secs(ttl); + } + + if let Some(image) = args.sandbox_image.clone() { config = config.with_sandbox_image(image); } - if let Some(policy) = args.sandbox_image_pull_policy { + if let Some(policy) = args.sandbox_image_pull_policy.clone() { config = config.with_sandbox_image_pull_policy(policy); } - if let Some(endpoint) = args.grpc_endpoint { + if let Some(endpoint) = args.grpc_endpoint.clone() { config = config.with_grpc_endpoint(endpoint); } - if let Some(secret) = args.ssh_handshake_secret { + if let Some(secret) = args.ssh_handshake_secret.clone() { config = config.with_ssh_handshake_secret(secret); } - if let Some(name) = args.client_tls_secret_name { + if let Some(name) = args.client_tls_secret_name.clone() { config = config.with_client_tls_secret_name(name); } - if let Some(ip) = args.host_gateway_ip { + if let Some(ip) = args.host_gateway_ip.clone() { config = config.with_host_gateway_ip(ip); } - if let Some(issuer) = args.oidc_issuer { + if let Some(issuer) = args.oidc_issuer.clone() { config = config.with_oidc(openshell_core::OidcConfig { issuer, - audience: args.oidc_audience, + audience: args.oidc_audience.clone(), jwks_ttl_secs: args.oidc_jwks_ttl, - roles_claim: args.oidc_roles_claim, - admin_role: args.oidc_admin_role, - user_role: args.oidc_user_role, - scopes_claim: args.oidc_scopes_claim, + roles_claim: args.oidc_roles_claim.clone(), + admin_role: args.oidc_admin_role.clone(), + user_role: args.oidc_user_role.clone(), + scopes_claim: args.oidc_scopes_claim.clone(), }); } config.enable_user_namespaces = args.enable_user_namespaces; - let vm_config = VmComputeConfig { - state_dir: args.vm_driver_state_dir, - driver_dir: 
args.driver_dir, - default_image: config.sandbox_image.clone(), - krun_log_level: args.vm_krun_log_level, - vcpus: args.vm_vcpus, - mem_mib: args.vm_mem_mib, - guest_tls_ca: args.vm_tls_ca, - guest_tls_cert: args.vm_tls_cert, - guest_tls_key: args.vm_tls_key, - }; - - let docker_config = DockerComputeConfig { - supervisor_bin: args.docker_supervisor_bin, - supervisor_image: args.docker_supervisor_image, - guest_tls_ca: args.docker_tls_ca, - guest_tls_cert: args.docker_tls_cert, - guest_tls_key: args.docker_tls_key, - network_name: args.docker_network_name, - }; + let vm_config = build_vm_config(&args, &matches, &config, file.as_ref())?; + let docker_config = build_docker_config(&args, &matches, file.as_ref())?; if args.disable_tls { info!("TLS disabled — listening on plaintext HTTP"); @@ -483,15 +518,296 @@ async fn run_from_args(args: RunArgs) -> Result<()> { info!(bind = %config.bind_address, "Starting OpenShell server"); - run_server(config, vm_config, docker_config, tracing_log_bus) - .await - .into_diagnostic() + Box::pin(run_server( + config, + vm_config, + docker_config, + file, + tracing_log_bus, + )) + .await + .into_diagnostic() } fn parse_compute_driver(value: &str) -> std::result::Result { value.parse() } +/// Returns `true` when an argument's value came from clap's built-in default +/// (or was never supplied at all). When the predicate is `true`, the loader +/// is free to replace the value with one read from the TOML config file. +fn arg_defaulted(matches: &ArgMatches, id: &str) -> bool { + matches!( + matches.value_source(id), + None | Some(ValueSource::DefaultValue) + ) +} + +/// Resolve the bind address for an auxiliary listener (health / metrics). +/// +/// The precedence is: +/// 1. CLI flag or `OPENSHELL_*` env var explicitly set on the corresponding +/// port argument → `bind_address:port` (port from CLI, IP from the main +/// listener interface). +/// 2. 
Full `SocketAddr` from `[openshell.gateway].{health,metrics}_bind_address` +/// → used as-is (this is how operators pin a loopback-only health port +/// on a gateway whose gRPC listener is bound publicly). +/// 3. Otherwise the listener is disabled (returns `None`). +fn resolve_aux_listener( + bind_ip: IpAddr, + port_arg: u16, + matches: &ArgMatches, + port_id: &str, + file_addr: impl FnOnce() -> Option, +) -> Option { + if !arg_defaulted(matches, port_id) { + if port_arg == 0 { + return None; + } + return Some(SocketAddr::new(bind_ip, port_arg)); + } + if let Some(addr) = file_addr() { + return Some(addr); + } + if port_arg == 0 { + None + } else { + Some(SocketAddr::new(bind_ip, port_arg)) + } +} + +/// Apply gateway-wide values from `[openshell.gateway]` onto `RunArgs` for +/// every argument that is still sourced from clap's built-in default. +/// +/// The function intentionally does not touch `database_url` or +/// `ssh_handshake_secret` — those secrets are env-only and the loader +/// already rejected them when they appear in the file. +fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: &ArgMatches) { + if let Some(addr) = file.bind_address { + if arg_defaulted(matches, "bind_address") { + args.bind_address = addr.ip(); + } + if arg_defaulted(matches, "port") { + args.port = addr.port(); + } + } + // Note: file's full health_bind_address / metrics_bind_address are + // consumed in `run_from_args`'s listener-resolution block so the IP + // half of the SocketAddr is preserved. Copying only the port here + // would silently relocate a loopback-intended listener onto the + // public bind address. 
+ if let Some(level) = &file.log_level + && arg_defaulted(matches, "log_level") + { + args.log_level.clone_from(level); + } + if let Some(drivers) = &file.compute_drivers + && arg_defaulted(matches, "drivers") + { + args.drivers.clone_from(drivers); + } + if let Some(ns) = &file.sandbox_namespace + && arg_defaulted(matches, "sandbox_namespace") + { + args.sandbox_namespace.clone_from(ns); + } + if let Some(port) = file.sandbox_ssh_port + && arg_defaulted(matches, "sandbox_ssh_port") + { + args.sandbox_ssh_port = port; + } + if let Some(host) = &file.ssh_gateway_host + && arg_defaulted(matches, "ssh_gateway_host") + { + args.ssh_gateway_host.clone_from(host); + } + if let Some(port) = file.ssh_gateway_port + && arg_defaulted(matches, "ssh_gateway_port") + { + args.ssh_gateway_port = port; + } + if let Some(skew) = file.ssh_handshake_skew_secs + && arg_defaulted(matches, "ssh_handshake_skew_secs") + { + args.ssh_handshake_skew_secs = skew; + } + if let Some(image) = &file.default_image + && args.sandbox_image.is_none() + && arg_defaulted(matches, "sandbox_image") + { + args.sandbox_image = Some(image.clone()); + } + if let Some(secret) = &file.client_tls_secret_name + && args.client_tls_secret_name.is_none() + && arg_defaulted(matches, "client_tls_secret_name") + { + args.client_tls_secret_name = Some(secret.clone()); + } + if let Some(ip) = &file.host_gateway_ip + && args.host_gateway_ip.is_none() + && arg_defaulted(matches, "host_gateway_ip") + { + args.host_gateway_ip = Some(ip.clone()); + } + if let Some(enabled) = file.enable_user_namespaces + && arg_defaulted(matches, "enable_user_namespaces") + { + args.enable_user_namespaces = enabled; + } + if let Some(sans) = &file.server_sans + && args.server_sans.is_empty() + && arg_defaulted(matches, "server_sans") + { + args.server_sans.clone_from(sans); + } + if let Some(enabled) = file.enable_loopback_service_http + && arg_defaulted(matches, "enable_loopback_service_http") + { + args.enable_loopback_service_http = 
enabled; + } + if let Some(disabled) = file.disable_tls + && arg_defaulted(matches, "disable_tls") + { + args.disable_tls = disabled; + } + // TLS gateway listener fields + if let Some(tls) = &file.tls { + if args.tls_cert.is_none() && arg_defaulted(matches, "tls_cert") { + args.tls_cert = Some(tls.cert_path.clone()); + } + if args.tls_key.is_none() && arg_defaulted(matches, "tls_key") { + args.tls_key = Some(tls.key_path.clone()); + } + if args.tls_client_ca.is_none() && arg_defaulted(matches, "tls_client_ca") { + args.tls_client_ca = Some(tls.client_ca_path.clone()); + } + if tls.allow_unauthenticated && arg_defaulted(matches, "disable_gateway_auth") { + args.disable_gateway_auth = true; + } + } + // OIDC fields + if let Some(oidc) = &file.oidc { + if args.oidc_issuer.is_none() && arg_defaulted(matches, "oidc_issuer") { + args.oidc_issuer = Some(oidc.issuer.clone()); + } + if arg_defaulted(matches, "oidc_audience") { + args.oidc_audience.clone_from(&oidc.audience); + } + if arg_defaulted(matches, "oidc_jwks_ttl") { + args.oidc_jwks_ttl = oidc.jwks_ttl_secs; + } + if arg_defaulted(matches, "oidc_roles_claim") { + args.oidc_roles_claim.clone_from(&oidc.roles_claim); + } + if arg_defaulted(matches, "oidc_admin_role") { + args.oidc_admin_role.clone_from(&oidc.admin_role); + } + if arg_defaulted(matches, "oidc_user_role") { + args.oidc_user_role.clone_from(&oidc.user_role); + } + if arg_defaulted(matches, "oidc_scopes_claim") { + args.oidc_scopes_claim.clone_from(&oidc.scopes_claim); + } + } +} + +/// Build [`VmComputeConfig`] by overlaying CLI args on top of the +/// `[openshell.drivers.vm]` table inherited from `[openshell.gateway]`. 
+fn build_vm_config( + args: &RunArgs, + matches: &ArgMatches, + config: &openshell_core::Config, + file: Option<&ConfigFile>, +) -> Result { + let mut cfg = if let Some(file) = file { + let merged = config_file::driver_table( + ComputeDriverKind::Vm, + &file.openshell.gateway, + file.openshell.drivers.get("vm"), + ); + merged + .try_into::() + .map_err(|e| miette::miette!("invalid [openshell.drivers.vm] table: {e}"))? + } else { + VmComputeConfig::default() + }; + + // CLI/env overrides — and `state_dir` is also pulled from RunArgs when the + // file did not set it, so the gateway always has a working directory. + if !arg_defaulted(matches, "vm_driver_state_dir") || cfg.state_dir.as_os_str().is_empty() { + cfg.state_dir.clone_from(&args.vm_driver_state_dir); + } + if !arg_defaulted(matches, "driver_dir") || cfg.driver_dir.is_none() { + cfg.driver_dir.clone_from(&args.driver_dir); + } + if !arg_defaulted(matches, "vm_krun_log_level") { + cfg.krun_log_level = args.vm_krun_log_level; + } + if !arg_defaulted(matches, "vm_vcpus") { + cfg.vcpus = args.vm_vcpus; + } + if !arg_defaulted(matches, "vm_mem_mib") { + cfg.mem_mib = args.vm_mem_mib; + } + if let Some(p) = args.vm_tls_ca.clone() { + cfg.guest_tls_ca = Some(p); + } + if let Some(p) = args.vm_tls_cert.clone() { + cfg.guest_tls_cert = Some(p); + } + if let Some(p) = args.vm_tls_key.clone() { + cfg.guest_tls_key = Some(p); + } + // Fall through: image inherited from gateway-wide `sandbox_image` when + // the merged table did not supply `default_image`. + if cfg.default_image.is_empty() { + cfg.default_image.clone_from(&config.sandbox_image); + } + Ok(cfg) +} + +/// Build [`DockerComputeConfig`] using the same inheritance pattern as +/// [`build_vm_config`]. 
+fn build_docker_config( + args: &RunArgs, + matches: &ArgMatches, + file: Option<&ConfigFile>, +) -> Result { + let mut cfg = if let Some(file) = file { + let merged = config_file::driver_table( + ComputeDriverKind::Docker, + &file.openshell.gateway, + file.openshell.drivers.get("docker"), + ); + merged + .try_into::() + .map_err(|e| miette::miette!("invalid [openshell.drivers.docker] table: {e}"))? + } else { + DockerComputeConfig::default() + }; + + if args.docker_supervisor_bin.is_some() { + cfg.supervisor_bin.clone_from(&args.docker_supervisor_bin); + } + if args.docker_supervisor_image.is_some() { + cfg.supervisor_image + .clone_from(&args.docker_supervisor_image); + } + if args.docker_tls_ca.is_some() { + cfg.guest_tls_ca.clone_from(&args.docker_tls_ca); + } + if args.docker_tls_cert.is_some() { + cfg.guest_tls_cert.clone_from(&args.docker_tls_cert); + } + if args.docker_tls_key.is_some() { + cfg.guest_tls_key.clone_from(&args.docker_tls_key); + } + if !arg_defaulted(matches, "docker_network_name") { + cfg.network_name.clone_from(&args.docker_network_name); + } + Ok(cfg) +} + #[cfg(test)] mod tests { use super::{Cli, command}; @@ -716,4 +1032,308 @@ mod tests { assert!(cli.command.is_none()); assert!(cli.run.db_url.is_none()); } + + // ── Config-file merge tests ────────────────────────────────────────── + // + // `merge_file_into_args` is the bridge between `config_file::ConfigFile` + // and `RunArgs`. These cases lock in the precedence rule: + // + // CLI flag > OPENSHELL_* env var > TOML file > built-in default + // + // by exercising each combination on representative gateway fields. 
+ + use super::{ConfigFile, merge_file_into_args}; + use clap::FromArgMatches; + + fn parse_with_args(argv: &[&str]) -> (super::RunArgs, clap::ArgMatches) { + let matches = command().try_get_matches_from(argv).expect("parses"); + let cli = Cli::from_arg_matches(&matches).expect("from arg matches"); + (cli.run, matches) + } + + fn config_file_from_toml(toml: &str) -> ConfigFile { + toml::from_str(toml).expect("valid TOML in test fixture") + } + + #[test] + fn file_value_applies_when_cli_uses_default() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_BIND_ADDRESS"); + let _g2 = EnvVarGuard::remove("OPENSHELL_SERVER_PORT"); + let _g3 = EnvVarGuard::remove("OPENSHELL_LOG_LEVEL"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +bind_address = "0.0.0.0:9090" +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.bind_address, IpAddr::V4(Ipv4Addr::UNSPECIFIED)); + assert_eq!(args.port, 9090); + assert_eq!(args.log_level, "debug"); + } + + #[test] + fn cli_flag_overrides_file_value() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_BIND_ADDRESS"); + let _g2 = EnvVarGuard::remove("OPENSHELL_LOG_LEVEL"); + + let (mut args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--log-level", + "warn", + ]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.log_level, "warn", "CLI flag must win over file"); + } + + #[test] + fn env_var_overrides_file_value() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = 
EnvVarGuard::set("OPENSHELL_LOG_LEVEL", "trace"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.log_level, "trace", "env var must win over file"); + } + + #[test] + fn file_oidc_block_populates_oidc_args() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_OIDC_ISSUER"); + let _g2 = EnvVarGuard::remove("OPENSHELL_OIDC_AUDIENCE"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway.oidc] +issuer = "https://idp.example.com" +audience = "openshell-cli" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.oidc_issuer.as_deref(), Some("https://idp.example.com")); + assert_eq!(args.oidc_audience, "openshell-cli"); + } + + #[test] + fn aux_listener_preserves_file_ip_against_public_bind() { + use std::net::SocketAddr; + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_HEALTH_PORT"); + + let (_args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file_addr: SocketAddr = "127.0.0.1:8081".parse().unwrap(); + let resolved = super::resolve_aux_listener( + IpAddr::V4(Ipv4Addr::UNSPECIFIED), + 0, + &matches, + "health_port", + || Some(file_addr), + ); + assert_eq!( + resolved, + Some(file_addr), + "TOML health_bind_address 127.0.0.1:8081 must not be relocated to 0.0.0.0:8081" + ); + } + + #[test] + fn aux_listener_cli_port_overrides_file_addr() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = 
EnvVarGuard::remove("OPENSHELL_HEALTH_PORT"); + + let (_args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--health-port", + "9999", + ]); + let file_addr: std::net::SocketAddr = "127.0.0.1:8081".parse().unwrap(); + let resolved = super::resolve_aux_listener( + IpAddr::V4(Ipv4Addr::UNSPECIFIED), + 9999, + &matches, + "health_port", + || Some(file_addr), + ); + assert_eq!( + resolved, + Some("0.0.0.0:9999".parse().unwrap()), + "CLI flag must win over file value" + ); + } + + #[test] + fn file_disable_tls_applies() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_DISABLE_TLS"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r" +[openshell.gateway] +disable_tls = true +", + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert!(args.disable_tls); + } + + #[test] + fn file_ssh_session_ttl_secs_is_parsed() { + // The loader must accept and surface the documented key. The actual + // wiring into `Config` happens in `run_from_args` against the parsed + // file (not via `merge_file_into_args`, since there is no matching + // `RunArgs` field), so this test pins the schema half. 
+ let file = config_file_from_toml( + r" +[openshell.gateway] +ssh_session_ttl_secs = 1234 +", + ); + assert_eq!(file.openshell.gateway.ssh_session_ttl_secs, Some(1234)); + } + + #[test] + fn file_populates_service_routing_fields() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_SERVER_SAN"); + let _g2 = EnvVarGuard::remove("OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +server_sans = ["gateway.local", "*.dev.openshell.localhost"] +enable_loopback_service_http = false +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!( + args.server_sans, + vec![ + "gateway.local".to_string(), + "*.dev.openshell.localhost".to_string() + ] + ); + assert!(!args.enable_loopback_service_http); + } + + #[test] + fn env_var_overrides_file_loopback_service_http() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::set("OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP", "true"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r" +[openshell.gateway] +enable_loopback_service_http = false +", + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert!( + args.enable_loopback_service_http, + "env var must win over file" + ); + } + + #[test] + fn driver_inherits_shared_image_from_gateway_section() { + // [openshell.gateway].default_image inherits into the K8s driver + // table when the driver-specific table does not set it. 
+ let file = config_file_from_toml( + r#" +[openshell.gateway] +default_image = "ghcr.io/nvidia/openshell/sandbox:1.0" + +[openshell.drivers.kubernetes] +namespace = "agents" +"#, + ); + let merged = crate::config_file::driver_table( + super::ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + let parsed = merged + .try_into::() + .expect("merged table deserializes"); + assert_eq!(parsed.default_image, "ghcr.io/nvidia/openshell/sandbox:1.0"); + assert_eq!(parsed.namespace, "agents"); + } + + #[test] + fn driver_specific_value_overrides_gateway_inheritance() { + let file = config_file_from_toml( + r#" +[openshell.gateway] +default_image = "gateway-default:1.0" + +[openshell.drivers.kubernetes] +default_image = "k8s-specific:1.0" +"#, + ); + let merged = crate::config_file::driver_table( + super::ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + let parsed = merged + .try_into::() + .expect("deserializes"); + assert_eq!(parsed.default_image, "k8s-specific:1.0"); + } } diff --git a/crates/openshell-server/src/compute/vm.rs b/crates/openshell-server/src/compute/vm.rs index 1e62d4942..3a67f381b 100644 --- a/crates/openshell-server/src/compute/vm.rs +++ b/crates/openshell-server/src/compute/vm.rs @@ -60,7 +60,8 @@ const COMPUTE_DRIVER_SOCKET_RUN_DIR: &str = "run"; const COMPUTE_DRIVER_SOCKET_NAME: &str = "compute-driver.sock"; /// Configuration for launching and talking to the VM compute driver. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct VmComputeConfig { /// Working directory for VM driver sandbox state. 
pub state_dir: PathBuf, diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs new file mode 100644 index 000000000..8e5477683 --- /dev/null +++ b/crates/openshell-server/src/config_file.rs @@ -0,0 +1,517 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! TOML configuration file loader for the gateway. +//! +//! See `rfc/0003-gateway-configuration/README.md` for the file format. This +//! module parses the file into [`ConfigFile`], rejects fields that must be +//! supplied via env/CLI (database URL, SSH handshake secret), and provides +//! [`driver_table`] which overlays shared `[openshell.gateway]` defaults onto +//! a `[openshell.drivers.]` table so each driver crate's +//! `Deserialize` impl sees a fully-populated table. +//! +//! The merge precedence at the gateway level is: +//! ```text +//! CLI flag > OPENSHELL_* env var > TOML file > built-in default +//! ``` +//! Per-field application of file values happens in [`crate::cli`], which uses +//! clap's `ArgMatches::value_source` to detect arguments that fell back to +//! their default and are therefore eligible for replacement by file values. + +use std::collections::BTreeMap; +use std::net::SocketAddr; +use std::path::{Path, PathBuf}; + +use openshell_core::config::ComputeDriverKind; +use openshell_core::{OidcConfig, TlsConfig}; +use serde::{Deserialize, Serialize}; + +/// Latest schema version this build understands. +pub const SCHEMA_VERSION: u32 = 1; + +/// Root of the gateway TOML config file. +/// +/// The file is rooted at `[openshell]` to reserve room for future components +/// (CLI, sandbox, router) to share a single config file without key +/// collisions. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct ConfigFile { + #[serde(default)] + pub openshell: OpenShellRoot, +} + +/// `[openshell]` table. 
+#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct OpenShellRoot { + /// Reserved for future schema migrations. Versions greater than + /// [`SCHEMA_VERSION`] are rejected at load time. + #[serde(default)] + pub version: Option, + + #[serde(default)] + pub gateway: GatewayFileSection, + + /// `[openshell.drivers.]` tables — passed verbatim to each driver + /// crate's `Deserialize` impl after the gateway-side inheritance merge. + /// Stored as raw [`toml::Value`] so each driver can evolve its schema + /// independently of this crate. + #[serde(default)] + pub drivers: BTreeMap, +} + +/// `[openshell.gateway]` section. +/// +/// All fields are `Option` so the loader can tell whether a key was set +/// in the file (`Some`) or not (`None` — value is taken from CLI/env/default). +/// +/// The fields under "Shared driver defaults" are inherited into +/// `[openshell.drivers.]` tables per [`inheritable_keys`]. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct GatewayFileSection { + // ── Listeners ──────────────────────────────────────────────────────── + #[serde(default)] + pub bind_address: Option, + #[serde(default)] + pub health_bind_address: Option, + #[serde(default)] + pub metrics_bind_address: Option, + + // ── Logging ────────────────────────────────────────────────────────── + #[serde(default)] + pub log_level: Option, + + // ── Drivers ────────────────────────────────────────────────────────── + #[serde(default)] + pub compute_drivers: Option>, + + // ── Sandbox / SSH ──────────────────────────────────────────────────── + #[serde(default)] + pub sandbox_namespace: Option, + #[serde(default)] + pub sandbox_ssh_port: Option, + #[serde(default)] + pub ssh_gateway_host: Option, + #[serde(default)] + pub ssh_gateway_port: Option, + #[serde(default)] + pub ssh_handshake_skew_secs: Option, + #[serde(default)] + pub ssh_session_ttl_secs: Option, + + // ── Service 
routing ────────────────────────────────────────────────── + /// Subject Alternative Names configured on the gateway server certificate. + /// Wildcard DNS SANs also enable sandbox service URLs under that domain. + #[serde(default)] + pub server_sans: Option>, + /// Enable plaintext HTTP routing for loopback sandbox service URLs. + #[serde(default)] + pub enable_loopback_service_http: Option, + + // ── Shared driver defaults (inherited into [openshell.drivers.]) ─ + #[serde(default)] + pub default_image: Option, + #[serde(default)] + pub supervisor_image: Option, + #[serde(default)] + pub client_tls_secret_name: Option, + #[serde(default)] + pub host_gateway_ip: Option, + #[serde(default)] + pub enable_user_namespaces: Option, + #[serde(default)] + pub guest_tls_ca: Option, + #[serde(default)] + pub guest_tls_cert: Option, + #[serde(default)] + pub guest_tls_key: Option, + + // ── TLS toggle ─────────────────────────────────────────────────────── + /// When `true`, the gateway listens on plaintext HTTP and ignores any + /// `[openshell.gateway.tls]` table. Mirrors `--disable-tls`. + #[serde(default)] + pub disable_tls: Option, + + // ── Nested tables ──────────────────────────────────────────────────── + #[serde(default)] + pub tls: Option, + #[serde(default)] + pub oidc: Option, + + // ── Disallowed-in-file fields ──────────────────────────────────────── + // + // Captured so we can produce a friendly "set this via env/CLI instead" + // error rather than a generic "unknown field" message. Validated and + // rejected in [`load`]. 
+ #[serde(default)] + pub database_url: Option, + #[serde(default)] + pub ssh_handshake_secret: Option, +} + +#[derive(Debug, thiserror::Error)] +pub enum ConfigFileError { + #[error("failed to read gateway config file '{}': {source}", path.display())] + Io { + path: PathBuf, + #[source] + source: std::io::Error, + }, + #[error("failed to parse gateway config file '{}': {source}", path.display())] + Parse { + path: PathBuf, + #[source] + source: toml::de::Error, + }, + #[error( + "unsupported gateway config version {version}; this build only supports version {SCHEMA_VERSION}" + )] + UnsupportedVersion { version: u32 }, + #[error( + "`{field}` is not allowed in the gateway config file — set the {env} env var or pass {cli} on the command line" + )] + SecretInFile { + field: &'static str, + env: &'static str, + cli: &'static str, + }, +} + +/// Load and validate a TOML config file. +/// +/// Returns `Ok(ConfigFile::default())` for an empty file (the gateway then +/// falls back entirely to CLI/env/built-in defaults). 
+pub fn load(path: &Path) -> Result { + let contents = std::fs::read_to_string(path).map_err(|source| ConfigFileError::Io { + path: path.to_path_buf(), + source, + })?; + if contents.trim().is_empty() { + return Ok(ConfigFile::default()); + } + let file: ConfigFile = toml::from_str(&contents).map_err(|source| ConfigFileError::Parse { + path: path.to_path_buf(), + source, + })?; + + if let Some(version) = file.openshell.version + && version > SCHEMA_VERSION + { + return Err(ConfigFileError::UnsupportedVersion { version }); + } + + if file.openshell.gateway.database_url.is_some() { + return Err(ConfigFileError::SecretInFile { + field: "database_url", + env: "OPENSHELL_DB_URL", + cli: "--db-url", + }); + } + if file.openshell.gateway.ssh_handshake_secret.is_some() { + return Err(ConfigFileError::SecretInFile { + field: "ssh_handshake_secret", + env: "OPENSHELL_SSH_HANDSHAKE_SECRET", + cli: "--ssh-handshake-secret", + }); + } + + Ok(file) +} + +/// Build the merged TOML table for `driver` by overlaying inheritable +/// `[openshell.gateway]` defaults onto `[openshell.drivers.]`. +/// +/// The returned [`toml::Value`] is a Table ready to feed into the driver's +/// `Deserialize` impl — keys present in `raw` win over the gateway defaults. +/// Keys outside [`inheritable_keys`] for this driver are never copied from +/// the gateway section, which keeps each driver's `deny_unknown_fields` +/// invariant intact. +pub fn driver_table( + driver: ComputeDriverKind, + gateway: &GatewayFileSection, + raw: Option<&toml::Value>, +) -> toml::Value { + let mut merged = match raw { + Some(toml::Value::Table(table)) => table.clone(), + _ => toml::Table::new(), + }; + + for key in inheritable_keys(driver) { + if merged.contains_key(*key) { + continue; + } + if let Some(value) = gateway_inherited_value(gateway, key) { + merged.insert((*key).to_string(), value); + } + } + + toml::Value::Table(merged) +} + +/// Inheritance allowlist (the Q4 "high-overlap set"). 
Each driver opts in +/// to a specific subset so a gateway-wide default does not accidentally land +/// in a driver table that does not understand the field. +fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { + match driver { + ComputeDriverKind::Kubernetes => &[ + "default_image", + "supervisor_image", + "client_tls_secret_name", + "host_gateway_ip", + "ssh_handshake_skew_secs", + "enable_user_namespaces", + ], + ComputeDriverKind::Docker => &[ + "supervisor_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + ComputeDriverKind::Podman => &[ + "default_image", + "supervisor_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + "ssh_handshake_skew_secs", + ], + ComputeDriverKind::Vm => &[ + "default_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + } +} + +fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option { + match key { + "default_image" => g.default_image.as_deref().map(string_value), + "supervisor_image" => g.supervisor_image.as_deref().map(string_value), + "client_tls_secret_name" => g.client_tls_secret_name.as_deref().map(string_value), + "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), + "ssh_handshake_skew_secs" => g.ssh_handshake_skew_secs.and_then(skew_value), + "enable_user_namespaces" => g.enable_user_namespaces.map(toml::Value::Boolean), + "guest_tls_ca" => g.guest_tls_ca.as_deref().map(path_value), + "guest_tls_cert" => g.guest_tls_cert.as_deref().map(path_value), + "guest_tls_key" => g.guest_tls_key.as_deref().map(path_value), + _ => None, + } +} + +fn string_value(s: &str) -> toml::Value { + toml::Value::String(s.to_owned()) +} + +fn path_value(p: &Path) -> toml::Value { + toml::Value::String(p.display().to_string()) +} + +fn skew_value(n: u64) -> Option { + i64::try_from(n).ok().map(toml::Value::Integer) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + fn write_tmp(contents: &str) -> 
tempfile::NamedTempFile { + let mut tmp = tempfile::Builder::new() + .suffix(".toml") + .tempfile() + .expect("tempfile"); + tmp.write_all(contents.as_bytes()).expect("write"); + tmp + } + + #[test] + fn empty_file_yields_default_config() { + let tmp = write_tmp(""); + let file = load(tmp.path()).expect("empty file parses"); + assert!(file.openshell.version.is_none()); + assert!(file.openshell.gateway.bind_address.is_none()); + assert!(file.openshell.drivers.is_empty()); + } + + #[test] + fn parses_full_example() { + let toml = r#" +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +log_level = "info" +compute_drivers = ["kubernetes"] +sandbox_namespace = "agents" +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +client_tls_secret_name = "openshell-sandbox-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +"#; + let tmp = write_tmp(toml); + let file = load(tmp.path()).expect("valid file parses"); + let gw = &file.openshell.gateway; + assert_eq!(gw.log_level.as_deref(), Some("info")); + assert_eq!( + gw.default_image.as_deref(), + Some("ghcr.io/nvidia/openshell/sandbox:latest") + ); + assert!(gw.tls.is_some()); + assert!(gw.oidc.is_some()); + assert!(file.openshell.drivers.contains_key("kubernetes")); + } + + #[test] + fn rejects_database_url_in_file() { + let toml = r#" +[openshell.gateway] +database_url = "sqlite::memory:" +"#; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("database_url must be rejected"); + assert!(matches!( + err, + 
ConfigFileError::SecretInFile { + field: "database_url", + .. + } + )); + } + + #[test] + fn rejects_ssh_handshake_secret_in_file() { + let toml = r#" +[openshell.gateway] +ssh_handshake_secret = "leaked" +"#; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("ssh_handshake_secret must be rejected"); + assert!(matches!( + err, + ConfigFileError::SecretInFile { + field: "ssh_handshake_secret", + .. + } + )); + } + + #[test] + fn rejects_unknown_gateway_field() { + let toml = r" +[openshell.gateway] +nonsense = true +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("unknown field must be rejected"); + assert!(matches!(err, ConfigFileError::Parse { .. })); + } + + #[test] + fn rejects_unsupported_version() { + let toml = r" +[openshell] +version = 2 +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("version > 1 must be rejected"); + assert!(matches!( + err, + ConfigFileError::UnsupportedVersion { version: 2 } + )); + } + + #[test] + fn driver_table_inherits_gateway_defaults() { + let gateway = GatewayFileSection { + default_image: Some("ghcr.io/nvidia/openshell/sandbox:0.9".to_string()), + supervisor_image: Some("ghcr.io/nvidia/openshell/supervisor:0.9".to_string()), + ..Default::default() + }; + let raw = toml::toml! 
{ + namespace = "agents" + }; + let merged = driver_table( + ComputeDriverKind::Kubernetes, + &gateway, + Some(&toml::Value::Table(raw)), + ); + let table = merged.as_table().expect("table"); + assert_eq!( + table.get("namespace").and_then(|v| v.as_str()), + Some("agents") + ); + assert_eq!( + table.get("default_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/sandbox:0.9") + ); + assert_eq!( + table.get("supervisor_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/supervisor:0.9") + ); + } + + #[test] + fn driver_table_specific_value_overrides_gateway_default() { + let gateway = GatewayFileSection { + default_image: Some("gateway-default".to_string()), + ..Default::default() + }; + let raw = toml::toml! { + default_image = "driver-specific" + }; + let merged = driver_table( + ComputeDriverKind::Podman, + &gateway, + Some(&toml::Value::Table(raw)), + ); + assert_eq!( + merged + .as_table() + .unwrap() + .get("default_image") + .and_then(|v| v.as_str()), + Some("driver-specific") + ); + } + + #[test] + fn driver_table_does_not_leak_keys_outside_allowlist() { + // `client_tls_secret_name` is K8s-only; Docker must not receive it + // even when set at gateway scope. + let gateway = GatewayFileSection { + client_tls_secret_name: Some("openshell-sandbox-tls".to_string()), + ..Default::default() + }; + let merged = driver_table(ComputeDriverKind::Docker, &gateway, None); + assert!( + !merged + .as_table() + .unwrap() + .contains_key("client_tls_secret_name") + ); + } + + #[test] + fn missing_path_is_io_error() { + let err = load(Path::new("/nonexistent/openshell-gateway.toml")) + .expect_err("missing file must be io error"); + assert!(matches!(err, ConfigFileError::Io { .. 
})); + } +} diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 93ccdc9dc..e9f02da24 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -23,6 +23,7 @@ mod auth; pub mod certgen; pub mod cli; mod compute; +pub mod config_file; mod grpc; mod http; mod inference; @@ -152,6 +153,7 @@ pub async fn run_server( config: Config, vm_config: VmComputeConfig, docker_config: DockerComputeConfig, + config_file: Option, tracing_log_bus: TracingLogBus, ) -> Result<()> { let database_url = config.database_url.trim(); @@ -194,6 +196,7 @@ pub async fn run_server( &config, &vm_config, &docker_config, + config_file.as_ref(), store.clone(), sandbox_index.clone(), sandbox_watch_bus.clone(), @@ -551,6 +554,7 @@ async fn build_compute_runtime( config: &Config, vm_config: &VmComputeConfig, docker_config: &DockerComputeConfig, + file: Option<&config_file::ConfigFile>, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, @@ -562,37 +566,54 @@ async fn build_compute_runtime( match driver { ComputeDriverKind::Kubernetes => { - let supervisor_image = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()); - let supervisor_image_pull_policy = - std::env::var("OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_default(); + let mut k8s = kubernetes_config_from_file(file)?; + // Env overrides file for fields not represented in Config. 
+ if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") + && !v.is_empty() + { + k8s.supervisor_image = v; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY") + && !v.is_empty() + { + k8s.supervisor_image_pull_policy = v; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_SIDELOAD_METHOD") + && !v.is_empty() + && let Ok(parsed) = v.parse() + { + k8s.supervisor_sideload_method = parsed; + } + // Shared fields are sourced from Config, which already merged + // file + CLI/env at startup. + k8s.namespace.clone_from(&config.sandbox_namespace); + k8s.default_image.clone_from(&config.sandbox_image); + // Only let the gateway-wide CLI/env value overwrite the per-driver + // file value when it was actually set — otherwise the empty CLI + // default would silently clobber `image_pull_policy` configured + // under `[openshell.drivers.kubernetes]`. + if !config.sandbox_image_pull_policy.is_empty() { + k8s.image_pull_policy + .clone_from(&config.sandbox_image_pull_policy); + } + // Same rationale as `image_pull_policy`: only let the gateway-wide + // CLI/env value win when it was actually set, otherwise the empty + // CLI default would clobber `grpc_endpoint` from + // `[openshell.drivers.kubernetes]`. 
+ if !config.grpc_endpoint.is_empty() { + k8s.grpc_endpoint.clone_from(&config.grpc_endpoint); + } + k8s.ssh_socket_path + .clone_from(&config.sandbox_ssh_socket_path); + k8s.ssh_handshake_secret + .clone_from(&config.ssh_handshake_secret); + k8s.ssh_handshake_skew_secs = config.ssh_handshake_skew_secs; + k8s.client_tls_secret_name + .clone_from(&config.client_tls_secret_name); + k8s.host_gateway_ip.clone_from(&config.host_gateway_ip); + k8s.enable_user_namespaces = config.enable_user_namespaces; ComputeRuntime::new_kubernetes( - KubernetesComputeConfig { - namespace: config.sandbox_namespace.clone(), - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.clone(), - supervisor_image, - supervisor_image_pull_policy, - supervisor_sideload_method: std::env::var( - "OPENSHELL_SUPERVISOR_SIDELOAD_METHOD", - ) - .ok() - .filter(|s| !s.is_empty()) - .and_then(|s| s.parse().ok()) - .unwrap_or_default(), - grpc_endpoint: config.grpc_endpoint.clone(), - ssh_socket_path: config.sandbox_ssh_socket_path.clone(), - ssh_handshake_secret: config.ssh_handshake_secret.clone(), - ssh_handshake_skew_secs: config.ssh_handshake_skew_secs, - client_tls_secret_name: config.client_tls_secret_name.clone(), - host_gateway_ip: config.host_gateway_ip.clone(), - enable_user_namespaces: config.enable_user_namespaces, - }, + k8s, store, sandbox_index, sandbox_watch_bus, @@ -628,63 +649,71 @@ async fn build_compute_runtime( .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) } ComputeDriverKind::Podman => { - let socket_path = std::env::var("OPENSHELL_PODMAN_SOCKET") - .ok() - .filter(|s| !s.is_empty()) - .map_or_else( - openshell_driver_podman::PodmanComputeConfig::default_socket_path, - std::path::PathBuf::from, - ); - - let network_name = std::env::var("OPENSHELL_NETWORK_NAME") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_NETWORK_NAME.to_string()); - - let stop_timeout_secs: 
u32 = std::env::var("OPENSHELL_STOP_TIMEOUT") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS); - - let supervisor_image = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()); - - // TLS client cert paths for sandbox mTLS. When all three are - // set, the Podman driver bind-mounts them into sandbox - // containers and switches the endpoint to https://. - let podman_tls_ca = std::env::var("OPENSHELL_PODMAN_TLS_CA") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); - let podman_tls_cert = std::env::var("OPENSHELL_PODMAN_TLS_CERT") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); - let podman_tls_key = std::env::var("OPENSHELL_PODMAN_TLS_KEY") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); + let mut podman = podman_config_from_file(file)?; + // Env overrides file for fields not represented in Config. 
+ if let Ok(v) = std::env::var("OPENSHELL_PODMAN_SOCKET") + && !v.is_empty() + { + podman.socket_path = std::path::PathBuf::from(v); + } + if let Ok(v) = std::env::var("OPENSHELL_NETWORK_NAME") + && !v.is_empty() + { + podman.network_name = v; + } + if let Ok(v) = std::env::var("OPENSHELL_STOP_TIMEOUT") + && let Ok(parsed) = v.parse() + { + podman.stop_timeout_secs = parsed; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") + && !v.is_empty() + { + podman.supervisor_image = v; + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_CA") + && !v.is_empty() + { + podman.guest_tls_ca = Some(std::path::PathBuf::from(v)); + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_CERT") + && !v.is_empty() + { + podman.guest_tls_cert = Some(std::path::PathBuf::from(v)); + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_KEY") + && !v.is_empty() + { + podman.guest_tls_key = Some(std::path::PathBuf::from(v)); + } + // Shared fields are sourced from Config (which already merged + // file + CLI/env at startup). + podman.default_image.clone_from(&config.sandbox_image); + // The CLI/env `image_pull_policy` is K8s-shaped + // (e.g. `IfNotPresent`) and won't parse into Podman's lowercase + // enum. Only apply it when the operator set a Podman-shaped value + // explicitly; otherwise keep whatever `[openshell.drivers.podman]` + // (or the struct default) provided. 
+ if !config.sandbox_image_pull_policy.is_empty() + && let Ok(policy) = config.sandbox_image_pull_policy.parse() + { + podman.image_pull_policy = policy; + } + if !config.grpc_endpoint.is_empty() { + podman.grpc_endpoint.clone_from(&config.grpc_endpoint); + } + podman.gateway_port = config.bind_address.port(); + podman + .sandbox_ssh_socket_path + .clone_from(&config.sandbox_ssh_socket_path); + podman.ssh_port = config.sandbox_ssh_port; + podman + .ssh_handshake_secret + .clone_from(&config.ssh_handshake_secret); + podman.ssh_handshake_skew_secs = config.ssh_handshake_skew_secs; ComputeRuntime::new_podman( - openshell_driver_podman::PodmanComputeConfig { - socket_path, - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.parse().unwrap_or_default(), - grpc_endpoint: config.grpc_endpoint.clone(), - gateway_port: config.bind_address.port(), - sandbox_ssh_socket_path: config.sandbox_ssh_socket_path.clone(), - network_name, - ssh_port: config.sandbox_ssh_port, - ssh_handshake_secret: config.ssh_handshake_secret.clone(), - ssh_handshake_skew_secs: config.ssh_handshake_skew_secs, - stop_timeout_secs, - supervisor_image, - guest_tls_ca: podman_tls_ca, - guest_tls_cert: podman_tls_cert, - guest_tls_key: podman_tls_key, - }, + podman, store, sandbox_index, sandbox_watch_bus, @@ -697,6 +726,43 @@ async fn build_compute_runtime( } } +/// Build a [`KubernetesComputeConfig`] from the file's +/// `[openshell.drivers.kubernetes]` table merged with inheritable +/// `[openshell.gateway]` defaults. Falls back to the driver's `Default` +/// when no file is present. 
+fn kubernetes_config_from_file( + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Ok(KubernetesComputeConfig::default()); + }; + let merged = config_file::driver_table( + ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + merged + .try_into() + .map_err(|e| Error::config(format!("invalid [openshell.drivers.kubernetes] table: {e}"))) +} + +/// Same pattern as [`kubernetes_config_from_file`] but for Podman. +fn podman_config_from_file( + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Ok(openshell_driver_podman::PodmanComputeConfig::default()); + }; + let merged = config_file::driver_table( + ComputeDriverKind::Podman, + &file.openshell.gateway, + file.openshell.drivers.get("podman"), + ); + merged + .try_into() + .map_err(|e| Error::config(format!("invalid [openshell.drivers.podman] table: {e}"))) +} + fn configured_compute_driver(config: &Config) -> Result { match config.compute_drivers.as_slice() { [] => openshell_core::config::detect_driver().ok_or_else(|| { diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml new file mode 100644 index 000000000..5937b8a7d --- /dev/null +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -0,0 +1,103 @@ +{{/* +ConfigMap holding the gateway TOML config file (RFC 0003). + +The gateway reads `/etc/openshell/gateway.toml` (mounted from this ConfigMap) +at startup. CLI flags and OPENSHELL_* env vars on the StatefulSet container +still override anything in this file. + +Two values are intentionally NOT rendered here: + - server.dbUrl → passed via --db-url in the StatefulSet args + - sshHandshake secret → injected as OPENSHELL_SSH_HANDSHAKE_SECRET env var + from a Kubernetes Secret reference. +*/}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "openshell.fullname" . 
}}-config + labels: + {{- include "openshell.labels" . | nindent 4 }} +data: + gateway.toml: | + [openshell] + version = 1 + + [openshell.gateway] + bind_address = "0.0.0.0:{{ .Values.service.port }}" + {{- if .Values.service.healthPort }} + health_bind_address = "0.0.0.0:{{ .Values.service.healthPort }}" + {{- end }} + {{- if .Values.service.metricsPort }} + metrics_bind_address = "0.0.0.0:{{ .Values.service.metricsPort }}" + {{- end }} + log_level = {{ .Values.server.logLevel | quote }} + sandbox_namespace = {{ include "openshell.sandboxNamespace" . | quote }} + default_image = {{ .Values.server.sandboxImage | quote }} + supervisor_image = {{ include "openshell.supervisorImage" . | quote }} + {{- if .Values.server.sshGatewayHost }} + ssh_gateway_host = {{ .Values.server.sshGatewayHost | quote }} + {{- end }} + {{- if .Values.server.sshGatewayPort }} + ssh_gateway_port = {{ .Values.server.sshGatewayPort }} + {{- end }} + {{- if .Values.server.hostGatewayIP }} + host_gateway_ip = {{ .Values.server.hostGatewayIP | quote }} + {{- end }} + {{- if .Values.server.enableUserNamespaces }} + enable_user_namespaces = true + {{- end }} + {{- if .Values.server.disableTls }} + disable_tls = true + {{- else }} + client_tls_secret_name = {{ .Values.server.tls.clientTlsSecretName | quote }} + {{- end }} + enable_loopback_service_http = {{ .Values.server.enableLoopbackServiceHttp }} + {{- $sans := list -}} + {{- if and .Values.certManager.enabled .Values.certManager.serverDnsNames }} + {{- $sans = .Values.certManager.serverDnsNames }} + {{- else if and .Values.pkiInitJob.enabled .Values.pkiInitJob.serverDnsNames }} + {{- $sans = .Values.pkiInitJob.serverDnsNames }} + {{- end }} + {{- if $sans }} + server_sans = [{{- range $i, $san := $sans }}{{ if $i }}, {{ end }}{{ $san | quote }}{{- end }}] + {{- end }} + + {{- if not .Values.server.disableTls }} + + [openshell.gateway.tls] + cert_path = "/etc/openshell-tls/server/tls.crt" + key_path = "/etc/openshell-tls/server/tls.key" + 
client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" + {{- if .Values.server.disableGatewayAuth }} + allow_unauthenticated = true + {{- end }} + {{- end }} + + {{- if .Values.server.oidc.issuer }} + + [openshell.gateway.oidc] + issuer = {{ .Values.server.oidc.issuer | quote }} + audience = {{ .Values.server.oidc.audience | quote }} + jwks_ttl_secs = {{ .Values.server.oidc.jwksTtl }} + {{- if .Values.server.oidc.rolesClaim }} + roles_claim = {{ .Values.server.oidc.rolesClaim | quote }} + {{- end }} + {{- if .Values.server.oidc.adminRole }} + admin_role = {{ .Values.server.oidc.adminRole | quote }} + {{- end }} + {{- if .Values.server.oidc.userRole }} + user_role = {{ .Values.server.oidc.userRole | quote }} + {{- end }} + {{- if .Values.server.oidc.scopesClaim }} + scopes_claim = {{ .Values.server.oidc.scopesClaim | quote }} + {{- end }} + {{- end }} + + [openshell.drivers.kubernetes] + grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} + supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} + {{- if .Values.server.sandboxImagePullPolicy }} + image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} + {{- end }} + {{- if .Values.supervisor.image.pullPolicy }} + supervisor_image_pull_policy = {{ .Values.supervisor.image.pullPolicy | quote }} + {{- end }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 69140a70c..ccb1775c7 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -15,10 +15,15 @@ spec: {{- include "openshell.selectorLabels" . 
| nindent 6 }} template: metadata: - {{- with .Values.podAnnotations }} annotations: + # Roll the StatefulSet when the rendered gateway TOML changes — the + # gateway only reads /etc/openshell/gateway.toml at startup, so + # without this annotation a `helm upgrade` that only mutates the + # ConfigMap would leave pods running with stale config. + checksum/gateway-config: {{ include (print $.Template.BasePath "/gateway-config.yaml") . | sha256sum }} + {{- with .Values.podAnnotations }} {{- toYaml . | nindent 8 }} - {{- end }} + {{- end }} labels: {{- include "openshell.labels" . | nindent 8 }} {{- with .Values.podLabels }} @@ -47,113 +52,24 @@ spec: image: {{ include "openshell.image" . | quote }} imagePullPolicy: {{ .Values.image.pullPolicy }} args: - - --bind-address - - "0.0.0.0" - - --port - - {{ .Values.service.port | quote }} - - --health-port - - {{ .Values.service.healthPort | quote }} - {{- if .Values.service.metricsPort }} - - --metrics-port - - {{ .Values.service.metricsPort | quote }} - {{- end }} - - --log-level - - {{ .Values.server.logLevel }} + - --config + - /etc/openshell/gateway.toml - --db-url - {{ .Values.server.dbUrl | quote }} env: - - name: OPENSHELL_SANDBOX_NAMESPACE - value: {{ include "openshell.sandboxNamespace" . | quote }} - - name: OPENSHELL_SANDBOX_IMAGE - value: {{ .Values.server.sandboxImage | quote }} - {{- if .Values.server.sandboxImagePullPolicy }} - - name: OPENSHELL_SANDBOX_IMAGE_PULL_POLICY - value: {{ .Values.server.sandboxImagePullPolicy | quote }} - {{- end }} - - name: OPENSHELL_SUPERVISOR_IMAGE - value: {{ include "openshell.supervisorImage" . | quote }} - {{- if .Values.supervisor.image.pullPolicy }} - - name: OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY - value: {{ .Values.supervisor.image.pullPolicy | quote }} - {{- end }} - - name: OPENSHELL_SUPERVISOR_SIDELOAD_METHOD - value: {{ include "openshell.supervisorSideloadMethod" . | quote }} - - name: OPENSHELL_GRPC_ENDPOINT - value: {{ include "openshell.grpcEndpoint" . 
| quote }} - {{- if .Values.server.sshGatewayHost }} - - name: OPENSHELL_SSH_GATEWAY_HOST - value: {{ .Values.server.sshGatewayHost | quote }} - {{- end }} - {{- if .Values.server.sshGatewayPort }} - - name: OPENSHELL_SSH_GATEWAY_PORT - value: {{ .Values.server.sshGatewayPort | quote }} - {{- end }} - {{- if .Values.server.hostGatewayIP }} - - name: OPENSHELL_HOST_GATEWAY_IP - value: {{ .Values.server.hostGatewayIP | quote }} - {{- end }} - {{- if .Values.server.enableUserNamespaces }} - - name: OPENSHELL_ENABLE_USER_NAMESPACES - value: "true" - {{- end }} - {{- if and .Values.certManager.enabled .Values.certManager.serverDnsNames }} - - name: OPENSHELL_SERVER_SAN - value: {{ join "," .Values.certManager.serverDnsNames | quote }} - {{- else if and .Values.pkiInitJob.enabled .Values.pkiInitJob.serverDnsNames }} - - name: OPENSHELL_SERVER_SAN - value: {{ join "," .Values.pkiInitJob.serverDnsNames | quote }} - {{- end }} - - name: OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP - value: {{ .Values.server.enableLoopbackServiceHttp | quote }} + # Secrets are env-only. Everything else lives in the ConfigMap- + # backed TOML file mounted at /etc/openshell/gateway.toml. 
- name: OPENSHELL_SSH_HANDSHAKE_SECRET valueFrom: secretKeyRef: name: {{ .Values.server.sshHandshakeSecretName | quote }} key: secret - {{- if .Values.server.disableTls }} - - name: OPENSHELL_DISABLE_TLS - value: "true" - {{- else }} - - name: OPENSHELL_TLS_CERT - value: /etc/openshell-tls/server/tls.crt - - name: OPENSHELL_TLS_KEY - value: /etc/openshell-tls/server/tls.key - - name: OPENSHELL_TLS_CLIENT_CA - value: /etc/openshell-tls/client-ca/ca.crt - - name: OPENSHELL_CLIENT_TLS_SECRET_NAME - value: {{ .Values.server.tls.clientTlsSecretName | quote }} - {{- if .Values.server.disableGatewayAuth }} - - name: OPENSHELL_DISABLE_GATEWAY_AUTH - value: "true" - {{- end }} - {{- end }} - {{- if .Values.server.oidc.issuer }} - - name: OPENSHELL_OIDC_ISSUER - value: {{ .Values.server.oidc.issuer | quote }} - - name: OPENSHELL_OIDC_AUDIENCE - value: {{ .Values.server.oidc.audience | quote }} - - name: OPENSHELL_OIDC_JWKS_TTL - value: {{ .Values.server.oidc.jwksTtl | quote }} - {{- if .Values.server.oidc.rolesClaim }} - - name: OPENSHELL_OIDC_ROLES_CLAIM - value: {{ .Values.server.oidc.rolesClaim | quote }} - {{- end }} - {{- if .Values.server.oidc.adminRole }} - - name: OPENSHELL_OIDC_ADMIN_ROLE - value: {{ .Values.server.oidc.adminRole | quote }} - {{- end }} - {{- if .Values.server.oidc.userRole }} - - name: OPENSHELL_OIDC_USER_ROLE - value: {{ .Values.server.oidc.userRole | quote }} - {{- end }} - {{- if .Values.server.oidc.scopesClaim }} - - name: OPENSHELL_OIDC_SCOPES_CLAIM - value: {{ .Values.server.oidc.scopesClaim | quote }} - {{- end }} - {{- end }} volumeMounts: - name: openshell-data mountPath: /var/openshell + - name: gateway-config + mountPath: /etc/openshell + readOnly: true {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -200,6 +116,9 @@ spec: resources: {{- toYaml .Values.resources | nindent 12 }} volumes: + - name: gateway-config + configMap: + name: {{ include "openshell.fullname" . 
}}-config {{- if not .Values.server.disableTls }} - name: tls-cert secret: diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml new file mode 100644 index 000000000..2d464b8e6 --- /dev/null +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +suite: gateway TOML config shape +templates: + - templates/gateway-config.yaml + - templates/statefulset.yaml +release: + name: openshell + namespace: my-namespace + +tests: + # Regression: a ConfigMap-only mutation in `helm upgrade` + # must roll the StatefulSet, otherwise pods keep running with stale config. + - it: annotates the StatefulSet pod template with a ConfigMap checksum + template: templates/statefulset.yaml + asserts: + - exists: + path: spec.template.metadata.annotations["checksum/gateway-config"] + + + # Regression: grpc_endpoint MUST live in the + # Kubernetes driver table, not in [openshell.gateway]. The gateway-side + # schema has `deny_unknown_fields` and no `grpc_endpoint` field, so writing + # it at gateway scope makes `config_file::load` reject the default install. 
+ - it: renders grpc_endpoint under [openshell.drivers.kubernetes], not [openshell.gateway] + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?grpc_endpoint' + - notMatchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.gateway\][^\[]*?grpc_endpoint' + + - it: omits server_sans when no DNS SANs are configured + template: templates/gateway-config.yaml + asserts: + - notMatchRegex: + path: data["gateway.toml"] + pattern: 'server_sans\s*=' + + - it: emits disable_tls=true and omits the [openshell.gateway.tls] section when disableTls is set + set: + server.disableTls: true + certManager.enabled: false + pkiInitJob.enabled: false + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: 'disable_tls\s*=\s*true' + - notMatchRegex: + path: data["gateway.toml"] + pattern: '\[openshell\.gateway\.tls\]' + + - it: renders server_sans from certManager.serverDnsNames + set: + certManager.enabled: true + certManager.serverDnsNames: + - openshell + - "*.dev.openshell.localhost" + pkiInitJob.enabled: false + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: 'server_sans\s*=\s*\["openshell", "\*\.dev\.openshell\.localhost"\]' diff --git a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml index a128cd440..2d3461c6f 100644 --- a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml +++ b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml @@ -3,32 +3,28 @@ suite: sandboxNamespace defaulting templates: - - templates/statefulset.yaml + - templates/gateway-config.yaml - templates/networkpolicy.yaml release: name: openshell namespace: my-namespace tests: - - it: defaults OPENSHELL_SANDBOX_NAMESPACE to release namespace - template: templates/statefulset.yaml + - it: defaults sandbox_namespace to 
release namespace in the TOML config + template: templates/gateway-config.yaml asserts: - - contains: - path: spec.template.spec.containers[0].env - content: - name: OPENSHELL_SANDBOX_NAMESPACE - value: "my-namespace" + - matchRegex: + path: data["gateway.toml"] + pattern: 'sandbox_namespace\s*=\s*"my-namespace"' - it: uses explicit sandboxNamespace when set - template: templates/statefulset.yaml + template: templates/gateway-config.yaml set: server.sandboxNamespace: other-ns asserts: - - contains: - path: spec.template.spec.containers[0].env - content: - name: OPENSHELL_SANDBOX_NAMESPACE - value: "other-ns" + - matchRegex: + path: data["gateway.toml"] + pattern: 'sandbox_namespace\s*=\s*"other-ns"' - it: defaults NetworkPolicy namespace to release namespace template: templates/networkpolicy.yaml diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx new file mode 100644 index 000000000..5b5e5736b --- /dev/null +++ b/docs/reference/gateway-config.mdx @@ -0,0 +1,212 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Gateway Configuration File" +sidebar-title: "Gateway Config" +description: "Reference for the OpenShell gateway TOML configuration file (RFC 0003)." +keywords: "Generative AI, Cybersecurity, AI Agents, Sandboxing, Gateway, Configuration, TOML, Reference" +position: 5 +--- + +The OpenShell gateway reads its configuration from a TOML file when `--config` or `OPENSHELL_GATEWAY_CONFIG` is set. CLI flags and `OPENSHELL_*` environment variables always override the file. See [RFC 0003](https://github.com/NVIDIA/OpenShell/blob/main/rfc/0003-gateway-configuration/README.md) for the full schema. + +## Source Precedence + +```text +CLI flag > OPENSHELL_* env var > TOML file > built-in default +``` + +`database_url` and `ssh_handshake_secret` cannot be set in the file: supply `database_url` through `--db-url` or `OPENSHELL_DB_URL`, and `ssh_handshake_secret` through `OPENSHELL_SSH_HANDSHAKE_SECRET`. The loader rejects them when they appear in the file. 
+ +## Layout + +The file is rooted at `[openshell]`. Gateway-wide settings live under `[openshell.gateway]`. Each compute driver owns its own `[openshell.drivers.<name>]` table. Shared keys set at gateway scope are inherited into driver tables when not overridden. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +# ... gateway-wide settings ... + +[openshell.gateway.tls] +# ... gateway listener TLS ... + +[openshell.gateway.oidc] +# ... JWT bearer auth ... + +[openshell.drivers.kubernetes] +# ... driver-specific settings ... +``` + +## Full Example + +A complete gateway configuration covering the gateway-wide, TLS, and OIDC sections; driver tables are shown under Per-Driver Examples. Trim to the fields you need. + +```toml +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" + +log_level = "info" + +# When empty the gateway auto-detects (Kubernetes -> Podman -> Docker). VM is +# never auto-detected and requires an explicit entry here. +compute_drivers = ["kubernetes"] + +sandbox_namespace = "openshell" +sandbox_ssh_port = 2222 +ssh_gateway_host = "127.0.0.1" +ssh_gateway_port = 8080 + +# Subject Alternative Names baked into the gateway server certificate. +# Wildcard DNS SANs (e.g. "*.dev.openshell.localhost") also enable sandbox +# service URLs under that domain. +server_sans = ["openshell", "*.dev.openshell.localhost"] +# Allow plaintext HTTP routing for loopback sandbox service URLs. +enable_loopback_service_http = true + +# Shared driver defaults — inherited into [openshell.drivers.<name>] tables +# when the driver-specific table does not override them. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +client_tls_secret_name = "openshell-client-tls" + +# Gateway listener TLS (distinct from the per-driver guest_tls_*). 
+[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" +allow_unauthenticated = false + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" +jwks_ttl_secs = 3600 +roles_claim = "realm_access.roles" +admin_role = "openshell-admin" +user_role = "openshell-user" +``` + +`image_pull_policy` is intentionally not a shared gateway key. Kubernetes uses `Always | IfNotPresent | Never` while Podman uses `always | missing | never | newer`. Set it inside the relevant driver table. + +## Per-Driver Examples + +### Kubernetes + +The gateway runs as a Pod and creates sandbox Pods in another namespace. mTLS material for sandboxes is delivered via a Kubernetes Secret rather than host-side file paths. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" +log_level = "info" +compute_drivers = ["kubernetes"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +client_tls_secret_name = "openshell-client-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell-tls/server/tls.crt" +key_path = "/etc/openshell-tls/server/tls.key" +client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +image_pull_policy = "IfNotPresent" +# Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" +# on older clusters or where the ImageVolume feature gate is off. +supervisor_sideload_method = "image-volume" +``` + +### Docker + +Sandboxes run as containers on a local bridge network. 
The supervisor binary is bind-mounted from the host (no in-cluster image pull required); guest mTLS material is supplied as host paths. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["docker"] + +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.docker] +network_name = "openshell-docker" +# Skip the image-pull-and-extract step by pointing at a locally built binary. +supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" +``` + +### Podman + +Sandboxes run as Podman containers on a user-mode bridge network. The supervisor image is mounted read-only via Podman's `type=image` mount; guest mTLS material is supplied as host paths. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["podman"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.podman] +# Rootless socket path. For root Podman use /run/podman/podman.sock. +socket_path = "/run/user/1000/podman/podman.sock" +network_name = "openshell" +stop_timeout_secs = 10 +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer +``` + +### MicroVM + +Each sandbox runs inside its own libkrun microVM managed by the standalone `openshell-driver-vm` subprocess. Use this driver when you want stronger isolation than container namespaces alone. 
+ +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +# VM is never auto-detected; an explicit entry here is required. +compute_drivers = ["vm"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +guest_tls_ca = "/var/lib/openshell/guest-tls/ca.pem" +guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" +guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" + +[openshell.drivers.vm] +state_dir = "/var/lib/openshell/vm" +# Where the gateway looks for the openshell-driver-vm subprocess binary. +driver_dir = "/usr/local/libexec/openshell" +vcpus = 2 +mem_mib = 2048 +krun_log_level = 1 +``` diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 9c6a16d94..b3c47ef7d 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -32,6 +32,8 @@ Common gateway options: | `--sandbox-image ` | `OPENSHELL_SANDBOX_IMAGE` | Set the default sandbox image used when a sandbox create request does not specify one. | | `--grpc-endpoint ` | `OPENSHELL_GRPC_ENDPOINT` | Set the gateway callback endpoint that sandbox workloads use to connect back to OpenShell. | +For the full file-based configuration surface — including gateway-wide settings, TLS, OIDC, and per-driver tables — see the [Gateway Configuration File](./gateway-config) reference. Each driver section below documents the CLI flags and environment variables; their TOML equivalents live in the `[openshell.drivers.<name>]` tables on that page. + ## Docker Driver [Docker](https://www.docker.com/get-started/)-backed sandboxes run as containers on the gateway host. Use Docker for local development, single-machine gateways, and hosts that already use Docker Desktop or Docker Engine. 
diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 70ca09f7b..da311920b 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -37,7 +37,7 @@ Three sources are merged at startup, in descending priority: CLI flags > OPENSHELL_* environment variables > TOML config file > built-in defaults ``` -The TOML file is optional. If neither `--config` nor `OPENSHELL_CONFIG` is set, the gateway behaves exactly as before. Any field present in the file is overridden by a CLI flag or matching environment variable. +The TOML file is optional. If neither `--config` nor `OPENSHELL_GATEWAY_CONFIG` is set, the gateway behaves exactly as before. Any field present in the file is overridden by a CLI flag or matching environment variable. ### Loading the file @@ -86,12 +86,21 @@ ssh_handshake_skew_secs = 300 ssh_session_ttl_secs = 86400 ssh_gateway_host = "127.0.0.1" ssh_gateway_port = 8080 -ssh_connect_path = "/connect/ssh" sandbox_ssh_port = 2222 +# Service routing — wildcard DNS SANs in `server_sans` also enable sandbox +# service URLs under that domain. `enable_loopback_service_http` toggles +# plaintext HTTP routing for loopback service URLs. +server_sans = ["openshell", "*.dev.openshell.localhost"] +enable_loopback_service_http = true + # ────────────────────────────────────────────────────────────────────────────── # TLS / mTLS — when omitted, the gateway listens plaintext (sets --disable-tls) # ────────────────────────────────────────────────────────────────────────────── +# Mirrors --disable-tls / OPENSHELL_DISABLE_TLS. When true, the gateway +# ignores the [openshell.gateway.tls] table below. 
+disable_tls = false + [openshell.gateway.tls] cert_path = "/etc/openshell/certs/gateway.pem" key_path = "/etc/openshell/certs/gateway-key.pem" @@ -137,7 +146,7 @@ guest_tls_key = "/etc/openshell/certs/client-key.pem" [openshell.drivers.podman] socket_path = "/run/podman/podman.sock" default_image = "ghcr.io/nvidia/openshell/sandbox:latest" -image_pull_policy = "IfNotPresent" +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" network_name = "openshell" stop_timeout_secs = 10 @@ -261,11 +270,11 @@ The chart owners can migrate one section at a time: `OPENSHELL_*` env vars and t No part of this RFC has shipped yet. The work breaks down as: 1. **Add a config-file loader to `openshell-server`** — define a `GatewayConfigFile` struct that mirrors the schema above, parse it with `serde` + `toml`, and merge it into `openshell_core::Config` plus the per-driver structs in `compute/`. -2. **Wire the merge into `cli.rs`** — add `--config` / `OPENSHELL_CONFIG`, gate each existing flag's "apply from file" path on clap `ValueSource::DefaultValue`, and run cross-field validation after the merge. +2. **Wire the merge into `cli.rs`** — add `--config` / `OPENSHELL_GATEWAY_CONFIG`, gate each existing flag's "apply from file" path on clap `ValueSource::DefaultValue`, and run cross-field validation after the merge. 3. **Per-driver deserialization** — give each driver crate (`openshell-driver-{kubernetes,docker,podman,vm}`) a `from_toml` (or `serde::Deserialize`) entry point so the gateway can hand each driver its own table. 4. **Test coverage** — file parsing, env-overrides-file, CLI-overrides-env, partial TLS error, port-collision error, unknown-field rejection, missing driver table fallback. 5. **Helm chart migration** — add `gateway.config` value tree, render the `ConfigMap`, mount it, switch the gateway container to `--config`. 
Keep the `OPENSHELL_*` env names available as opt-in overrides for secrets. -6. **Example file** — ship `examples/gateway/gateway.example.toml` and link it from the docs reference. +6. **Example file** — ship the per-driver examples on the published docs reference at `docs/reference/gateway-config.mdx`. 7. **Architecture doc update** — reflect the new config sources and precedence in `architecture/gateway.md`. ## Risks