From 5be2d8c485ab979af214f794bdf5a7e992687a92 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 18:19:08 -0700 Subject: [PATCH 1/7] feat(gateway): add TOML configuration file (RFC 0003) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces an opt-in --config / OPENSHELL_GATEWAY_CONFIG flag that loads a TOML file with gateway-wide settings and per-driver tables. Source precedence is CLI > env > file > built-in default, implemented via clap's ValueSource so existing flags and env vars keep their priority. Driver crates (kubernetes, docker, podman, vm) now derive Deserialize on their config structs. SupervisorSideloadMethod gains Deserialize with kebab-case rename. A per-driver inheritance allowlist on the loader side overlays [openshell.gateway] shared defaults (default_image, supervisor_image, image_pull_policy, guest_tls_*, ssh_handshake_skew_secs, client_tls_secret_name, host_gateway_ip, enable_user_namespaces) onto each [openshell.drivers.] table before deserialization. The Helm chart renders a new gateway-config ConfigMap and mounts it at /etc/openshell/gateway.toml. The migrated OPENSHELL_* env entries are dropped from the StatefulSet — only the Secret-backed OPENSHELL_SSH_HANDSHAKE_SECRET remains. database_url stays on --db-url. Adds examples/gateway/gateway.example.toml and updates architecture/gateway.md with the source precedence and inheritance rules. --- Cargo.lock | 62 ++ Cargo.toml | 1 + architecture/gateway.md | 36 ++ crates/openshell-driver-docker/Cargo.toml | 1 + crates/openshell-driver-docker/src/lib.rs | 16 +- crates/openshell-driver-kubernetes/Cargo.toml | 1 + .../openshell-driver-kubernetes/src/config.rs | 32 +- crates/openshell-driver-podman/src/config.rs | 3 +- crates/openshell-server/Cargo.toml | 1 + crates/openshell-server/src/cli.rs | 567 ++++++++++++++++-- crates/openshell-server/src/compute/vm.rs | 3 +- crates/openshell-server/src/config_file.rs | 518 ++++++++++++++++ crates/openshell-server/src/lib.rs | 213 ++++--- .../openshell/templates/gateway-config.yaml | 101 ++++ .../helm/openshell/templates/statefulset.yaml | 106 +--- .../tests/sandbox_namespace_test.yaml | 24 +- examples/gateway/gateway.example.toml | 85 +++ rfc/0003-gateway-configuration/README.md | 7 +- 18 files changed, 1530 insertions(+), 247 deletions(-) create mode 100644 crates/openshell-server/src/config_file.rs create mode 100644 deploy/helm/openshell/templates/gateway-config.yaml create mode 100644 examples/gateway/gateway.example.toml diff --git a/Cargo.lock b/Cargo.lock index 29919124a..8689366ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3414,6 +3414,7 @@ dependencies = [ "bytes", "futures", "openshell-core", + "serde", "tar", "tempfile", "tokio", @@ -3436,6 +3437,7 @@ dependencies = [ "openshell-core", "prost", "prost-types", + "serde", "serde_json", "thiserror 2.0.18", "tokio", @@ -3667,6 +3669,7 @@ dependencies = [ "tokio-rustls", "tokio-stream", "tokio-tungstenite 0.26.2", + "toml", "tonic", "tower 0.5.3", "tower-http 0.6.8", @@ -5129,6 +5132,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -6066,6 +6078,47 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap 2.14.0", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tonic" version = "0.12.3" @@ -7155,6 +7208,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + [[package]] name = "wiremock" version = "0.6.5" diff --git a/Cargo.toml b/Cargo.toml index 195544431..3fea379a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ nix = { version = "0.29", features = ["signal", "process", "user", "fs", "term"] serde = { version = "1", features = ["derive"] } serde_json = "1" serde_yml = "0.0.12" +toml = "0.8" apollo-parser = "0.8.5" # HTTP client diff --git a/architecture/gateway.md b/architecture/gateway.md index 68832d0cf..64f4d9721 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -193,6 +193,42 @@ pre-created Secrets) disable the Helm hook via `pkiInitJob.enabled=false`. The chart also ships a `certManager.*` path that produces equivalent Secrets through cert-manager `Issuer`/`Certificate` resources. +## Configuration + +The gateway reads its configuration from three sources, merged in this +precedence (highest first): + +``` +CLI flag > OPENSHELL_* env var > TOML file > built-in default +``` + +The TOML file is opt-in via `--config ` / `OPENSHELL_GATEWAY_CONFIG`. +When unset, the gateway behaves exactly as before — CLI flags and env vars +drive every setting. See `examples/gateway/gateway.example.toml` for a +worked example and RFC 0003 for the full schema. + +Two secrets are env-only and rejected when present in the file: + +| Field | Source | +|------------------------|--------------------------------------------------| +| `database_url` | `OPENSHELL_DB_URL` / `--db-url` | +| `ssh_handshake_secret` | `OPENSHELL_SSH_HANDSHAKE_SECRET` / `--ssh-handshake-secret` | + +### Driver inheritance + +`[openshell.gateway]` carries a small set of values (`default_image`, +`supervisor_image`, `image_pull_policy`, `guest_tls_ca/cert/key`, +`client_tls_secret_name`, `host_gateway_ip`, `ssh_handshake_skew_secs`, +`enable_user_namespaces`) that are inherited into each driver's +`[openshell.drivers.]` table when the driver-specific table does not +override them. The allowlist is per-driver so a gateway-wide default cannot +land in a driver that does not understand it (e.g. +`client_tls_secret_name` is K8s-only). + +Driver-specific values that are not part of the inheritance allowlist +(e.g. K8s `namespace`, Podman `socket_path`, VM `vcpus`) only come from +the driver's own table. + ## Operational Constraints - Gateway TLS and client certificate distribution are deployment concerns owned diff --git a/crates/openshell-driver-docker/Cargo.toml b/crates/openshell-driver-docker/Cargo.toml index 79d4fb37d..e2c97532a 100644 --- a/crates/openshell-driver-docker/Cargo.toml +++ b/crates/openshell-driver-docker/Cargo.toml @@ -19,6 +19,7 @@ futures = { workspace = true } tokio-stream = { workspace = true } tracing = { workspace = true } bytes = { workspace = true } +serde = { workspace = true } bollard = { version = "0.20" } tar = "0.4" tempfile = "3" diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 6059596ab..785cc4cd3 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -127,7 +127,8 @@ pub trait SupervisorReadiness: Send + Sync + 'static { } /// Gateway-local configuration for the Docker compute driver. -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct DockerComputeConfig { /// Optional override for the Linux `openshell-sandbox` binary mounted into containers. pub supervisor_bin: Option, @@ -151,6 +152,19 @@ pub struct DockerComputeConfig { pub network_name: String, } +impl Default for DockerComputeConfig { + fn default() -> Self { + Self { + supervisor_bin: None, + supervisor_image: None, + guest_tls_ca: None, + guest_tls_cert: None, + guest_tls_key: None, + network_name: DEFAULT_DOCKER_NETWORK_NAME.to_string(), + } + } +} + #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct DockerGuestTlsPaths { pub(crate) ca: PathBuf, diff --git a/crates/openshell-driver-kubernetes/Cargo.toml b/crates/openshell-driver-kubernetes/Cargo.toml index 5e247dc77..c222c9c31 100644 --- a/crates/openshell-driver-kubernetes/Cargo.toml +++ b/crates/openshell-driver-kubernetes/Cargo.toml @@ -26,6 +26,7 @@ tokio-stream = { workspace = true } kube = { workspace = true } kube-runtime = { workspace = true } k8s-openapi = { workspace = true } +serde = { workspace = true } serde_json = { workspace = true } clap = { workspace = true } tracing = { workspace = true } diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index e9baf0e5c..2eab5a4c7 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -1,8 +1,15 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use openshell_core::config::{ + DEFAULT_IMAGE_PULL_POLICY, DEFAULT_K8S_NAMESPACE, DEFAULT_SSH_HANDSHAKE_SKEW_SECS, + DEFAULT_SUPERVISOR_IMAGE, +}; +use serde::{Deserialize, Serialize}; + /// How the supervisor binary is delivered into sandbox pods. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] pub enum SupervisorSideloadMethod { /// Mount the supervisor OCI image directly as a read-only volume /// (requires Kubernetes >= v1.33 with the `ImageVolume` feature gate, @@ -37,7 +44,8 @@ impl std::str::FromStr for SupervisorSideloadMethod { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct KubernetesComputeConfig { pub namespace: String, pub default_image: String, @@ -59,3 +67,23 @@ pub struct KubernetesComputeConfig { pub host_gateway_ip: String, pub enable_user_namespaces: bool, } + +impl Default for KubernetesComputeConfig { + fn default() -> Self { + Self { + namespace: DEFAULT_K8S_NAMESPACE.to_string(), + default_image: String::new(), + image_pull_policy: DEFAULT_IMAGE_PULL_POLICY.to_string(), + supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(), + supervisor_image_pull_policy: String::new(), + supervisor_sideload_method: SupervisorSideloadMethod::default(), + grpc_endpoint: String::new(), + ssh_socket_path: "/run/openshell/ssh.sock".to_string(), + ssh_handshake_secret: String::new(), + ssh_handshake_skew_secs: DEFAULT_SSH_HANDSHAKE_SKEW_SECS, + client_tls_secret_name: String::new(), + host_gateway_ip: String::new(), + enable_user_namespaces: false, + } + } +} diff --git a/crates/openshell-driver-podman/src/config.rs b/crates/openshell-driver-podman/src/config.rs index d82b8d0b0..cfb58751e 100644 --- a/crates/openshell-driver-podman/src/config.rs +++ b/crates/openshell-driver-podman/src/config.rs @@ -61,7 +61,8 @@ impl FromStr for ImagePullPolicy { } } -#[derive(Clone)] +#[derive(Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct PodmanComputeConfig { /// Path to the Podman API Unix socket. /// Default: `$XDG_RUNTIME_DIR/podman/podman.sock` (Linux), diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 9cba99045..f03f6f3cc 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -74,6 +74,7 @@ bytes = { workspace = true } pin-project-lite = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +toml = { workspace = true } tokio-stream = { workspace = true } sqlx = { workspace = true } reqwest = { workspace = true } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 3cd9e8c79..e2cfeba92 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -3,7 +3,8 @@ //! Shared CLI entrypoint for the gateway binaries. -use clap::{ArgAction, Command, CommandFactory, FromArgMatches, Parser}; +use clap::parser::ValueSource; +use clap::{ArgAction, ArgMatches, Command, CommandFactory, FromArgMatches, Parser}; use miette::{IntoDiagnostic, Result}; use openshell_core::ComputeDriverKind; use openshell_core::config::{ @@ -17,6 +18,7 @@ use tracing_subscriber::EnvFilter; use crate::certgen; use crate::compute::{DockerComputeConfig, VmComputeConfig}; +use crate::config_file::{self, ConfigFile, GatewayFileSection}; use crate::{run_server, tracing_bus::TracingLogBus}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. @@ -44,6 +46,14 @@ enum Commands { #[derive(clap::Args, Debug)] #[allow(clippy::struct_excessive_bools)] struct RunArgs { + /// Path to a TOML configuration file (see RFC 0003). + /// + /// When set, gateway-wide settings and per-driver tables are read from + /// the file. Command-line flags and `OPENSHELL_*` environment variables + /// continue to take precedence over file values. + #[arg(long, env = "OPENSHELL_GATEWAY_CONFIG")] + config: Option, + /// IP address to bind the server, health, and metrics listeners to. #[arg(long, default_value = "127.0.0.1", env = "OPENSHELL_BIND_ADDRESS")] bind_address: IpAddr, @@ -329,15 +339,28 @@ pub async fn run_cli() -> Result<()> { .install_default() .map_err(|e| miette::miette!("failed to install rustls crypto provider: {e:?}"))?; - let cli = Cli::from_arg_matches(&command().get_matches()).expect("clap validated args"); + let matches = command().get_matches(); + let cli = Cli::from_arg_matches(&matches).expect("clap validated args"); match cli.command { Some(Commands::GenerateCerts(args)) => certgen::run(args).await, - None => Box::pin(run_from_args(cli.run)).await, + None => Box::pin(run_from_args(cli.run, matches)).await, } } -async fn run_from_args(args: RunArgs) -> Result<()> { +async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { + // Load TOML file when --config / OPENSHELL_GATEWAY_CONFIG is set. + // File values are applied below for any argument that is still at its + // built-in default — CLI flags and OPENSHELL_* env vars always win. + let file: Option = if let Some(path) = args.config.clone() { + Some(config_file::load(&path).map_err(|e| miette::miette!("{e}"))?) + } else { + None + }; + if let Some(file) = file.as_ref() { + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + } + let tracing_log_bus = TracingLogBus::new(); tracing_log_bus.install_subscriber( EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), @@ -348,15 +371,15 @@ async fn run_from_args(args: RunArgs) -> Result<()> { let tls = if args.disable_tls { None } else { - let cert_path = args.tls_cert.ok_or_else(|| { + let cert_path = args.tls_cert.clone().ok_or_else(|| { miette::miette!( "--tls-cert is required when TLS is enabled (use --disable-tls to skip)" ) })?; - let key_path = args.tls_key.ok_or_else(|| { + let key_path = args.tls_key.clone().ok_or_else(|| { miette::miette!("--tls-key is required when TLS is enabled (use --disable-tls to skip)") })?; - let client_ca_path = args.tls_client_ca.ok_or_else(|| { + let client_ca_path = args.tls_client_ca.clone().ok_or_else(|| { miette::miette!( "--tls-client-ca is required when TLS is enabled (use --disable-tls to skip)" ) @@ -371,6 +394,7 @@ async fn run_from_args(args: RunArgs) -> Result<()> { let db_url = args .db_url + .clone() .ok_or_else(|| miette::miette!("--db-url is required (or set OPENSHELL_DB_URL)"))?; let mut config = openshell_core::Config::new(tls) @@ -407,73 +431,55 @@ async fn run_from_args(args: RunArgs) -> Result<()> { config = config .with_database_url(db_url) - .with_compute_drivers(args.drivers) - .with_sandbox_namespace(args.sandbox_namespace) - .with_ssh_gateway_host(args.ssh_gateway_host) + .with_compute_drivers(args.drivers.clone()) + .with_sandbox_namespace(args.sandbox_namespace.clone()) + .with_ssh_gateway_host(args.ssh_gateway_host.clone()) .with_ssh_gateway_port(args.ssh_gateway_port) .with_sandbox_ssh_port(args.sandbox_ssh_port) .with_ssh_handshake_skew_secs(args.ssh_handshake_skew_secs) - .with_server_sans(args.server_sans) + .with_server_sans(args.server_sans.clone()) .with_loopback_service_http(args.enable_loopback_service_http); - if let Some(image) = args.sandbox_image { + if let Some(image) = args.sandbox_image.clone() { config = config.with_sandbox_image(image); } - if let Some(policy) = args.sandbox_image_pull_policy { + if let Some(policy) = args.sandbox_image_pull_policy.clone() { config = config.with_sandbox_image_pull_policy(policy); } - if let Some(endpoint) = args.grpc_endpoint { + if let Some(endpoint) = args.grpc_endpoint.clone() { config = config.with_grpc_endpoint(endpoint); } - if let Some(secret) = args.ssh_handshake_secret { + if let Some(secret) = args.ssh_handshake_secret.clone() { config = config.with_ssh_handshake_secret(secret); } - if let Some(name) = args.client_tls_secret_name { + if let Some(name) = args.client_tls_secret_name.clone() { config = config.with_client_tls_secret_name(name); } - if let Some(ip) = args.host_gateway_ip { + if let Some(ip) = args.host_gateway_ip.clone() { config = config.with_host_gateway_ip(ip); } - if let Some(issuer) = args.oidc_issuer { + if let Some(issuer) = args.oidc_issuer.clone() { config = config.with_oidc(openshell_core::OidcConfig { issuer, - audience: args.oidc_audience, + audience: args.oidc_audience.clone(), jwks_ttl_secs: args.oidc_jwks_ttl, - roles_claim: args.oidc_roles_claim, - admin_role: args.oidc_admin_role, - user_role: args.oidc_user_role, - scopes_claim: args.oidc_scopes_claim, + roles_claim: args.oidc_roles_claim.clone(), + admin_role: args.oidc_admin_role.clone(), + user_role: args.oidc_user_role.clone(), + scopes_claim: args.oidc_scopes_claim.clone(), }); } config.enable_user_namespaces = args.enable_user_namespaces; - let vm_config = VmComputeConfig { - state_dir: args.vm_driver_state_dir, - driver_dir: args.driver_dir, - default_image: config.sandbox_image.clone(), - krun_log_level: args.vm_krun_log_level, - vcpus: args.vm_vcpus, - mem_mib: args.vm_mem_mib, - guest_tls_ca: args.vm_tls_ca, - guest_tls_cert: args.vm_tls_cert, - guest_tls_key: args.vm_tls_key, - }; - - let docker_config = DockerComputeConfig { - supervisor_bin: args.docker_supervisor_bin, - supervisor_image: args.docker_supervisor_image, - guest_tls_ca: args.docker_tls_ca, - guest_tls_cert: args.docker_tls_cert, - guest_tls_key: args.docker_tls_key, - network_name: args.docker_network_name, - }; + let vm_config = build_vm_config(&args, &matches, &config, file.as_ref())?; + let docker_config = build_docker_config(&args, &matches, file.as_ref())?; if args.disable_tls { info!("TLS disabled — listening on plaintext HTTP"); @@ -483,15 +489,269 @@ async fn run_from_args(args: RunArgs) -> Result<()> { info!(bind = %config.bind_address, "Starting OpenShell server"); - run_server(config, vm_config, docker_config, tracing_log_bus) - .await - .into_diagnostic() + Box::pin(run_server( + config, + vm_config, + docker_config, + file, + tracing_log_bus, + )) + .await + .into_diagnostic() } fn parse_compute_driver(value: &str) -> std::result::Result { value.parse() } +/// Returns `true` when an argument's value came from clap's built-in default +/// (or was never supplied at all). When the predicate is `true`, the loader +/// is free to replace the value with one read from the TOML config file. +fn arg_defaulted(matches: &ArgMatches, id: &str) -> bool { + matches!( + matches.value_source(id), + None | Some(ValueSource::DefaultValue) + ) +} + +/// Apply gateway-wide values from `[openshell.gateway]` onto `RunArgs` for +/// every argument that is still sourced from clap's built-in default. +/// +/// The function intentionally does not touch `database_url` or +/// `ssh_handshake_secret` — those secrets are env-only and the loader +/// already rejected them when they appear in the file. +fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: &ArgMatches) { + if let Some(addr) = file.bind_address { + if arg_defaulted(matches, "bind_address") { + args.bind_address = addr.ip(); + } + if arg_defaulted(matches, "port") { + args.port = addr.port(); + } + } + if let Some(addr) = file.health_bind_address + && arg_defaulted(matches, "health_port") + { + args.health_port = addr.port(); + } + if let Some(addr) = file.metrics_bind_address + && arg_defaulted(matches, "metrics_port") + { + args.metrics_port = addr.port(); + } + if let Some(level) = &file.log_level + && arg_defaulted(matches, "log_level") + { + args.log_level.clone_from(level); + } + if let Some(drivers) = &file.compute_drivers + && arg_defaulted(matches, "drivers") + { + args.drivers.clone_from(drivers); + } + if let Some(ns) = &file.sandbox_namespace + && arg_defaulted(matches, "sandbox_namespace") + { + args.sandbox_namespace.clone_from(ns); + } + if let Some(port) = file.sandbox_ssh_port + && arg_defaulted(matches, "sandbox_ssh_port") + { + args.sandbox_ssh_port = port; + } + if let Some(host) = &file.ssh_gateway_host + && arg_defaulted(matches, "ssh_gateway_host") + { + args.ssh_gateway_host.clone_from(host); + } + if let Some(port) = file.ssh_gateway_port + && arg_defaulted(matches, "ssh_gateway_port") + { + args.ssh_gateway_port = port; + } + if let Some(skew) = file.ssh_handshake_skew_secs + && arg_defaulted(matches, "ssh_handshake_skew_secs") + { + args.ssh_handshake_skew_secs = skew; + } + if let Some(image) = &file.default_image + && args.sandbox_image.is_none() + && arg_defaulted(matches, "sandbox_image") + { + args.sandbox_image = Some(image.clone()); + } + if let Some(policy) = &file.image_pull_policy + && args.sandbox_image_pull_policy.is_none() + && arg_defaulted(matches, "sandbox_image_pull_policy") + { + args.sandbox_image_pull_policy = Some(policy.clone()); + } + if let Some(secret) = &file.client_tls_secret_name + && args.client_tls_secret_name.is_none() + && arg_defaulted(matches, "client_tls_secret_name") + { + args.client_tls_secret_name = Some(secret.clone()); + } + if let Some(ip) = &file.host_gateway_ip + && args.host_gateway_ip.is_none() + && arg_defaulted(matches, "host_gateway_ip") + { + args.host_gateway_ip = Some(ip.clone()); + } + if let Some(enabled) = file.enable_user_namespaces + && arg_defaulted(matches, "enable_user_namespaces") + { + args.enable_user_namespaces = enabled; + } + if let Some(sans) = &file.server_sans + && args.server_sans.is_empty() + && arg_defaulted(matches, "server_sans") + { + args.server_sans.clone_from(sans); + } + if let Some(enabled) = file.enable_loopback_service_http + && arg_defaulted(matches, "enable_loopback_service_http") + { + args.enable_loopback_service_http = enabled; + } + // TLS gateway listener fields + if let Some(tls) = &file.tls { + if args.tls_cert.is_none() && arg_defaulted(matches, "tls_cert") { + args.tls_cert = Some(tls.cert_path.clone()); + } + if args.tls_key.is_none() && arg_defaulted(matches, "tls_key") { + args.tls_key = Some(tls.key_path.clone()); + } + if args.tls_client_ca.is_none() && arg_defaulted(matches, "tls_client_ca") { + args.tls_client_ca = Some(tls.client_ca_path.clone()); + } + if tls.allow_unauthenticated && arg_defaulted(matches, "disable_gateway_auth") { + args.disable_gateway_auth = true; + } + } + // OIDC fields + if let Some(oidc) = &file.oidc { + if args.oidc_issuer.is_none() && arg_defaulted(matches, "oidc_issuer") { + args.oidc_issuer = Some(oidc.issuer.clone()); + } + if arg_defaulted(matches, "oidc_audience") { + args.oidc_audience.clone_from(&oidc.audience); + } + if arg_defaulted(matches, "oidc_jwks_ttl") { + args.oidc_jwks_ttl = oidc.jwks_ttl_secs; + } + if arg_defaulted(matches, "oidc_roles_claim") { + args.oidc_roles_claim.clone_from(&oidc.roles_claim); + } + if arg_defaulted(matches, "oidc_admin_role") { + args.oidc_admin_role.clone_from(&oidc.admin_role); + } + if arg_defaulted(matches, "oidc_user_role") { + args.oidc_user_role.clone_from(&oidc.user_role); + } + if arg_defaulted(matches, "oidc_scopes_claim") { + args.oidc_scopes_claim.clone_from(&oidc.scopes_claim); + } + } +} + +/// Build [`VmComputeConfig`] by overlaying CLI args on top of the +/// `[openshell.drivers.vm]` table inherited from `[openshell.gateway]`. +fn build_vm_config( + args: &RunArgs, + matches: &ArgMatches, + config: &openshell_core::Config, + file: Option<&ConfigFile>, +) -> Result { + let mut cfg = if let Some(file) = file { + let merged = config_file::driver_table( + ComputeDriverKind::Vm, + &file.openshell.gateway, + file.openshell.drivers.get("vm"), + ); + merged + .try_into::() + .map_err(|e| miette::miette!("invalid [openshell.drivers.vm] table: {e}"))? + } else { + VmComputeConfig::default() + }; + + // CLI/env overrides — and `state_dir` is also pulled from RunArgs when the + // file did not set it, so the gateway always has a working directory. + if !arg_defaulted(matches, "vm_driver_state_dir") || cfg.state_dir.as_os_str().is_empty() { + cfg.state_dir.clone_from(&args.vm_driver_state_dir); + } + if !arg_defaulted(matches, "driver_dir") || cfg.driver_dir.is_none() { + cfg.driver_dir.clone_from(&args.driver_dir); + } + if !arg_defaulted(matches, "vm_krun_log_level") { + cfg.krun_log_level = args.vm_krun_log_level; + } + if !arg_defaulted(matches, "vm_vcpus") { + cfg.vcpus = args.vm_vcpus; + } + if !arg_defaulted(matches, "vm_mem_mib") { + cfg.mem_mib = args.vm_mem_mib; + } + if let Some(p) = args.vm_tls_ca.clone() { + cfg.guest_tls_ca = Some(p); + } + if let Some(p) = args.vm_tls_cert.clone() { + cfg.guest_tls_cert = Some(p); + } + if let Some(p) = args.vm_tls_key.clone() { + cfg.guest_tls_key = Some(p); + } + // Fall through: image inherited from gateway-wide `sandbox_image` when + // the merged table did not supply `default_image`. + if cfg.default_image.is_empty() { + cfg.default_image.clone_from(&config.sandbox_image); + } + Ok(cfg) +} + +/// Build [`DockerComputeConfig`] using the same inheritance pattern as +/// [`build_vm_config`]. +fn build_docker_config( + args: &RunArgs, + matches: &ArgMatches, + file: Option<&ConfigFile>, +) -> Result { + let mut cfg = if let Some(file) = file { + let merged = config_file::driver_table( + ComputeDriverKind::Docker, + &file.openshell.gateway, + file.openshell.drivers.get("docker"), + ); + merged + .try_into::() + .map_err(|e| miette::miette!("invalid [openshell.drivers.docker] table: {e}"))? + } else { + DockerComputeConfig::default() + }; + + if args.docker_supervisor_bin.is_some() { + cfg.supervisor_bin.clone_from(&args.docker_supervisor_bin); + } + if args.docker_supervisor_image.is_some() { + cfg.supervisor_image + .clone_from(&args.docker_supervisor_image); + } + if args.docker_tls_ca.is_some() { + cfg.guest_tls_ca.clone_from(&args.docker_tls_ca); + } + if args.docker_tls_cert.is_some() { + cfg.guest_tls_cert.clone_from(&args.docker_tls_cert); + } + if args.docker_tls_key.is_some() { + cfg.guest_tls_key.clone_from(&args.docker_tls_key); + } + if !arg_defaulted(matches, "docker_network_name") { + cfg.network_name.clone_from(&args.docker_network_name); + } + Ok(cfg) +} + #[cfg(test)] mod tests { use super::{Cli, command}; @@ -716,4 +976,219 @@ mod tests { assert!(cli.command.is_none()); assert!(cli.run.db_url.is_none()); } + + // ── Config-file merge tests ────────────────────────────────────────── + // + // `merge_file_into_args` is the bridge between `config_file::ConfigFile` + // and `RunArgs`. These cases lock in the precedence rule: + // + // CLI flag > OPENSHELL_* env var > TOML file > built-in default + // + // by exercising each combination on representative gateway fields. + + use super::{ConfigFile, merge_file_into_args}; + use clap::FromArgMatches; + + fn parse_with_args(argv: &[&str]) -> (super::RunArgs, clap::ArgMatches) { + let matches = command().try_get_matches_from(argv).expect("parses"); + let cli = Cli::from_arg_matches(&matches).expect("from arg matches"); + (cli.run, matches) + } + + fn config_file_from_toml(toml: &str) -> ConfigFile { + toml::from_str(toml).expect("valid TOML in test fixture") + } + + #[test] + fn file_value_applies_when_cli_uses_default() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_BIND_ADDRESS"); + let _g2 = EnvVarGuard::remove("OPENSHELL_SERVER_PORT"); + let _g3 = EnvVarGuard::remove("OPENSHELL_LOG_LEVEL"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +bind_address = "0.0.0.0:9090" +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.bind_address, IpAddr::V4(Ipv4Addr::UNSPECIFIED)); + assert_eq!(args.port, 9090); + assert_eq!(args.log_level, "debug"); + } + + #[test] + fn cli_flag_overrides_file_value() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_BIND_ADDRESS"); + let _g2 = EnvVarGuard::remove("OPENSHELL_LOG_LEVEL"); + + let (mut args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--log-level", + "warn", + ]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.log_level, "warn", "CLI flag must win over file"); + } + + #[test] + fn env_var_overrides_file_value() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::set("OPENSHELL_LOG_LEVEL", "trace"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.log_level, "trace", "env var must win over file"); + } + + #[test] + fn file_oidc_block_populates_oidc_args() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_OIDC_ISSUER"); + let _g2 = EnvVarGuard::remove("OPENSHELL_OIDC_AUDIENCE"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway.oidc] +issuer = "https://idp.example.com" +audience = "openshell-cli" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.oidc_issuer.as_deref(), Some("https://idp.example.com")); + assert_eq!(args.oidc_audience, "openshell-cli"); + } + + #[test] + fn file_populates_service_routing_fields() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_SERVER_SAN"); + let _g2 = EnvVarGuard::remove("OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +server_sans = ["gateway.local", "*.dev.openshell.localhost"] +enable_loopback_service_http = false +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!( + args.server_sans, + vec![ + "gateway.local".to_string(), + "*.dev.openshell.localhost".to_string() + ] + ); + assert!(!args.enable_loopback_service_http); + } + + #[test] + fn env_var_overrides_file_loopback_service_http() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::set("OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP", "true"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r" +[openshell.gateway] +enable_loopback_service_http = false +", + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert!( + args.enable_loopback_service_http, + "env var must win over file" + ); + } + + #[test] + fn driver_inherits_shared_image_from_gateway_section() { + // [openshell.gateway].default_image inherits into the K8s driver + // table when the driver-specific table does not set it. + let file = config_file_from_toml( + r#" +[openshell.gateway] +default_image = "ghcr.io/nvidia/openshell/sandbox:1.0" + +[openshell.drivers.kubernetes] +namespace = "agents" +"#, + ); + let merged = crate::config_file::driver_table( + super::ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + let parsed = merged + .try_into::() + .expect("merged table deserializes"); + assert_eq!(parsed.default_image, "ghcr.io/nvidia/openshell/sandbox:1.0"); + assert_eq!(parsed.namespace, "agents"); + } + + #[test] + fn driver_specific_value_overrides_gateway_inheritance() { + let file = config_file_from_toml( + r#" +[openshell.gateway] +default_image = "gateway-default:1.0" + +[openshell.drivers.kubernetes] +default_image = "k8s-specific:1.0" +"#, + ); + let merged = crate::config_file::driver_table( + super::ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + let parsed = merged + .try_into::() + .expect("deserializes"); + assert_eq!(parsed.default_image, "k8s-specific:1.0"); + } } diff --git a/crates/openshell-server/src/compute/vm.rs b/crates/openshell-server/src/compute/vm.rs index 1e62d4942..3a67f381b 100644 --- a/crates/openshell-server/src/compute/vm.rs +++ b/crates/openshell-server/src/compute/vm.rs @@ -60,7 +60,8 @@ const COMPUTE_DRIVER_SOCKET_RUN_DIR: &str = "run"; const COMPUTE_DRIVER_SOCKET_NAME: &str = "compute-driver.sock"; /// Configuration for launching and talking to the VM compute driver. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct VmComputeConfig { /// Working directory for VM driver sandbox state. pub state_dir: PathBuf, diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs new file mode 100644 index 000000000..31348e386 --- /dev/null +++ b/crates/openshell-server/src/config_file.rs @@ -0,0 +1,518 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! TOML configuration file loader for the gateway. +//! +//! See `rfc/0003-gateway-configuration/README.md` for the file format. This +//! module parses the file into [`ConfigFile`], rejects fields that must be +//! supplied via env/CLI (database URL, SSH handshake secret), and provides +//! [`driver_table`] which overlays shared `[openshell.gateway]` defaults onto +//! a `[openshell.drivers.]` table so each driver crate's +//! `Deserialize` impl sees a fully-populated table. +//! +//! The merge precedence at the gateway level is: +//! ```text +//! CLI flag > OPENSHELL_* env var > TOML file > built-in default +//! ``` +//! Per-field application of file values happens in [`crate::cli`], which uses +//! clap's `ArgMatches::value_source` to detect arguments that fell back to +//! their default and are therefore eligible for replacement by file values. + +use std::collections::BTreeMap; +use std::net::SocketAddr; +use std::path::{Path, PathBuf}; + +use openshell_core::config::ComputeDriverKind; +use openshell_core::{OidcConfig, TlsConfig}; +use serde::{Deserialize, Serialize}; + +/// Latest schema version this build understands. +pub const SCHEMA_VERSION: u32 = 1; + +/// Root of the gateway TOML config file. +/// +/// The file is rooted at `[openshell]` to reserve room for future components +/// (CLI, sandbox, router) to share a single config file without key +/// collisions. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct ConfigFile { + #[serde(default)] + pub openshell: OpenShellRoot, +} + +/// `[openshell]` table. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct OpenShellRoot { + /// Reserved for future schema migrations. Versions greater than + /// [`SCHEMA_VERSION`] are rejected at load time. + #[serde(default)] + pub version: Option, + + #[serde(default)] + pub gateway: GatewayFileSection, + + /// `[openshell.drivers.]` tables — passed verbatim to each driver + /// crate's `Deserialize` impl after the gateway-side inheritance merge. + /// Stored as raw [`toml::Value`] so each driver can evolve its schema + /// independently of this crate. + #[serde(default)] + pub drivers: BTreeMap, +} + +/// `[openshell.gateway]` section. +/// +/// All fields are `Option` so the loader can tell whether a key was set +/// in the file (`Some`) or not (`None` — value is taken from CLI/env/default). +/// +/// The fields under "Shared driver defaults" are inherited into +/// `[openshell.drivers.]` tables per [`inheritable_keys`]. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct GatewayFileSection { + // ── Listeners ──────────────────────────────────────────────────────── + #[serde(default)] + pub bind_address: Option, + #[serde(default)] + pub health_bind_address: Option, + #[serde(default)] + pub metrics_bind_address: Option, + + // ── Logging ────────────────────────────────────────────────────────── + #[serde(default)] + pub log_level: Option, + + // ── Drivers ────────────────────────────────────────────────────────── + #[serde(default)] + pub compute_drivers: Option>, + + // ── Sandbox / SSH ──────────────────────────────────────────────────── + #[serde(default)] + pub sandbox_namespace: Option, + #[serde(default)] + pub sandbox_ssh_port: Option, + #[serde(default)] + pub ssh_gateway_host: Option, + #[serde(default)] + pub ssh_gateway_port: Option, + #[serde(default)] + pub ssh_handshake_skew_secs: Option, + #[serde(default)] + pub ssh_session_ttl_secs: Option, + + // ── Service routing ────────────────────────────────────────────────── + /// Subject Alternative Names configured on the gateway server certificate. + /// Wildcard DNS SANs also enable sandbox service URLs under that domain. + #[serde(default)] + pub server_sans: Option>, + /// Enable plaintext HTTP routing for loopback sandbox service URLs. + #[serde(default)] + pub enable_loopback_service_http: Option, + + // ── Shared driver defaults (inherited into [openshell.drivers.]) ─ + #[serde(default)] + pub default_image: Option, + #[serde(default)] + pub supervisor_image: Option, + #[serde(default)] + pub image_pull_policy: Option, + #[serde(default)] + pub client_tls_secret_name: Option, + #[serde(default)] + pub host_gateway_ip: Option, + #[serde(default)] + pub enable_user_namespaces: Option, + #[serde(default)] + pub guest_tls_ca: Option, + #[serde(default)] + pub guest_tls_cert: Option, + #[serde(default)] + pub guest_tls_key: Option, + + // ── Nested tables ──────────────────────────────────────────────────── + #[serde(default)] + pub tls: Option, + #[serde(default)] + pub oidc: Option, + + // ── Disallowed-in-file fields ──────────────────────────────────────── + // + // Captured so we can produce a friendly "set this via env/CLI instead" + // error rather than a generic "unknown field" message. Validated and + // rejected in [`load`]. + #[serde(default)] + pub database_url: Option, + #[serde(default)] + pub ssh_handshake_secret: Option, +} + +#[derive(Debug, thiserror::Error)] +pub enum ConfigFileError { + #[error("failed to read gateway config file '{}': {source}", path.display())] + Io { + path: PathBuf, + #[source] + source: std::io::Error, + }, + #[error("failed to parse gateway config file '{}': {source}", path.display())] + Parse { + path: PathBuf, + #[source] + source: toml::de::Error, + }, + #[error( + "unsupported gateway config version {version}; this build only supports version {SCHEMA_VERSION}" + )] + UnsupportedVersion { version: u32 }, + #[error( + "`{field}` is not allowed in the gateway config file — set the {env} env var or pass {cli} on the command line" + )] + SecretInFile { + field: &'static str, + env: &'static str, + cli: &'static str, + }, +} + +/// Load and validate a TOML config file. +/// +/// Returns `Ok(ConfigFile::default())` for an empty file (the gateway then +/// falls back entirely to CLI/env/built-in defaults). +pub fn load(path: &Path) -> Result { + let contents = std::fs::read_to_string(path).map_err(|source| ConfigFileError::Io { + path: path.to_path_buf(), + source, + })?; + if contents.trim().is_empty() { + return Ok(ConfigFile::default()); + } + let file: ConfigFile = toml::from_str(&contents).map_err(|source| ConfigFileError::Parse { + path: path.to_path_buf(), + source, + })?; + + if let Some(version) = file.openshell.version + && version > SCHEMA_VERSION + { + return Err(ConfigFileError::UnsupportedVersion { version }); + } + + if file.openshell.gateway.database_url.is_some() { + return Err(ConfigFileError::SecretInFile { + field: "database_url", + env: "OPENSHELL_DB_URL", + cli: "--db-url", + }); + } + if file.openshell.gateway.ssh_handshake_secret.is_some() { + return Err(ConfigFileError::SecretInFile { + field: "ssh_handshake_secret", + env: "OPENSHELL_SSH_HANDSHAKE_SECRET", + cli: "--ssh-handshake-secret", + }); + } + + Ok(file) +} + +/// Build the merged TOML table for `driver` by overlaying inheritable +/// `[openshell.gateway]` defaults onto `[openshell.drivers.]`. +/// +/// The returned [`toml::Value`] is a Table ready to feed into the driver's +/// `Deserialize` impl — keys present in `raw` win over the gateway defaults. +/// Keys outside [`inheritable_keys`] for this driver are never copied from +/// the gateway section, which keeps each driver's `deny_unknown_fields` +/// invariant intact. +pub fn driver_table( + driver: ComputeDriverKind, + gateway: &GatewayFileSection, + raw: Option<&toml::Value>, +) -> toml::Value { + let mut merged = match raw { + Some(toml::Value::Table(table)) => table.clone(), + _ => toml::Table::new(), + }; + + for key in inheritable_keys(driver) { + if merged.contains_key(*key) { + continue; + } + if let Some(value) = gateway_inherited_value(gateway, key) { + merged.insert((*key).to_string(), value); + } + } + + toml::Value::Table(merged) +} + +/// Inheritance allowlist (the Q4 "high-overlap set"). Each driver opts in +/// to a specific subset so a gateway-wide default does not accidentally land +/// in a driver table that does not understand the field. +fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { + match driver { + ComputeDriverKind::Kubernetes => &[ + "default_image", + "supervisor_image", + "image_pull_policy", + "client_tls_secret_name", + "host_gateway_ip", + "ssh_handshake_skew_secs", + "enable_user_namespaces", + ], + ComputeDriverKind::Docker => &[ + "supervisor_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + ComputeDriverKind::Podman => &[ + "default_image", + "supervisor_image", + "image_pull_policy", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + "ssh_handshake_skew_secs", + ], + ComputeDriverKind::Vm => &[ + "default_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + } +} + +fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option { + match key { + "default_image" => g.default_image.as_deref().map(string_value), + "supervisor_image" => g.supervisor_image.as_deref().map(string_value), + "image_pull_policy" => g.image_pull_policy.as_deref().map(string_value), + "client_tls_secret_name" => g.client_tls_secret_name.as_deref().map(string_value), + "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), + "ssh_handshake_skew_secs" => g.ssh_handshake_skew_secs.and_then(skew_value), + "enable_user_namespaces" => g.enable_user_namespaces.map(toml::Value::Boolean), + "guest_tls_ca" => g.guest_tls_ca.as_deref().map(path_value), + "guest_tls_cert" => g.guest_tls_cert.as_deref().map(path_value), + "guest_tls_key" => g.guest_tls_key.as_deref().map(path_value), + _ => None, + } +} + +fn string_value(s: &str) -> toml::Value { + toml::Value::String(s.to_owned()) +} + +fn path_value(p: &Path) -> toml::Value { + toml::Value::String(p.display().to_string()) +} + +fn skew_value(n: u64) -> Option { + i64::try_from(n).ok().map(toml::Value::Integer) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + fn write_tmp(contents: &str) -> tempfile::NamedTempFile { + let mut tmp = tempfile::Builder::new() + .suffix(".toml") + .tempfile() + .expect("tempfile"); + tmp.write_all(contents.as_bytes()).expect("write"); + tmp + } + + #[test] + fn empty_file_yields_default_config() { + let tmp = write_tmp(""); + let file = load(tmp.path()).expect("empty file parses"); + assert!(file.openshell.version.is_none()); + assert!(file.openshell.gateway.bind_address.is_none()); + assert!(file.openshell.drivers.is_empty()); + } + + #[test] + fn parses_full_example() { + let toml = r#" +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +log_level = "info" +compute_drivers = ["kubernetes"] +sandbox_namespace = "agents" +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +image_pull_policy = "IfNotPresent" +client_tls_secret_name = "openshell-sandbox-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +"#; + let tmp = write_tmp(toml); + let file = load(tmp.path()).expect("valid file parses"); + let gw = &file.openshell.gateway; + assert_eq!(gw.log_level.as_deref(), Some("info")); + assert_eq!( + gw.default_image.as_deref(), + Some("ghcr.io/nvidia/openshell/sandbox:latest") + ); + assert!(gw.tls.is_some()); + assert!(gw.oidc.is_some()); + assert!(file.openshell.drivers.contains_key("kubernetes")); + } + + #[test] + fn rejects_database_url_in_file() { + let toml = r#" +[openshell.gateway] +database_url = "sqlite::memory:" +"#; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("database_url must be rejected"); + assert!(matches!( + err, + ConfigFileError::SecretInFile { + field: "database_url", + .. + } + )); + } + + #[test] + fn rejects_ssh_handshake_secret_in_file() { + let toml = r#" +[openshell.gateway] +ssh_handshake_secret = "leaked" +"#; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("ssh_handshake_secret must be rejected"); + assert!(matches!( + err, + ConfigFileError::SecretInFile { + field: "ssh_handshake_secret", + .. + } + )); + } + + #[test] + fn rejects_unknown_gateway_field() { + let toml = r" +[openshell.gateway] +nonsense = true +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("unknown field must be rejected"); + assert!(matches!(err, ConfigFileError::Parse { .. })); + } + + #[test] + fn rejects_unsupported_version() { + let toml = r" +[openshell] +version = 2 +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("version > 1 must be rejected"); + assert!(matches!( + err, + ConfigFileError::UnsupportedVersion { version: 2 } + )); + } + + #[test] + fn driver_table_inherits_gateway_defaults() { + let gateway = GatewayFileSection { + default_image: Some("ghcr.io/nvidia/openshell/sandbox:0.9".to_string()), + supervisor_image: Some("ghcr.io/nvidia/openshell/supervisor:0.9".to_string()), + image_pull_policy: Some("IfNotPresent".to_string()), + ..Default::default() + }; + let raw = toml::toml! { + namespace = "agents" + }; + let merged = driver_table( + ComputeDriverKind::Kubernetes, + &gateway, + Some(&toml::Value::Table(raw)), + ); + let table = merged.as_table().expect("table"); + assert_eq!( + table.get("namespace").and_then(|v| v.as_str()), + Some("agents") + ); + assert_eq!( + table.get("default_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/sandbox:0.9") + ); + assert_eq!( + table.get("supervisor_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/supervisor:0.9") + ); + } + + #[test] + fn driver_table_specific_value_overrides_gateway_default() { + let gateway = GatewayFileSection { + default_image: Some("gateway-default".to_string()), + ..Default::default() + }; + let raw = toml::toml! { + default_image = "driver-specific" + }; + let merged = driver_table( + ComputeDriverKind::Podman, + &gateway, + Some(&toml::Value::Table(raw)), + ); + assert_eq!( + merged + .as_table() + .unwrap() + .get("default_image") + .and_then(|v| v.as_str()), + Some("driver-specific") + ); + } + + #[test] + fn driver_table_does_not_leak_keys_outside_allowlist() { + // `client_tls_secret_name` is K8s-only; Docker must not receive it + // even when set at gateway scope. + let gateway = GatewayFileSection { + client_tls_secret_name: Some("openshell-sandbox-tls".to_string()), + ..Default::default() + }; + let merged = driver_table(ComputeDriverKind::Docker, &gateway, None); + assert!( + !merged + .as_table() + .unwrap() + .contains_key("client_tls_secret_name") + ); + } + + #[test] + fn missing_path_is_io_error() { + let err = load(Path::new("/nonexistent/openshell-gateway.toml")) + .expect_err("missing file must be io error"); + assert!(matches!(err, ConfigFileError::Io { .. })); + } +} diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 93ccdc9dc..49ae1ee55 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -23,6 +23,7 @@ mod auth; pub mod certgen; pub mod cli; mod compute; +pub mod config_file; mod grpc; mod http; mod inference; @@ -152,6 +153,7 @@ pub async fn run_server( config: Config, vm_config: VmComputeConfig, docker_config: DockerComputeConfig, + config_file: Option, tracing_log_bus: TracingLogBus, ) -> Result<()> { let database_url = config.database_url.trim(); @@ -194,6 +196,7 @@ pub async fn run_server( &config, &vm_config, &docker_config, + config_file.as_ref(), store.clone(), sandbox_index.clone(), sandbox_watch_bus.clone(), @@ -551,6 +554,7 @@ async fn build_compute_runtime( config: &Config, vm_config: &VmComputeConfig, docker_config: &DockerComputeConfig, + file: Option<&config_file::ConfigFile>, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, @@ -562,37 +566,42 @@ async fn build_compute_runtime( match driver { ComputeDriverKind::Kubernetes => { - let supervisor_image = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()); - let supervisor_image_pull_policy = - std::env::var("OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_default(); + let mut k8s = kubernetes_config_from_file(file)?; + // Env overrides file for fields not represented in Config. + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") + && !v.is_empty() + { + k8s.supervisor_image = v; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY") + && !v.is_empty() + { + k8s.supervisor_image_pull_policy = v; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_SIDELOAD_METHOD") + && !v.is_empty() + && let Ok(parsed) = v.parse() + { + k8s.supervisor_sideload_method = parsed; + } + // Shared fields are sourced from Config, which already merged + // file + CLI/env at startup. + k8s.namespace.clone_from(&config.sandbox_namespace); + k8s.default_image.clone_from(&config.sandbox_image); + k8s.image_pull_policy + .clone_from(&config.sandbox_image_pull_policy); + k8s.grpc_endpoint.clone_from(&config.grpc_endpoint); + k8s.ssh_socket_path + .clone_from(&config.sandbox_ssh_socket_path); + k8s.ssh_handshake_secret + .clone_from(&config.ssh_handshake_secret); + k8s.ssh_handshake_skew_secs = config.ssh_handshake_skew_secs; + k8s.client_tls_secret_name + .clone_from(&config.client_tls_secret_name); + k8s.host_gateway_ip.clone_from(&config.host_gateway_ip); + k8s.enable_user_namespaces = config.enable_user_namespaces; ComputeRuntime::new_kubernetes( - KubernetesComputeConfig { - namespace: config.sandbox_namespace.clone(), - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.clone(), - supervisor_image, - supervisor_image_pull_policy, - supervisor_sideload_method: std::env::var( - "OPENSHELL_SUPERVISOR_SIDELOAD_METHOD", - ) - .ok() - .filter(|s| !s.is_empty()) - .and_then(|s| s.parse().ok()) - .unwrap_or_default(), - grpc_endpoint: config.grpc_endpoint.clone(), - ssh_socket_path: config.sandbox_ssh_socket_path.clone(), - ssh_handshake_secret: config.ssh_handshake_secret.clone(), - ssh_handshake_skew_secs: config.ssh_handshake_skew_secs, - client_tls_secret_name: config.client_tls_secret_name.clone(), - host_gateway_ip: config.host_gateway_ip.clone(), - enable_user_namespaces: config.enable_user_namespaces, - }, + k8s, store, sandbox_index, sandbox_watch_bus, @@ -628,63 +637,60 @@ async fn build_compute_runtime( .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) } ComputeDriverKind::Podman => { - let socket_path = std::env::var("OPENSHELL_PODMAN_SOCKET") - .ok() - .filter(|s| !s.is_empty()) - .map_or_else( - openshell_driver_podman::PodmanComputeConfig::default_socket_path, - std::path::PathBuf::from, - ); - - let network_name = std::env::var("OPENSHELL_NETWORK_NAME") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_NETWORK_NAME.to_string()); - - let stop_timeout_secs: u32 = std::env::var("OPENSHELL_STOP_TIMEOUT") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS); - - let supervisor_image = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()); - - // TLS client cert paths for sandbox mTLS. When all three are - // set, the Podman driver bind-mounts them into sandbox - // containers and switches the endpoint to https://. - let podman_tls_ca = std::env::var("OPENSHELL_PODMAN_TLS_CA") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); - let podman_tls_cert = std::env::var("OPENSHELL_PODMAN_TLS_CERT") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); - let podman_tls_key = std::env::var("OPENSHELL_PODMAN_TLS_KEY") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); + let mut podman = podman_config_from_file(file)?; + // Env overrides file for fields not represented in Config. + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_SOCKET") + && !v.is_empty() + { + podman.socket_path = std::path::PathBuf::from(v); + } + if let Ok(v) = std::env::var("OPENSHELL_NETWORK_NAME") + && !v.is_empty() + { + podman.network_name = v; + } + if let Ok(v) = std::env::var("OPENSHELL_STOP_TIMEOUT") + && let Ok(parsed) = v.parse() + { + podman.stop_timeout_secs = parsed; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") + && !v.is_empty() + { + podman.supervisor_image = v; + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_CA") + && !v.is_empty() + { + podman.guest_tls_ca = Some(std::path::PathBuf::from(v)); + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_CERT") + && !v.is_empty() + { + podman.guest_tls_cert = Some(std::path::PathBuf::from(v)); + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_KEY") + && !v.is_empty() + { + podman.guest_tls_key = Some(std::path::PathBuf::from(v)); + } + // Shared fields are sourced from Config (which already merged + // file + CLI/env at startup). + podman.default_image.clone_from(&config.sandbox_image); + podman.image_pull_policy = config.sandbox_image_pull_policy.parse().unwrap_or_default(); + podman.grpc_endpoint.clone_from(&config.grpc_endpoint); + podman.gateway_port = config.bind_address.port(); + podman + .sandbox_ssh_socket_path + .clone_from(&config.sandbox_ssh_socket_path); + podman.ssh_port = config.sandbox_ssh_port; + podman + .ssh_handshake_secret + .clone_from(&config.ssh_handshake_secret); + podman.ssh_handshake_skew_secs = config.ssh_handshake_skew_secs; ComputeRuntime::new_podman( - openshell_driver_podman::PodmanComputeConfig { - socket_path, - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.parse().unwrap_or_default(), - grpc_endpoint: config.grpc_endpoint.clone(), - gateway_port: config.bind_address.port(), - sandbox_ssh_socket_path: config.sandbox_ssh_socket_path.clone(), - network_name, - ssh_port: config.sandbox_ssh_port, - ssh_handshake_secret: config.ssh_handshake_secret.clone(), - ssh_handshake_skew_secs: config.ssh_handshake_skew_secs, - stop_timeout_secs, - supervisor_image, - guest_tls_ca: podman_tls_ca, - guest_tls_cert: podman_tls_cert, - guest_tls_key: podman_tls_key, - }, + podman, store, sandbox_index, sandbox_watch_bus, @@ -697,6 +703,43 @@ async fn build_compute_runtime( } } +/// Build a [`KubernetesComputeConfig`] from the file's +/// `[openshell.drivers.kubernetes]` table merged with inheritable +/// `[openshell.gateway]` defaults. Falls back to the driver's `Default` +/// when no file is present. +fn kubernetes_config_from_file( + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Ok(KubernetesComputeConfig::default()); + }; + let merged = config_file::driver_table( + ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + merged + .try_into() + .map_err(|e| Error::config(format!("invalid [openshell.drivers.kubernetes] table: {e}"))) +} + +/// Same pattern as [`kubernetes_config_from_file`] but for Podman. +fn podman_config_from_file( + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Ok(openshell_driver_podman::PodmanComputeConfig::default()); + }; + let merged = config_file::driver_table( + ComputeDriverKind::Podman, + &file.openshell.gateway, + file.openshell.drivers.get("podman"), + ); + merged + .try_into() + .map_err(|e| Error::config(format!("invalid [openshell.drivers.podman] table: {e}"))) +} + fn configured_compute_driver(config: &Config) -> Result { match config.compute_drivers.as_slice() { [] => openshell_core::config::detect_driver().ok_or_else(|| { diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml new file mode 100644 index 000000000..3ebbac582 --- /dev/null +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -0,0 +1,101 @@ +{{/* +ConfigMap holding the gateway TOML config file (RFC 0003). + +The gateway reads `/etc/openshell/gateway.toml` (mounted from this ConfigMap) +at startup. CLI flags and OPENSHELL_* env vars on the StatefulSet container +still override anything in this file. + +Two values are intentionally NOT rendered here: + - server.dbUrl → passed via --db-url in the StatefulSet args + - sshHandshake secret → injected as OPENSHELL_SSH_HANDSHAKE_SECRET env var + from a Kubernetes Secret reference. +*/}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "openshell.fullname" . }}-config + labels: + {{- include "openshell.labels" . | nindent 4 }} +data: + gateway.toml: | + [openshell] + version = 1 + + [openshell.gateway] + bind_address = "0.0.0.0:{{ .Values.service.port }}" + {{- if .Values.service.healthPort }} + health_bind_address = "0.0.0.0:{{ .Values.service.healthPort }}" + {{- end }} + {{- if .Values.service.metricsPort }} + metrics_bind_address = "0.0.0.0:{{ .Values.service.metricsPort }}" + {{- end }} + log_level = {{ .Values.server.logLevel | quote }} + sandbox_namespace = {{ include "openshell.sandboxNamespace" . | quote }} + default_image = {{ .Values.server.sandboxImage | quote }} + supervisor_image = {{ include "openshell.supervisorImage" . | quote }} + {{- if .Values.server.sandboxImagePullPolicy }} + image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} + {{- end }} + {{- if .Values.supervisor.image.pullPolicy }} + supervisor_image_pull_policy = {{ .Values.supervisor.image.pullPolicy | quote }} + {{- end }} + grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} + {{- if .Values.server.sshGatewayHost }} + ssh_gateway_host = {{ .Values.server.sshGatewayHost | quote }} + {{- end }} + {{- if .Values.server.sshGatewayPort }} + ssh_gateway_port = {{ .Values.server.sshGatewayPort }} + {{- end }} + {{- if .Values.server.hostGatewayIP }} + host_gateway_ip = {{ .Values.server.hostGatewayIP | quote }} + {{- end }} + {{- if .Values.server.enableUserNamespaces }} + enable_user_namespaces = true + {{- end }} + {{- if not .Values.server.disableTls }} + client_tls_secret_name = {{ .Values.server.tls.clientTlsSecretName | quote }} + {{- end }} + enable_loopback_service_http = {{ .Values.server.enableLoopbackServiceHttp }} + {{- $sans := list -}} + {{- if and .Values.certManager.enabled .Values.certManager.serverDnsNames }} + {{- $sans = .Values.certManager.serverDnsNames }} + {{- else if and .Values.pkiInitJob.enabled .Values.pkiInitJob.serverDnsNames }} + {{- $sans = .Values.pkiInitJob.serverDnsNames }} + {{- end }} + {{- if $sans }} + server_sans = [{{- range $i, $san := $sans }}{{ if $i }}, {{ end }}{{ $san | quote }}{{- end }}] + {{- end }} + + {{- if not .Values.server.disableTls }} + + [openshell.gateway.tls] + cert_path = "/etc/openshell-tls/server/tls.crt" + key_path = "/etc/openshell-tls/server/tls.key" + client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" + {{- if .Values.server.disableGatewayAuth }} + allow_unauthenticated = true + {{- end }} + {{- end }} + + {{- if .Values.server.oidc.issuer }} + + [openshell.gateway.oidc] + issuer = {{ .Values.server.oidc.issuer | quote }} + audience = {{ .Values.server.oidc.audience | quote }} + jwks_ttl_secs = {{ .Values.server.oidc.jwksTtl }} + {{- if .Values.server.oidc.rolesClaim }} + roles_claim = {{ .Values.server.oidc.rolesClaim | quote }} + {{- end }} + {{- if .Values.server.oidc.adminRole }} + admin_role = {{ .Values.server.oidc.adminRole | quote }} + {{- end }} + {{- if .Values.server.oidc.userRole }} + user_role = {{ .Values.server.oidc.userRole | quote }} + {{- end }} + {{- if .Values.server.oidc.scopesClaim }} + scopes_claim = {{ .Values.server.oidc.scopesClaim | quote }} + {{- end }} + {{- end }} + + [openshell.drivers.kubernetes] + supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 69140a70c..cbd70094a 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -47,113 +47,24 @@ spec: image: {{ include "openshell.image" . | quote }} imagePullPolicy: {{ .Values.image.pullPolicy }} args: - - --bind-address - - "0.0.0.0" - - --port - - {{ .Values.service.port | quote }} - - --health-port - - {{ .Values.service.healthPort | quote }} - {{- if .Values.service.metricsPort }} - - --metrics-port - - {{ .Values.service.metricsPort | quote }} - {{- end }} - - --log-level - - {{ .Values.server.logLevel }} + - --config + - /etc/openshell/gateway.toml - --db-url - {{ .Values.server.dbUrl | quote }} env: - - name: OPENSHELL_SANDBOX_NAMESPACE - value: {{ include "openshell.sandboxNamespace" . | quote }} - - name: OPENSHELL_SANDBOX_IMAGE - value: {{ .Values.server.sandboxImage | quote }} - {{- if .Values.server.sandboxImagePullPolicy }} - - name: OPENSHELL_SANDBOX_IMAGE_PULL_POLICY - value: {{ .Values.server.sandboxImagePullPolicy | quote }} - {{- end }} - - name: OPENSHELL_SUPERVISOR_IMAGE - value: {{ include "openshell.supervisorImage" . | quote }} - {{- if .Values.supervisor.image.pullPolicy }} - - name: OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY - value: {{ .Values.supervisor.image.pullPolicy | quote }} - {{- end }} - - name: OPENSHELL_SUPERVISOR_SIDELOAD_METHOD - value: {{ include "openshell.supervisorSideloadMethod" . | quote }} - - name: OPENSHELL_GRPC_ENDPOINT - value: {{ include "openshell.grpcEndpoint" . | quote }} - {{- if .Values.server.sshGatewayHost }} - - name: OPENSHELL_SSH_GATEWAY_HOST - value: {{ .Values.server.sshGatewayHost | quote }} - {{- end }} - {{- if .Values.server.sshGatewayPort }} - - name: OPENSHELL_SSH_GATEWAY_PORT - value: {{ .Values.server.sshGatewayPort | quote }} - {{- end }} - {{- if .Values.server.hostGatewayIP }} - - name: OPENSHELL_HOST_GATEWAY_IP - value: {{ .Values.server.hostGatewayIP | quote }} - {{- end }} - {{- if .Values.server.enableUserNamespaces }} - - name: OPENSHELL_ENABLE_USER_NAMESPACES - value: "true" - {{- end }} - {{- if and .Values.certManager.enabled .Values.certManager.serverDnsNames }} - - name: OPENSHELL_SERVER_SAN - value: {{ join "," .Values.certManager.serverDnsNames | quote }} - {{- else if and .Values.pkiInitJob.enabled .Values.pkiInitJob.serverDnsNames }} - - name: OPENSHELL_SERVER_SAN - value: {{ join "," .Values.pkiInitJob.serverDnsNames | quote }} - {{- end }} - - name: OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP - value: {{ .Values.server.enableLoopbackServiceHttp | quote }} + # Secrets are env-only. Everything else lives in the ConfigMap- + # backed TOML file mounted at /etc/openshell/gateway.toml. - name: OPENSHELL_SSH_HANDSHAKE_SECRET valueFrom: secretKeyRef: name: {{ .Values.server.sshHandshakeSecretName | quote }} key: secret - {{- if .Values.server.disableTls }} - - name: OPENSHELL_DISABLE_TLS - value: "true" - {{- else }} - - name: OPENSHELL_TLS_CERT - value: /etc/openshell-tls/server/tls.crt - - name: OPENSHELL_TLS_KEY - value: /etc/openshell-tls/server/tls.key - - name: OPENSHELL_TLS_CLIENT_CA - value: /etc/openshell-tls/client-ca/ca.crt - - name: OPENSHELL_CLIENT_TLS_SECRET_NAME - value: {{ .Values.server.tls.clientTlsSecretName | quote }} - {{- if .Values.server.disableGatewayAuth }} - - name: OPENSHELL_DISABLE_GATEWAY_AUTH - value: "true" - {{- end }} - {{- end }} - {{- if .Values.server.oidc.issuer }} - - name: OPENSHELL_OIDC_ISSUER - value: {{ .Values.server.oidc.issuer | quote }} - - name: OPENSHELL_OIDC_AUDIENCE - value: {{ .Values.server.oidc.audience | quote }} - - name: OPENSHELL_OIDC_JWKS_TTL - value: {{ .Values.server.oidc.jwksTtl | quote }} - {{- if .Values.server.oidc.rolesClaim }} - - name: OPENSHELL_OIDC_ROLES_CLAIM - value: {{ .Values.server.oidc.rolesClaim | quote }} - {{- end }} - {{- if .Values.server.oidc.adminRole }} - - name: OPENSHELL_OIDC_ADMIN_ROLE - value: {{ .Values.server.oidc.adminRole | quote }} - {{- end }} - {{- if .Values.server.oidc.userRole }} - - name: OPENSHELL_OIDC_USER_ROLE - value: {{ .Values.server.oidc.userRole | quote }} - {{- end }} - {{- if .Values.server.oidc.scopesClaim }} - - name: OPENSHELL_OIDC_SCOPES_CLAIM - value: {{ .Values.server.oidc.scopesClaim | quote }} - {{- end }} - {{- end }} volumeMounts: - name: openshell-data mountPath: /var/openshell + - name: gateway-config + mountPath: /etc/openshell + readOnly: true {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -200,6 +111,9 @@ spec: resources: {{- toYaml .Values.resources | nindent 12 }} volumes: + - name: gateway-config + configMap: + name: {{ include "openshell.fullname" . }}-config {{- if not .Values.server.disableTls }} - name: tls-cert secret: diff --git a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml index a128cd440..2d3461c6f 100644 --- a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml +++ b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml @@ -3,32 +3,28 @@ suite: sandboxNamespace defaulting templates: - - templates/statefulset.yaml + - templates/gateway-config.yaml - templates/networkpolicy.yaml release: name: openshell namespace: my-namespace tests: - - it: defaults OPENSHELL_SANDBOX_NAMESPACE to release namespace - template: templates/statefulset.yaml + - it: defaults sandbox_namespace to release namespace in the TOML config + template: templates/gateway-config.yaml asserts: - - contains: - path: spec.template.spec.containers[0].env - content: - name: OPENSHELL_SANDBOX_NAMESPACE - value: "my-namespace" + - matchRegex: + path: data["gateway.toml"] + pattern: 'sandbox_namespace\s*=\s*"my-namespace"' - it: uses explicit sandboxNamespace when set - template: templates/statefulset.yaml + template: templates/gateway-config.yaml set: server.sandboxNamespace: other-ns asserts: - - contains: - path: spec.template.spec.containers[0].env - content: - name: OPENSHELL_SANDBOX_NAMESPACE - value: "other-ns" + - matchRegex: + path: data["gateway.toml"] + pattern: 'sandbox_namespace\s*=\s*"other-ns"' - it: defaults NetworkPolicy namespace to release namespace template: templates/networkpolicy.yaml diff --git a/examples/gateway/gateway.example.toml b/examples/gateway/gateway.example.toml new file mode 100644 index 000000000..29acb07a0 --- /dev/null +++ b/examples/gateway/gateway.example.toml @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Example gateway configuration. See RFC 0003 for the schema and +# rfc/0003-gateway-configuration/README.md for the field-by-field reference. +# +# Source precedence (highest first): +# CLI flag > OPENSHELL_* env var > this file > built-in default +# +# Two values are deliberately env-only and MUST NOT appear in this file: +# - database_url (OPENSHELL_DB_URL / --db-url) +# - ssh_handshake_secret (OPENSHELL_SSH_HANDSHAKE_SECRET / --ssh-handshake-secret) + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" + +log_level = "info" + +# When empty the gateway auto-detects (Kubernetes -> Podman -> Docker). VM is +# never auto-detected and requires an explicit entry here. +compute_drivers = ["kubernetes"] + +sandbox_namespace = "openshell" +sandbox_ssh_port = 2222 +ssh_gateway_host = "127.0.0.1" +ssh_gateway_port = 8080 +ssh_handshake_skew_secs = 300 + +# Subject Alternative Names baked into the gateway server certificate. +# Wildcard DNS SANs (e.g. "*.dev.openshell.localhost") also enable sandbox +# service URLs under that domain. +server_sans = ["openshell", "*.dev.openshell.localhost"] +# Allow plaintext HTTP routing for loopback sandbox service URLs. +enable_loopback_service_http = true + +# Shared driver defaults — inherited into [openshell.drivers.] tables +# when the driver-specific table does not override them. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +image_pull_policy = "IfNotPresent" +client_tls_secret_name = "openshell-client-tls" + +# Gateway listener TLS (distinct from the per-driver guest_tls_*). +[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" +allow_unauthenticated = false + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" +jwks_ttl_secs = 3600 +roles_claim = "realm_access.roles" +admin_role = "openshell-admin" +user_role = "openshell-user" + +# ── Driver tables ────────────────────────────────────────────────────────── +# Each table inherits the shared defaults set above. Keys present here win +# over the inherited value. + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" + +[openshell.drivers.docker] +network_name = "openshell-docker" +# supervisor_image and guest_tls_* inherit from [openshell.gateway] + +[openshell.drivers.podman] +socket_path = "/run/podman/podman.sock" +network_name = "openshell" +stop_timeout_secs = 10 + +[openshell.drivers.vm] +state_dir = "/var/lib/openshell/vm" +driver_dir = "/usr/local/libexec/openshell" +vcpus = 2 +mem_mib = 2048 +krun_log_level = 1 diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 70ca09f7b..7b1ba0e0d 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -86,9 +86,14 @@ ssh_handshake_skew_secs = 300 ssh_session_ttl_secs = 86400 ssh_gateway_host = "127.0.0.1" ssh_gateway_port = 8080 -ssh_connect_path = "/connect/ssh" sandbox_ssh_port = 2222 +# Service routing — wildcard DNS SANs in `server_sans` also enable sandbox +# service URLs under that domain. `enable_loopback_service_http` toggles +# plaintext HTTP routing for loopback service URLs. +server_sans = ["openshell", "*.dev.openshell.localhost"] +enable_loopback_service_http = true + # ────────────────────────────────────────────────────────────────────────────── # TLS / mTLS — when omitted, the gateway listens plaintext (sets --disable-tls) # ────────────────────────────────────────────────────────────────────────────── From 3f959961513f6028cca2934b5a729ec125719182 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 18:23:01 -0700 Subject: [PATCH 2/7] docs(gateway): drop ssh_handshake_skew_secs and ssh_handshake_secret from examples Both fields are scheduled for removal. Remove the example values and the env-only note so the gateway.toml example and the architecture doc stop recommending settings that will not exist much longer. --- architecture/gateway.md | 19 +++++++------------ examples/gateway/gateway.example.toml | 6 ++---- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/architecture/gateway.md b/architecture/gateway.md index 64f4d9721..6b2db9a7b 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -207,23 +207,18 @@ When unset, the gateway behaves exactly as before — CLI flags and env vars drive every setting. See `examples/gateway/gateway.example.toml` for a worked example and RFC 0003 for the full schema. -Two secrets are env-only and rejected when present in the file: - -| Field | Source | -|------------------------|--------------------------------------------------| -| `database_url` | `OPENSHELL_DB_URL` / `--db-url` | -| `ssh_handshake_secret` | `OPENSHELL_SSH_HANDSHAKE_SECRET` / `--ssh-handshake-secret` | +`database_url` is env-only and rejected when present in the file +(`OPENSHELL_DB_URL` / `--db-url`). ### Driver inheritance `[openshell.gateway]` carries a small set of values (`default_image`, `supervisor_image`, `image_pull_policy`, `guest_tls_ca/cert/key`, -`client_tls_secret_name`, `host_gateway_ip`, `ssh_handshake_skew_secs`, -`enable_user_namespaces`) that are inherited into each driver's -`[openshell.drivers.]` table when the driver-specific table does not -override them. The allowlist is per-driver so a gateway-wide default cannot -land in a driver that does not understand it (e.g. -`client_tls_secret_name` is K8s-only). +`client_tls_secret_name`, `host_gateway_ip`, `enable_user_namespaces`) that +are inherited into each driver's `[openshell.drivers.]` table when +the driver-specific table does not override them. The allowlist is +per-driver so a gateway-wide default cannot land in a driver that does not +understand it (e.g. `client_tls_secret_name` is K8s-only). Driver-specific values that are not part of the inheritance allowlist (e.g. K8s `namespace`, Podman `socket_path`, VM `vcpus`) only come from diff --git a/examples/gateway/gateway.example.toml b/examples/gateway/gateway.example.toml index 29acb07a0..9e7064937 100644 --- a/examples/gateway/gateway.example.toml +++ b/examples/gateway/gateway.example.toml @@ -7,9 +7,8 @@ # Source precedence (highest first): # CLI flag > OPENSHELL_* env var > this file > built-in default # -# Two values are deliberately env-only and MUST NOT appear in this file: -# - database_url (OPENSHELL_DB_URL / --db-url) -# - ssh_handshake_secret (OPENSHELL_SSH_HANDSHAKE_SECRET / --ssh-handshake-secret) +# `database_url` is deliberately env-only and MUST NOT appear in this file +# (set OPENSHELL_DB_URL / --db-url instead). [openshell] version = 1 @@ -29,7 +28,6 @@ sandbox_namespace = "openshell" sandbox_ssh_port = 2222 ssh_gateway_host = "127.0.0.1" ssh_gateway_port = 8080 -ssh_handshake_skew_secs = 300 # Subject Alternative Names baked into the gateway server certificate. # Wildcard DNS SANs (e.g. "*.dev.openshell.localhost") also enable sandbox From 1a81fa28082f9a7bf827d2294d8f70bd37d219ce Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 18:25:48 -0700 Subject: [PATCH 3/7] docs(rfc): correct OPENSHELL_CONFIG to OPENSHELL_GATEWAY_CONFIG in RFC 0003 --- rfc/0003-gateway-configuration/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 7b1ba0e0d..763c30ad4 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -37,7 +37,7 @@ Three sources are merged at startup, in descending priority: CLI flags > OPENSHELL_* environment variables > TOML config file > built-in defaults ``` -The TOML file is optional. If neither `--config` nor `OPENSHELL_CONFIG` is set, the gateway behaves exactly as before. Any field present in the file is overridden by a CLI flag or matching environment variable. +The TOML file is optional. If neither `--config` nor `OPENSHELL_GATEWAY_CONFIG` is set, the gateway behaves exactly as before. Any field present in the file is overridden by a CLI flag or matching environment variable. ### Loading the file @@ -266,7 +266,7 @@ The chart owners can migrate one section at a time: `OPENSHELL_*` env vars and t No part of this RFC has shipped yet. The work breaks down as: 1. **Add a config-file loader to `openshell-server`** — define a `GatewayConfigFile` struct that mirrors the schema above, parse it with `serde` + `toml`, and merge it into `openshell_core::Config` plus the per-driver structs in `compute/`. -2. **Wire the merge into `cli.rs`** — add `--config` / `OPENSHELL_CONFIG`, gate each existing flag's "apply from file" path on clap `ValueSource::DefaultValue`, and run cross-field validation after the merge. +2. **Wire the merge into `cli.rs`** — add `--config` / `OPENSHELL_GATEWAY_CONFIG`, gate each existing flag's "apply from file" path on clap `ValueSource::DefaultValue`, and run cross-field validation after the merge. 3. **Per-driver deserialization** — give each driver crate (`openshell-driver-{kubernetes,docker,podman,vm}`) a `from_toml` (or `serde::Deserialize`) entry point so the gateway can hand each driver its own table. 4. **Test coverage** — file parsing, env-overrides-file, CLI-overrides-env, partial TLS error, port-collision error, unknown-field rejection, missing driver table fallback. 5. **Helm chart migration** — add `gateway.config` value tree, render the `ConfigMap`, mount it, switch the gateway container to `--config`. Keep the `OPENSHELL_*` env names available as opt-in overrides for secrets. From e67140537aaf59faa2bd12e34663aa4630f3030d Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 19:02:05 -0700 Subject: [PATCH 4/7] docs(gateway): add per-driver TOML example configurations Adds focused single-driver examples next to the comprehensive gateway.example.toml: kubernetes, docker, podman, and microvm. Each one demonstrates the realistic settings for that driver plus how shared [openshell.gateway] defaults inherit into the driver table. A new unit test (`checked_in_examples_parse`) loads every example through the config_file loader so schema drift fails CI rather than silently shipping a broken example. --- crates/openshell-server/src/config_file.rs | 21 +++++++++++++ examples/gateway/docker.example.toml | 27 ++++++++++++++++ examples/gateway/kubernetes.example.toml | 36 ++++++++++++++++++++++ examples/gateway/microvm.example.toml | 31 +++++++++++++++++++ examples/gateway/podman.example.toml | 30 ++++++++++++++++++ 5 files changed, 145 insertions(+) create mode 100644 examples/gateway/docker.example.toml create mode 100644 examples/gateway/kubernetes.example.toml create mode 100644 examples/gateway/microvm.example.toml create mode 100644 examples/gateway/podman.example.toml diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index 31348e386..b907906a7 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -515,4 +515,25 @@ version = 2 .expect_err("missing file must be io error"); assert!(matches!(err, ConfigFileError::Io { .. })); } + + /// Lock in that every checked-in example under `examples/gateway/` + /// round-trips through the loader. Catches schema drift the moment a + /// field is renamed or a doc snippet falls out of date. + #[test] + fn checked_in_examples_parse() { + let repo_root = Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(Path::parent) + .expect("crate manifest has grandparent"); + for name in [ + "gateway.example.toml", + "kubernetes.example.toml", + "docker.example.toml", + "podman.example.toml", + "microvm.example.toml", + ] { + let path = repo_root.join("examples/gateway").join(name); + load(&path).unwrap_or_else(|e| panic!("{name} failed to parse: {e}")); + } + } } diff --git a/examples/gateway/docker.example.toml b/examples/gateway/docker.example.toml new file mode 100644 index 000000000..27238fe0a --- /dev/null +++ b/examples/gateway/docker.example.toml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Docker-driver example. Sandboxes run as containers on a local bridge +# network. The supervisor binary is bind-mounted from the host (no in-cluster +# image pull required); guest mTLS material is supplied as host paths. +# +# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["docker"] + +# Shared defaults inherited into [openshell.drivers.docker]. +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.docker] +network_name = "openshell-docker" +# Skip the image-pull-and-extract step by pointing at a locally built binary. +supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" diff --git a/examples/gateway/kubernetes.example.toml b/examples/gateway/kubernetes.example.toml new file mode 100644 index 000000000..9a5b77605 --- /dev/null +++ b/examples/gateway/kubernetes.example.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Kubernetes-driver example. The gateway runs as a Pod and creates sandbox +# Pods in another namespace. mTLS material for sandboxes is delivered via a +# Kubernetes Secret rather than host-side file paths. +# +# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" +log_level = "info" +compute_drivers = ["kubernetes"] + +# Shared defaults inherited into [openshell.drivers.kubernetes]. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +image_pull_policy = "IfNotPresent" +client_tls_secret_name = "openshell-client-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell-tls/server/tls.crt" +key_path = "/etc/openshell-tls/server/tls.key" +client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +# Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" +# on older clusters or where the ImageVolume feature gate is off. +supervisor_sideload_method = "image-volume" diff --git a/examples/gateway/microvm.example.toml b/examples/gateway/microvm.example.toml new file mode 100644 index 000000000..60bbd4356 --- /dev/null +++ b/examples/gateway/microvm.example.toml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# microVM-driver example. Each sandbox runs inside its own libkrun microVM +# managed by the standalone `openshell-driver-vm` subprocess. Use this driver +# when you want stronger isolation than container namespaces alone. +# +# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +# VM is never auto-detected; an explicit entry here is required. +compute_drivers = ["vm"] + +# Shared defaults inherited into [openshell.drivers.vm]. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +guest_tls_ca = "/var/lib/openshell/guest-tls/ca.pem" +guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" +guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" + +[openshell.drivers.vm] +state_dir = "/var/lib/openshell/vm" +# Where the gateway looks for the openshell-driver-vm subprocess binary. +driver_dir = "/usr/local/libexec/openshell" +vcpus = 2 +mem_mib = 2048 +krun_log_level = 1 diff --git a/examples/gateway/podman.example.toml b/examples/gateway/podman.example.toml new file mode 100644 index 000000000..11d241d6e --- /dev/null +++ b/examples/gateway/podman.example.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Podman-driver example. Sandboxes run as Podman containers on a user-mode +# bridge network. The supervisor image is mounted read-only via Podman's +# `type=image` mount; guest mTLS material is supplied as host paths. +# +# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["podman"] + +# Shared defaults inherited into [openshell.drivers.podman]. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +image_pull_policy = "missing" # Podman pull policy: always | missing | never | newer +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.podman] +# Rootless socket path. For root Podman use /run/podman/podman.sock. +socket_path = "/run/user/1000/podman/podman.sock" +network_name = "openshell" +stop_timeout_secs = 10 From 9f19b024062438249711fa5a984b7d3fcc34b72a Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 12 May 2026 15:36:13 -0700 Subject: [PATCH 5/7] refactor(gateway): drop image_pull_policy from shared inheritance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kubernetes and Podman use mutually-incompatible vocabularies for the same TOML key: - Kubernetes: `Always | IfNotPresent | Never` (free-form string passed verbatim to the K8s API). - Podman: `always | missing | never | newer` (strict lowercase enum deserialised into `ImagePullPolicy`). No value means the same thing in both drivers. Sharing the key at `[openshell.gateway]` scope and inheriting it into every active driver's table meant any value safe for one driver was either wrong or silently dropped for the other (`IfNotPresent` → `ImagePullPolicy::Missing` after `.unwrap_or_default()`). Operators run one driver per gateway, so the "shared default" never pays for itself. Make `image_pull_policy` driver-local: - Remove the field from `GatewayFileSection` and from `inheritable_keys()` for both Kubernetes and Podman. - Drop the file→`RunArgs` merge for the gateway-scope key. - Stop unconditionally clobbering the driver value with `config.sandbox_image_pull_policy` in the runtime wiring — only apply the CLI/env override when it was set (and, for Podman, only when it parses into the lowercase enum). - Move the key under `[openshell.drivers.kubernetes]` and `[openshell.drivers.podman]` in every example, the RFC, the architecture doc, and the Helm-rendered gateway ConfigMap. The supervisor pull policy follows the same shape: it is K8s-only and moves into `[openshell.drivers.kubernetes]` alongside `image_pull_policy` in the Helm template. --- architecture/gateway.md | 18 ++++++++++------ crates/openshell-server/src/cli.rs | 6 ------ crates/openshell-server/src/config_file.rs | 7 ------- crates/openshell-server/src/lib.rs | 21 ++++++++++++++++--- .../openshell/templates/gateway-config.yaml | 12 +++++------ examples/gateway/gateway.example.toml | 12 ++++++++--- examples/gateway/kubernetes.example.toml | 2 +- examples/gateway/podman.example.toml | 2 +- rfc/0003-gateway-configuration/README.md | 2 +- 9 files changed, 48 insertions(+), 34 deletions(-) diff --git a/architecture/gateway.md b/architecture/gateway.md index 6b2db9a7b..dc195bb1f 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -213,12 +213,18 @@ worked example and RFC 0003 for the full schema. ### Driver inheritance `[openshell.gateway]` carries a small set of values (`default_image`, -`supervisor_image`, `image_pull_policy`, `guest_tls_ca/cert/key`, -`client_tls_secret_name`, `host_gateway_ip`, `enable_user_namespaces`) that -are inherited into each driver's `[openshell.drivers.]` table when -the driver-specific table does not override them. The allowlist is -per-driver so a gateway-wide default cannot land in a driver that does not -understand it (e.g. `client_tls_secret_name` is K8s-only). +`supervisor_image`, `guest_tls_ca/cert/key`, `client_tls_secret_name`, +`host_gateway_ip`, `enable_user_namespaces`) that are inherited into each +driver's `[openshell.drivers.]` table when the driver-specific table +does not override them. The allowlist is per-driver so a gateway-wide +default cannot land in a driver that does not understand it (e.g. +`client_tls_secret_name` is K8s-only). + +`image_pull_policy` is intentionally **not** inheritable: Kubernetes uses +`Always | IfNotPresent | Never` (passed verbatim to the K8s API) while +Podman uses the lowercase enum `always | missing | never | newer`. No +value means the same thing in both, so the key lives only under each +driver's own table. Driver-specific values that are not part of the inheritance allowlist (e.g. K8s `namespace`, Podman `socket_path`, VM `vcpus`) only come from diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index e2cfeba92..ba349e321 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -580,12 +580,6 @@ fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: { args.sandbox_image = Some(image.clone()); } - if let Some(policy) = &file.image_pull_policy - && args.sandbox_image_pull_policy.is_none() - && arg_defaulted(matches, "sandbox_image_pull_policy") - { - args.sandbox_image_pull_policy = Some(policy.clone()); - } if let Some(secret) = &file.client_tls_secret_name && args.client_tls_secret_name.is_none() && arg_defaulted(matches, "client_tls_secret_name") diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index b907906a7..cb211ec70 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -116,8 +116,6 @@ pub struct GatewayFileSection { #[serde(default)] pub supervisor_image: Option, #[serde(default)] - pub image_pull_policy: Option, - #[serde(default)] pub client_tls_secret_name: Option, #[serde(default)] pub host_gateway_ip: Option, @@ -254,7 +252,6 @@ fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { ComputeDriverKind::Kubernetes => &[ "default_image", "supervisor_image", - "image_pull_policy", "client_tls_secret_name", "host_gateway_ip", "ssh_handshake_skew_secs", @@ -269,7 +266,6 @@ fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { ComputeDriverKind::Podman => &[ "default_image", "supervisor_image", - "image_pull_policy", "guest_tls_ca", "guest_tls_cert", "guest_tls_key", @@ -288,7 +284,6 @@ fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option g.default_image.as_deref().map(string_value), "supervisor_image" => g.supervisor_image.as_deref().map(string_value), - "image_pull_policy" => g.image_pull_policy.as_deref().map(string_value), "client_tls_secret_name" => g.client_tls_secret_name.as_deref().map(string_value), "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), "ssh_handshake_skew_secs" => g.ssh_handshake_skew_secs.and_then(skew_value), @@ -349,7 +344,6 @@ compute_drivers = ["kubernetes"] sandbox_namespace = "agents" default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -image_pull_policy = "IfNotPresent" client_tls_secret_name = "openshell-sandbox-tls" [openshell.gateway.tls] @@ -442,7 +436,6 @@ version = 2 let gateway = GatewayFileSection { default_image: Some("ghcr.io/nvidia/openshell/sandbox:0.9".to_string()), supervisor_image: Some("ghcr.io/nvidia/openshell/supervisor:0.9".to_string()), - image_pull_policy: Some("IfNotPresent".to_string()), ..Default::default() }; let raw = toml::toml! { diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 49ae1ee55..ebfd25691 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -588,8 +588,14 @@ async fn build_compute_runtime( // file + CLI/env at startup. k8s.namespace.clone_from(&config.sandbox_namespace); k8s.default_image.clone_from(&config.sandbox_image); - k8s.image_pull_policy - .clone_from(&config.sandbox_image_pull_policy); + // Only let the gateway-wide CLI/env value overwrite the per-driver + // file value when it was actually set — otherwise the empty CLI + // default would silently clobber `image_pull_policy` configured + // under `[openshell.drivers.kubernetes]`. + if !config.sandbox_image_pull_policy.is_empty() { + k8s.image_pull_policy + .clone_from(&config.sandbox_image_pull_policy); + } k8s.grpc_endpoint.clone_from(&config.grpc_endpoint); k8s.ssh_socket_path .clone_from(&config.sandbox_ssh_socket_path); @@ -677,7 +683,16 @@ async fn build_compute_runtime( // Shared fields are sourced from Config (which already merged // file + CLI/env at startup). podman.default_image.clone_from(&config.sandbox_image); - podman.image_pull_policy = config.sandbox_image_pull_policy.parse().unwrap_or_default(); + // The CLI/env `image_pull_policy` is K8s-shaped + // (e.g. `IfNotPresent`) and won't parse into Podman's lowercase + // enum. Only apply it when the operator set a Podman-shaped value + // explicitly; otherwise keep whatever `[openshell.drivers.podman]` + // (or the struct default) provided. + if !config.sandbox_image_pull_policy.is_empty() + && let Ok(policy) = config.sandbox_image_pull_policy.parse() + { + podman.image_pull_policy = policy; + } podman.grpc_endpoint.clone_from(&config.grpc_endpoint); podman.gateway_port = config.bind_address.port(); podman diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 3ebbac582..3cc5a5f78 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -33,12 +33,6 @@ data: sandbox_namespace = {{ include "openshell.sandboxNamespace" . | quote }} default_image = {{ .Values.server.sandboxImage | quote }} supervisor_image = {{ include "openshell.supervisorImage" . | quote }} - {{- if .Values.server.sandboxImagePullPolicy }} - image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} - {{- end }} - {{- if .Values.supervisor.image.pullPolicy }} - supervisor_image_pull_policy = {{ .Values.supervisor.image.pullPolicy | quote }} - {{- end }} grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} {{- if .Values.server.sshGatewayHost }} ssh_gateway_host = {{ .Values.server.sshGatewayHost | quote }} @@ -99,3 +93,9 @@ data: [openshell.drivers.kubernetes] supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} + {{- if .Values.server.sandboxImagePullPolicy }} + image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} + {{- end }} + {{- if .Values.supervisor.image.pullPolicy }} + supervisor_image_pull_policy = {{ .Values.supervisor.image.pullPolicy | quote }} + {{- end }} diff --git a/examples/gateway/gateway.example.toml b/examples/gateway/gateway.example.toml index 9e7064937..120a9721e 100644 --- a/examples/gateway/gateway.example.toml +++ b/examples/gateway/gateway.example.toml @@ -40,8 +40,12 @@ enable_loopback_service_http = true # when the driver-specific table does not override them. default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -image_pull_policy = "IfNotPresent" client_tls_secret_name = "openshell-client-tls" +# `image_pull_policy` is intentionally NOT a shared key: Kubernetes uses +# `Always | IfNotPresent | Never` (passed as a string to the K8s API) while +# Podman uses the lowercase `always | missing | never | newer` enum. There is +# no value that means the same thing in both, so set it under the relevant +# `[openshell.drivers.]` table instead. # Gateway listener TLS (distinct from the per-driver guest_tls_*). [openshell.gateway.tls] @@ -63,8 +67,9 @@ user_role = "openshell-user" # over the inherited value. [openshell.drivers.kubernetes] -namespace = "agents" -grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +image_pull_policy = "IfNotPresent" # K8s vocabulary: Always | IfNotPresent | Never [openshell.drivers.docker] network_name = "openshell-docker" @@ -74,6 +79,7 @@ network_name = "openshell-docker" socket_path = "/run/podman/podman.sock" network_name = "openshell" stop_timeout_secs = 10 +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer [openshell.drivers.vm] state_dir = "/var/lib/openshell/vm" diff --git a/examples/gateway/kubernetes.example.toml b/examples/gateway/kubernetes.example.toml index 9a5b77605..6dd78e5e9 100644 --- a/examples/gateway/kubernetes.example.toml +++ b/examples/gateway/kubernetes.example.toml @@ -20,7 +20,6 @@ compute_drivers = ["kubernetes"] # Shared defaults inherited into [openshell.drivers.kubernetes]. default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -image_pull_policy = "IfNotPresent" client_tls_secret_name = "openshell-client-tls" [openshell.gateway.tls] @@ -31,6 +30,7 @@ client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" [openshell.drivers.kubernetes] namespace = "agents" grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +image_pull_policy = "IfNotPresent" # Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" # on older clusters or where the ImageVolume feature gate is off. supervisor_sideload_method = "image-volume" diff --git a/examples/gateway/podman.example.toml b/examples/gateway/podman.example.toml index 11d241d6e..3d963c176 100644 --- a/examples/gateway/podman.example.toml +++ b/examples/gateway/podman.example.toml @@ -18,7 +18,6 @@ compute_drivers = ["podman"] # Shared defaults inherited into [openshell.drivers.podman]. default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -image_pull_policy = "missing" # Podman pull policy: always | missing | never | newer guest_tls_ca = "/etc/openshell/certs/ca.pem" guest_tls_cert = "/etc/openshell/certs/client.pem" guest_tls_key = "/etc/openshell/certs/client-key.pem" @@ -28,3 +27,4 @@ guest_tls_key = "/etc/openshell/certs/client-key.pem" socket_path = "/run/user/1000/podman/podman.sock" network_name = "openshell" stop_timeout_secs = 10 +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 763c30ad4..488e954c0 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -142,7 +142,7 @@ guest_tls_key = "/etc/openshell/certs/client-key.pem" [openshell.drivers.podman] socket_path = "/run/podman/podman.sock" default_image = "ghcr.io/nvidia/openshell/sandbox:latest" -image_pull_policy = "IfNotPresent" +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" network_name = "openshell" stop_timeout_secs = 10 From 03617e0d23a03f5ab33fb806337436d2813d137f Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Wed, 13 May 2026 17:39:04 -0700 Subject: [PATCH 6/7] fix(gateway): address review feedback on TOML configuration Resolves the P1 and P2 issues raised in PR #1317: - Helm gateway ConfigMap moves `grpc_endpoint` under `[openshell.drivers.kubernetes]` so the default install no longer fails the gateway's `deny_unknown_fields` schema check. - `kubernetes_config_from_file` and `podman_config_from_file` only let the gateway-wide CLI/env `grpc_endpoint` overwrite the driver-table value when it was actually supplied, preserving file-only configs. - Kubernetes driver default `image_pull_policy` is now empty (was Podman vocabulary "missing"), so default deployments let the Kubernetes API apply its own policy instead of being rejected. - New `disable_tls` gateway field plumbs `.Values.server.disableTls` through the TOML ConfigMap instead of relying on env vars dropped from the StatefulSet. - StatefulSet pod template now carries a `checksum/gateway-config` annotation so `helm upgrade` rolls pods when the ConfigMap changes. - Auxiliary listener resolution preserves the full `SocketAddr` from `health_bind_address` / `metrics_bind_address`, so a loopback-pinned health port is not silently relocated onto the public bind address. - `ssh_session_ttl_secs` from the file is now applied to `Config` (it was previously accepted by the loader but never read). New regression coverage: cli-level merge tests for the new fields plus helm-unittest assertions for the ConfigMap shape, checksum annotation, and `disable_tls` rendering. --- .../openshell-driver-kubernetes/src/config.rs | 9 +- crates/openshell-server/src/cli.rs | 191 ++++++++++++++++-- crates/openshell-server/src/config_file.rs | 6 + crates/openshell-server/src/lib.rs | 12 +- .../openshell/templates/gateway-config.yaml | 8 +- .../helm/openshell/templates/statefulset.yaml | 9 +- .../openshell/tests/gateway_config_test.yaml | 68 +++++++ rfc/0003-gateway-configuration/README.md | 4 + 8 files changed, 277 insertions(+), 30 deletions(-) create mode 100644 deploy/helm/openshell/tests/gateway_config_test.yaml diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index 2eab5a4c7..a6d133a6d 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -2,8 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use openshell_core::config::{ - DEFAULT_IMAGE_PULL_POLICY, DEFAULT_K8S_NAMESPACE, DEFAULT_SSH_HANDSHAKE_SKEW_SECS, - DEFAULT_SUPERVISOR_IMAGE, + DEFAULT_K8S_NAMESPACE, DEFAULT_SSH_HANDSHAKE_SKEW_SECS, DEFAULT_SUPERVISOR_IMAGE, }; use serde::{Deserialize, Serialize}; @@ -73,7 +72,11 @@ impl Default for KubernetesComputeConfig { Self { namespace: DEFAULT_K8S_NAMESPACE.to_string(), default_image: String::new(), - image_pull_policy: DEFAULT_IMAGE_PULL_POLICY.to_string(), + // Default empty so the gateway omits `imagePullPolicy` from pod + // specs and Kubernetes applies its own default (Always for `latest`, + // IfNotPresent otherwise). `DEFAULT_IMAGE_PULL_POLICY` ("missing") + // is Podman vocabulary and is not a valid Kubernetes value. + image_pull_policy: String::new(), supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(), supervisor_image_pull_policy: String::new(), supervisor_sideload_method: SupervisorSideloadMethod::default(), diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index ba349e321..8d65a40b9 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -401,32 +401,54 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { .with_bind_address(bind) .with_log_level(&args.log_level); - if args.health_port != 0 { - if args.port == args.health_port { + // Listener addresses for the health and metrics endpoints. The file may + // pin a different interface than the main listener (e.g. health on + // 127.0.0.1 while gRPC binds 0.0.0.0); the full `SocketAddr` from the + // file is preserved unless CLI/env supplied an explicit `--health-port` / + // `--metrics-port`, in which case the port overrides the file value + // while the IP defaults to `args.bind_address`. + let file_gateway = file.as_ref().map(|f| &f.openshell.gateway); + let health_bind = resolve_aux_listener( + args.bind_address, + args.health_port, + &matches, + "health_port", + || file_gateway.and_then(|g| g.health_bind_address), + ); + let metrics_bind = resolve_aux_listener( + args.bind_address, + args.metrics_port, + &matches, + "metrics_port", + || file_gateway.and_then(|g| g.metrics_bind_address), + ); + + if let Some(addr) = health_bind { + if args.port == addr.port() { return Err(miette::miette!( "--port and --health-port must be different (both set to {})", args.port )); } - let health_bind = SocketAddr::new(args.bind_address, args.health_port); - config = config.with_health_bind_address(health_bind); + config = config.with_health_bind_address(addr); } - if args.metrics_port != 0 { - if args.port == args.metrics_port { + if let Some(addr) = metrics_bind { + if args.port == addr.port() { return Err(miette::miette!( "--port and --metrics-port must be different (both set to {})", args.port )); } - if args.health_port != 0 && args.health_port == args.metrics_port { + if let Some(health) = health_bind + && health.port() == addr.port() + { return Err(miette::miette!( "--health-port and --metrics-port must be different (both set to {})", - args.health_port + health.port() )); } - let metrics_bind = SocketAddr::new(args.bind_address, args.metrics_port); - config = config.with_metrics_bind_address(metrics_bind); + config = config.with_metrics_bind_address(addr); } config = config @@ -440,6 +462,13 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { .with_server_sans(args.server_sans.clone()) .with_loopback_service_http(args.enable_loopback_service_http); + if let Some(ttl) = file + .as_ref() + .and_then(|f| f.openshell.gateway.ssh_session_ttl_secs) + { + config = config.with_ssh_session_ttl_secs(ttl); + } + if let Some(image) = args.sandbox_image.clone() { config = config.with_sandbox_image(image); } @@ -514,6 +543,39 @@ fn arg_defaulted(matches: &ArgMatches, id: &str) -> bool { ) } +/// Resolve the bind address for an auxiliary listener (health / metrics). +/// +/// The precedence is: +/// 1. CLI flag or `OPENSHELL_*` env var explicitly set on the corresponding +/// port argument → `bind_address:port` (port from CLI, IP from the main +/// listener interface). +/// 2. Full `SocketAddr` from `[openshell.gateway].{health,metrics}_bind_address` +/// → used as-is (this is how operators pin a loopback-only health port +/// on a gateway whose gRPC listener is bound publicly). +/// 3. Otherwise the listener is disabled (returns `None`). +fn resolve_aux_listener( + bind_ip: IpAddr, + port_arg: u16, + matches: &ArgMatches, + port_id: &str, + file_addr: impl FnOnce() -> Option, +) -> Option { + if !arg_defaulted(matches, port_id) { + if port_arg == 0 { + return None; + } + return Some(SocketAddr::new(bind_ip, port_arg)); + } + if let Some(addr) = file_addr() { + return Some(addr); + } + if port_arg == 0 { + None + } else { + Some(SocketAddr::new(bind_ip, port_arg)) + } +} + /// Apply gateway-wide values from `[openshell.gateway]` onto `RunArgs` for /// every argument that is still sourced from clap's built-in default. /// @@ -529,16 +591,11 @@ fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: args.port = addr.port(); } } - if let Some(addr) = file.health_bind_address - && arg_defaulted(matches, "health_port") - { - args.health_port = addr.port(); - } - if let Some(addr) = file.metrics_bind_address - && arg_defaulted(matches, "metrics_port") - { - args.metrics_port = addr.port(); - } + // Note: file's full health_bind_address / metrics_bind_address are + // consumed in `run_from_args`'s listener-resolution block so the IP + // half of the SocketAddr is preserved. Copying only the port here + // would silently relocate a loopback-intended listener onto the + // public bind address. if let Some(level) = &file.log_level && arg_defaulted(matches, "log_level") { @@ -608,6 +665,11 @@ fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: { args.enable_loopback_service_http = enabled; } + if let Some(disabled) = file.disable_tls + && arg_defaulted(matches, "disable_tls") + { + args.disable_tls = disabled; + } // TLS gateway listener fields if let Some(tls) = &file.tls { if args.tls_cert.is_none() && arg_defaulted(matches, "tls_cert") { @@ -1087,6 +1149,95 @@ audience = "openshell-cli" assert_eq!(args.oidc_audience, "openshell-cli"); } + #[test] + fn aux_listener_preserves_file_ip_against_public_bind() { + use std::net::SocketAddr; + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_HEALTH_PORT"); + + let (_args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file_addr: SocketAddr = "127.0.0.1:8081".parse().unwrap(); + let resolved = super::resolve_aux_listener( + IpAddr::V4(Ipv4Addr::UNSPECIFIED), + 0, + &matches, + "health_port", + || Some(file_addr), + ); + assert_eq!( + resolved, + Some(file_addr), + "TOML health_bind_address 127.0.0.1:8081 must not be relocated to 0.0.0.0:8081" + ); + } + + #[test] + fn aux_listener_cli_port_overrides_file_addr() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_HEALTH_PORT"); + + let (_args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--health-port", + "9999", + ]); + let file_addr: std::net::SocketAddr = "127.0.0.1:8081".parse().unwrap(); + let resolved = super::resolve_aux_listener( + IpAddr::V4(Ipv4Addr::UNSPECIFIED), + 9999, + &matches, + "health_port", + || Some(file_addr), + ); + assert_eq!( + resolved, + Some("0.0.0.0:9999".parse().unwrap()), + "CLI flag must win over file value" + ); + } + + #[test] + fn file_disable_tls_applies() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_DISABLE_TLS"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r" +[openshell.gateway] +disable_tls = true +", + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert!(args.disable_tls); + } + + #[test] + fn file_ssh_session_ttl_secs_is_parsed() { + // The loader must accept and surface the documented key. The actual + // wiring into `Config` happens in `run_from_args` against the parsed + // file (not via `merge_file_into_args`, since there is no matching + // `RunArgs` field), so this test pins the schema half. + let file = config_file_from_toml( + r" +[openshell.gateway] +ssh_session_ttl_secs = 1234 +", + ); + assert_eq!(file.openshell.gateway.ssh_session_ttl_secs, Some(1234)); + } + #[test] fn file_populates_service_routing_fields() { let _lock = ENV_LOCK diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index cb211ec70..eb088f669 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -128,6 +128,12 @@ pub struct GatewayFileSection { #[serde(default)] pub guest_tls_key: Option, + // ── TLS toggle ─────────────────────────────────────────────────────── + /// When `true`, the gateway listens on plaintext HTTP and ignores any + /// `[openshell.gateway.tls]` table. Mirrors `--disable-tls`. + #[serde(default)] + pub disable_tls: Option, + // ── Nested tables ──────────────────────────────────────────────────── #[serde(default)] pub tls: Option, diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index ebfd25691..e9f02da24 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -596,7 +596,13 @@ async fn build_compute_runtime( k8s.image_pull_policy .clone_from(&config.sandbox_image_pull_policy); } - k8s.grpc_endpoint.clone_from(&config.grpc_endpoint); + // Same rationale as `image_pull_policy`: only let the gateway-wide + // CLI/env value win when it was actually set, otherwise the empty + // CLI default would clobber `grpc_endpoint` from + // `[openshell.drivers.kubernetes]`. + if !config.grpc_endpoint.is_empty() { + k8s.grpc_endpoint.clone_from(&config.grpc_endpoint); + } k8s.ssh_socket_path .clone_from(&config.sandbox_ssh_socket_path); k8s.ssh_handshake_secret @@ -693,7 +699,9 @@ async fn build_compute_runtime( { podman.image_pull_policy = policy; } - podman.grpc_endpoint.clone_from(&config.grpc_endpoint); + if !config.grpc_endpoint.is_empty() { + podman.grpc_endpoint.clone_from(&config.grpc_endpoint); + } podman.gateway_port = config.bind_address.port(); podman .sandbox_ssh_socket_path diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 3cc5a5f78..5937b8a7d 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -33,7 +33,6 @@ data: sandbox_namespace = {{ include "openshell.sandboxNamespace" . | quote }} default_image = {{ .Values.server.sandboxImage | quote }} supervisor_image = {{ include "openshell.supervisorImage" . | quote }} - grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} {{- if .Values.server.sshGatewayHost }} ssh_gateway_host = {{ .Values.server.sshGatewayHost | quote }} {{- end }} @@ -46,7 +45,9 @@ data: {{- if .Values.server.enableUserNamespaces }} enable_user_namespaces = true {{- end }} - {{- if not .Values.server.disableTls }} + {{- if .Values.server.disableTls }} + disable_tls = true + {{- else }} client_tls_secret_name = {{ .Values.server.tls.clientTlsSecretName | quote }} {{- end }} enable_loopback_service_http = {{ .Values.server.enableLoopbackServiceHttp }} @@ -92,7 +93,8 @@ data: {{- end }} [openshell.drivers.kubernetes] - supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} + grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} + supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} {{- if .Values.server.sandboxImagePullPolicy }} image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} {{- end }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index cbd70094a..ccb1775c7 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -15,10 +15,15 @@ spec: {{- include "openshell.selectorLabels" . | nindent 6 }} template: metadata: - {{- with .Values.podAnnotations }} annotations: + # Roll the StatefulSet when the rendered gateway TOML changes — the + # gateway only reads /etc/openshell/gateway.toml at startup, so + # without this annotation a `helm upgrade` that only mutates the + # ConfigMap would leave pods running with stale config. + checksum/gateway-config: {{ include (print $.Template.BasePath "/gateway-config.yaml") . | sha256sum }} + {{- with .Values.podAnnotations }} {{- toYaml . | nindent 8 }} - {{- end }} + {{- end }} labels: {{- include "openshell.labels" . | nindent 8 }} {{- with .Values.podLabels }} diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml new file mode 100644 index 000000000..2d464b8e6 --- /dev/null +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +suite: gateway TOML config shape +templates: + - templates/gateway-config.yaml + - templates/statefulset.yaml +release: + name: openshell + namespace: my-namespace + +tests: + # Regression for Drew's P2: a ConfigMap-only mutation in `helm upgrade` + # must roll the StatefulSet, otherwise pods keep running with stale config. + - it: annotates the StatefulSet pod template with a ConfigMap checksum + template: templates/statefulset.yaml + asserts: + - exists: + path: spec.template.metadata.annotations["checksum/gateway-config"] + + + # Regression for the P1 bug Drew flagged: grpc_endpoint MUST live in the + # Kubernetes driver table, not in [openshell.gateway]. The gateway-side + # schema has `deny_unknown_fields` and no `grpc_endpoint` field, so writing + # it at gateway scope makes `config_file::load` reject the default install. + - it: renders grpc_endpoint under [openshell.drivers.kubernetes], not [openshell.gateway] + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?grpc_endpoint' + - notMatchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.gateway\][^\[]*?grpc_endpoint' + + - it: omits server_sans when no DNS SANs are configured + template: templates/gateway-config.yaml + asserts: + - notMatchRegex: + path: data["gateway.toml"] + pattern: 'server_sans\s*=' + + - it: emits disable_tls=true and omits the [openshell.gateway.tls] section when disableTls is set + set: + server.disableTls: true + certManager.enabled: false + pkiInitJob.enabled: false + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: 'disable_tls\s*=\s*true' + - notMatchRegex: + path: data["gateway.toml"] + pattern: '\[openshell\.gateway\.tls\]' + + - it: renders server_sans from certManager.serverDnsNames + set: + certManager.enabled: true + certManager.serverDnsNames: + - openshell + - "*.dev.openshell.localhost" + pkiInitJob.enabled: false + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: 'server_sans\s*=\s*\["openshell", "\*\.dev\.openshell\.localhost"\]' diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 488e954c0..a926fc9d8 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -97,6 +97,10 @@ enable_loopback_service_http = true # ────────────────────────────────────────────────────────────────────────────── # TLS / mTLS — when omitted, the gateway listens plaintext (sets --disable-tls) # ────────────────────────────────────────────────────────────────────────────── +# Mirrors --disable-tls / OPENSHELL_DISABLE_TLS. When true, the gateway +# ignores the [openshell.gateway.tls] table below. +disable_tls = false + [openshell.gateway.tls] cert_path = "/etc/openshell/certs/gateway.pem" key_path = "/etc/openshell/certs/gateway-key.pem" From abe8b74ef0ed67fec45a3e338091f9ba94e94a71 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Wed, 13 May 2026 17:53:07 -0700 Subject: [PATCH 7/7] docs(gateway): consolidate gateway TOML examples into docs reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the per-driver example files under examples/gateway/ with a single published reference page at docs/reference/gateway-config.mdx covering source precedence, layout, the full example, and the four per-driver examples (Kubernetes, Docker, Podman, microVM). Drew flagged during PR #1317 review that the examples belong with the user-facing docs rather than in a sibling examples/ directory. The cross-references in architecture/gateway.md and RFC 0003 are updated to point at the new docs page; the round-trip test in config_file.rs is removed (schema coverage stays on the inline parses_full_example test and per-field merge tests — doc-snippet drift belongs in a separate docs-lint, not in a cross-tree Rust unit test). --- architecture/gateway.md | 4 +- crates/openshell-server/src/config_file.rs | 21 -- docs/reference/gateway-config.mdx | 212 +++++++++++++++++++++ docs/reference/sandbox-compute-drivers.mdx | 2 + examples/gateway/docker.example.toml | 27 --- examples/gateway/gateway.example.toml | 89 --------- examples/gateway/kubernetes.example.toml | 36 ---- examples/gateway/microvm.example.toml | 31 --- examples/gateway/podman.example.toml | 30 --- rfc/0003-gateway-configuration/README.md | 2 +- 10 files changed, 217 insertions(+), 237 deletions(-) create mode 100644 docs/reference/gateway-config.mdx delete mode 100644 examples/gateway/docker.example.toml delete mode 100644 examples/gateway/gateway.example.toml delete mode 100644 examples/gateway/kubernetes.example.toml delete mode 100644 examples/gateway/microvm.example.toml delete mode 100644 examples/gateway/podman.example.toml diff --git a/architecture/gateway.md b/architecture/gateway.md index dc195bb1f..5840cb511 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -204,8 +204,8 @@ CLI flag > OPENSHELL_* env var > TOML file > built-in default The TOML file is opt-in via `--config ` / `OPENSHELL_GATEWAY_CONFIG`. When unset, the gateway behaves exactly as before — CLI flags and env vars -drive every setting. See `examples/gateway/gateway.example.toml` for a -worked example and RFC 0003 for the full schema. +drive every setting. See `docs/reference/gateway-config.mdx` for worked +per-driver examples and RFC 0003 for the full schema. `database_url` is env-only and rejected when present in the file (`OPENSHELL_DB_URL` / `--db-url`). diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index eb088f669..8e5477683 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -514,25 +514,4 @@ version = 2 .expect_err("missing file must be io error"); assert!(matches!(err, ConfigFileError::Io { .. })); } - - /// Lock in that every checked-in example under `examples/gateway/` - /// round-trips through the loader. Catches schema drift the moment a - /// field is renamed or a doc snippet falls out of date. - #[test] - fn checked_in_examples_parse() { - let repo_root = Path::new(env!("CARGO_MANIFEST_DIR")) - .parent() - .and_then(Path::parent) - .expect("crate manifest has grandparent"); - for name in [ - "gateway.example.toml", - "kubernetes.example.toml", - "docker.example.toml", - "podman.example.toml", - "microvm.example.toml", - ] { - let path = repo_root.join("examples/gateway").join(name); - load(&path).unwrap_or_else(|e| panic!("{name} failed to parse: {e}")); - } - } } diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx new file mode 100644 index 000000000..5b5e5736b --- /dev/null +++ b/docs/reference/gateway-config.mdx @@ -0,0 +1,212 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Gateway Configuration File" +sidebar-title: "Gateway Config" +description: "Reference for the OpenShell gateway TOML configuration file (RFC 0003)." +keywords: "Generative AI, Cybersecurity, AI Agents, Sandboxing, Gateway, Configuration, TOML, Reference" +position: 5 +--- + +The OpenShell gateway reads its configuration from a TOML file when `--config` or `OPENSHELL_GATEWAY_CONFIG` is set. CLI flags and `OPENSHELL_*` environment variables always override the file. See [RFC 0003](https://github.com/NVIDIA/OpenShell/blob/main/rfc/0003-gateway-configuration/README.md) for the full schema. + +## Source Precedence + +```text +CLI flag > OPENSHELL_* env var > TOML file > built-in default +``` + +`database_url` and `ssh_handshake_secret` are env-only. The loader rejects them when they appear in the file. + +## Layout + +The file is rooted at `[openshell]`. Gateway-wide settings live under `[openshell.gateway]`. Each compute driver owns its own `[openshell.drivers.]` table. Shared keys set at gateway scope are inherited into driver tables when not overridden. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +# ... gateway-wide settings ... + +[openshell.gateway.tls] +# ... gateway listener TLS ... + +[openshell.gateway.oidc] +# ... JWT bearer auth ... + +[openshell.drivers.kubernetes] +# ... driver-specific settings ... +``` + +## Full Example + +A complete gateway configuration covering every section. Trim to the fields you need. + +```toml +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" + +log_level = "info" + +# When empty the gateway auto-detects (Kubernetes -> Podman -> Docker). VM is +# never auto-detected and requires an explicit entry here. +compute_drivers = ["kubernetes"] + +sandbox_namespace = "openshell" +sandbox_ssh_port = 2222 +ssh_gateway_host = "127.0.0.1" +ssh_gateway_port = 8080 + +# Subject Alternative Names baked into the gateway server certificate. +# Wildcard DNS SANs (e.g. "*.dev.openshell.localhost") also enable sandbox +# service URLs under that domain. +server_sans = ["openshell", "*.dev.openshell.localhost"] +# Allow plaintext HTTP routing for loopback sandbox service URLs. +enable_loopback_service_http = true + +# Shared driver defaults — inherited into [openshell.drivers.] tables +# when the driver-specific table does not override them. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +client_tls_secret_name = "openshell-client-tls" + +# Gateway listener TLS (distinct from the per-driver guest_tls_*). +[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" +allow_unauthenticated = false + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" +jwks_ttl_secs = 3600 +roles_claim = "realm_access.roles" +admin_role = "openshell-admin" +user_role = "openshell-user" +``` + +`image_pull_policy` is intentionally not a shared gateway key. Kubernetes uses `Always | IfNotPresent | Never` while Podman uses `always | missing | never | newer`. Set it inside the relevant driver table. + +## Per-Driver Examples + +### Kubernetes + +The gateway runs as a Pod and creates sandbox Pods in another namespace. mTLS material for sandboxes is delivered via a Kubernetes Secret rather than host-side file paths. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" +log_level = "info" +compute_drivers = ["kubernetes"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +client_tls_secret_name = "openshell-client-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell-tls/server/tls.crt" +key_path = "/etc/openshell-tls/server/tls.key" +client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +image_pull_policy = "IfNotPresent" +# Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" +# on older clusters or where the ImageVolume feature gate is off. +supervisor_sideload_method = "image-volume" +``` + +### Docker + +Sandboxes run as containers on a local bridge network. The supervisor binary is bind-mounted from the host (no in-cluster image pull required); guest mTLS material is supplied as host paths. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["docker"] + +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.docker] +network_name = "openshell-docker" +# Skip the image-pull-and-extract step by pointing at a locally built binary. +supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" +``` + +### Podman + +Sandboxes run as Podman containers on a user-mode bridge network. The supervisor image is mounted read-only via Podman's `type=image` mount; guest mTLS material is supplied as host paths. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["podman"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.podman] +# Rootless socket path. For root Podman use /run/podman/podman.sock. +socket_path = "/run/user/1000/podman/podman.sock" +network_name = "openshell" +stop_timeout_secs = 10 +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer +``` + +### MicroVM + +Each sandbox runs inside its own libkrun microVM managed by the standalone `openshell-driver-vm` subprocess. Use this driver when you want stronger isolation than container namespaces alone. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +# VM is never auto-detected; an explicit entry here is required. +compute_drivers = ["vm"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +guest_tls_ca = "/var/lib/openshell/guest-tls/ca.pem" +guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" +guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" + +[openshell.drivers.vm] +state_dir = "/var/lib/openshell/vm" +# Where the gateway looks for the openshell-driver-vm subprocess binary. +driver_dir = "/usr/local/libexec/openshell" +vcpus = 2 +mem_mib = 2048 +krun_log_level = 1 +``` diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 9c6a16d94..b3c47ef7d 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -32,6 +32,8 @@ Common gateway options: | `--sandbox-image ` | `OPENSHELL_SANDBOX_IMAGE` | Set the default sandbox image used when a sandbox create request does not specify one. | | `--grpc-endpoint ` | `OPENSHELL_GRPC_ENDPOINT` | Set the gateway callback endpoint that sandbox workloads use to connect back to OpenShell. | +For the full file-based configuration surface — including gateway-wide settings, TLS, OIDC, and per-driver tables — see the [Gateway Configuration File](./gateway-config) reference. Each driver section below documents the CLI flags and environment variables; their TOML equivalents live in the `[openshell.drivers.]` tables on that page. + ## Docker Driver [Docker](https://www.docker.com/get-started/)-backed sandboxes run as containers on the gateway host. Use Docker for local development, single-machine gateways, and hosts that already use Docker Desktop or Docker Engine. diff --git a/examples/gateway/docker.example.toml b/examples/gateway/docker.example.toml deleted file mode 100644 index 27238fe0a..000000000 --- a/examples/gateway/docker.example.toml +++ /dev/null @@ -1,27 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Docker-driver example. Sandboxes run as containers on a local bridge -# network. The supervisor binary is bind-mounted from the host (no in-cluster -# image pull required); guest mTLS material is supplied as host paths. -# -# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. - -[openshell] -version = 1 - -[openshell.gateway] -bind_address = "127.0.0.1:8080" -log_level = "info" -compute_drivers = ["docker"] - -# Shared defaults inherited into [openshell.drivers.docker]. -supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -guest_tls_ca = "/etc/openshell/certs/ca.pem" -guest_tls_cert = "/etc/openshell/certs/client.pem" -guest_tls_key = "/etc/openshell/certs/client-key.pem" - -[openshell.drivers.docker] -network_name = "openshell-docker" -# Skip the image-pull-and-extract step by pointing at a locally built binary. -supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" diff --git a/examples/gateway/gateway.example.toml b/examples/gateway/gateway.example.toml deleted file mode 100644 index 120a9721e..000000000 --- a/examples/gateway/gateway.example.toml +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Example gateway configuration. See RFC 0003 for the schema and -# rfc/0003-gateway-configuration/README.md for the field-by-field reference. -# -# Source precedence (highest first): -# CLI flag > OPENSHELL_* env var > this file > built-in default -# -# `database_url` is deliberately env-only and MUST NOT appear in this file -# (set OPENSHELL_DB_URL / --db-url instead). - -[openshell] -version = 1 - -[openshell.gateway] -bind_address = "0.0.0.0:8080" -health_bind_address = "0.0.0.0:8081" -metrics_bind_address = "0.0.0.0:9090" - -log_level = "info" - -# When empty the gateway auto-detects (Kubernetes -> Podman -> Docker). VM is -# never auto-detected and requires an explicit entry here. -compute_drivers = ["kubernetes"] - -sandbox_namespace = "openshell" -sandbox_ssh_port = 2222 -ssh_gateway_host = "127.0.0.1" -ssh_gateway_port = 8080 - -# Subject Alternative Names baked into the gateway server certificate. -# Wildcard DNS SANs (e.g. "*.dev.openshell.localhost") also enable sandbox -# service URLs under that domain. -server_sans = ["openshell", "*.dev.openshell.localhost"] -# Allow plaintext HTTP routing for loopback sandbox service URLs. -enable_loopback_service_http = true - -# Shared driver defaults — inherited into [openshell.drivers.] tables -# when the driver-specific table does not override them. -default_image = "ghcr.io/nvidia/openshell/sandbox:latest" -supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -client_tls_secret_name = "openshell-client-tls" -# `image_pull_policy` is intentionally NOT a shared key: Kubernetes uses -# `Always | IfNotPresent | Never` (passed as a string to the K8s API) while -# Podman uses the lowercase `always | missing | never | newer` enum. There is -# no value that means the same thing in both, so set it under the relevant -# `[openshell.drivers.]` table instead. - -# Gateway listener TLS (distinct from the per-driver guest_tls_*). -[openshell.gateway.tls] -cert_path = "/etc/openshell/certs/gateway.pem" -key_path = "/etc/openshell/certs/gateway-key.pem" -client_ca_path = "/etc/openshell/certs/client-ca.pem" -allow_unauthenticated = false - -[openshell.gateway.oidc] -issuer = "https://idp.example.com/realms/openshell" -audience = "openshell-cli" -jwks_ttl_secs = 3600 -roles_claim = "realm_access.roles" -admin_role = "openshell-admin" -user_role = "openshell-user" - -# ── Driver tables ────────────────────────────────────────────────────────── -# Each table inherits the shared defaults set above. Keys present here win -# over the inherited value. - -[openshell.drivers.kubernetes] -namespace = "agents" -grpc_endpoint = "https://openshell-gateway.agents.svc:8080" -image_pull_policy = "IfNotPresent" # K8s vocabulary: Always | IfNotPresent | Never - -[openshell.drivers.docker] -network_name = "openshell-docker" -# supervisor_image and guest_tls_* inherit from [openshell.gateway] - -[openshell.drivers.podman] -socket_path = "/run/podman/podman.sock" -network_name = "openshell" -stop_timeout_secs = 10 -image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer - -[openshell.drivers.vm] -state_dir = "/var/lib/openshell/vm" -driver_dir = "/usr/local/libexec/openshell" -vcpus = 2 -mem_mib = 2048 -krun_log_level = 1 diff --git a/examples/gateway/kubernetes.example.toml b/examples/gateway/kubernetes.example.toml deleted file mode 100644 index 6dd78e5e9..000000000 --- a/examples/gateway/kubernetes.example.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Kubernetes-driver example. The gateway runs as a Pod and creates sandbox -# Pods in another namespace. mTLS material for sandboxes is delivered via a -# Kubernetes Secret rather than host-side file paths. -# -# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. - -[openshell] -version = 1 - -[openshell.gateway] -bind_address = "0.0.0.0:8080" -health_bind_address = "0.0.0.0:8081" -metrics_bind_address = "0.0.0.0:9090" -log_level = "info" -compute_drivers = ["kubernetes"] - -# Shared defaults inherited into [openshell.drivers.kubernetes]. -default_image = "ghcr.io/nvidia/openshell/sandbox:latest" -supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -client_tls_secret_name = "openshell-client-tls" - -[openshell.gateway.tls] -cert_path = "/etc/openshell-tls/server/tls.crt" -key_path = "/etc/openshell-tls/server/tls.key" -client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" - -[openshell.drivers.kubernetes] -namespace = "agents" -grpc_endpoint = "https://openshell-gateway.agents.svc:8080" -image_pull_policy = "IfNotPresent" -# Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" -# on older clusters or where the ImageVolume feature gate is off. -supervisor_sideload_method = "image-volume" diff --git a/examples/gateway/microvm.example.toml b/examples/gateway/microvm.example.toml deleted file mode 100644 index 60bbd4356..000000000 --- a/examples/gateway/microvm.example.toml +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# microVM-driver example. Each sandbox runs inside its own libkrun microVM -# managed by the standalone `openshell-driver-vm` subprocess. Use this driver -# when you want stronger isolation than container namespaces alone. -# -# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. - -[openshell] -version = 1 - -[openshell.gateway] -bind_address = "127.0.0.1:8080" -log_level = "info" -# VM is never auto-detected; an explicit entry here is required. -compute_drivers = ["vm"] - -# Shared defaults inherited into [openshell.drivers.vm]. -default_image = "ghcr.io/nvidia/openshell/sandbox:latest" -guest_tls_ca = "/var/lib/openshell/guest-tls/ca.pem" -guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" -guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" - -[openshell.drivers.vm] -state_dir = "/var/lib/openshell/vm" -# Where the gateway looks for the openshell-driver-vm subprocess binary. -driver_dir = "/usr/local/libexec/openshell" -vcpus = 2 -mem_mib = 2048 -krun_log_level = 1 diff --git a/examples/gateway/podman.example.toml b/examples/gateway/podman.example.toml deleted file mode 100644 index 3d963c176..000000000 --- a/examples/gateway/podman.example.toml +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Podman-driver example. Sandboxes run as Podman containers on a user-mode -# bridge network. The supervisor image is mounted read-only via Podman's -# `type=image` mount; guest mTLS material is supplied as host paths. -# -# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. - -[openshell] -version = 1 - -[openshell.gateway] -bind_address = "127.0.0.1:8080" -log_level = "info" -compute_drivers = ["podman"] - -# Shared defaults inherited into [openshell.drivers.podman]. -default_image = "ghcr.io/nvidia/openshell/sandbox:latest" -supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -guest_tls_ca = "/etc/openshell/certs/ca.pem" -guest_tls_cert = "/etc/openshell/certs/client.pem" -guest_tls_key = "/etc/openshell/certs/client-key.pem" - -[openshell.drivers.podman] -# Rootless socket path. For root Podman use /run/podman/podman.sock. -socket_path = "/run/user/1000/podman/podman.sock" -network_name = "openshell" -stop_timeout_secs = 10 -image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index a926fc9d8..da311920b 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -274,7 +274,7 @@ No part of this RFC has shipped yet. The work breaks down as: 3. **Per-driver deserialization** — give each driver crate (`openshell-driver-{kubernetes,docker,podman,vm}`) a `from_toml` (or `serde::Deserialize`) entry point so the gateway can hand each driver its own table. 4. **Test coverage** — file parsing, env-overrides-file, CLI-overrides-env, partial TLS error, port-collision error, unknown-field rejection, missing driver table fallback. 5. **Helm chart migration** — add `gateway.config` value tree, render the `ConfigMap`, mount it, switch the gateway container to `--config`. Keep the `OPENSHELL_*` env names available as opt-in overrides for secrets. -6. **Example file** — ship `examples/gateway/gateway.example.toml` and link it from the docs reference. +6. **Example file** — ship the per-driver examples on the published docs reference at `docs/reference/gateway-config.mdx`. 7. **Architecture doc update** — reflect the new config sources and precedence in `architecture/gateway.md`. ## Risks