diff --git a/Cargo.lock b/Cargo.lock index 42c3dba..1ebccc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -137,6 +137,7 @@ dependencies = [ "serde_json", "serde_yaml", "sha2 0.10.9", + "thiserror 2.0.18", "tokio", "tracing", ] @@ -254,6 +255,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "apl-session-valkey" +version = "0.2.0" +dependencies = [ + "apl-cpex", + "async-trait", + "deadpool-redis", + "redis", + "serde", + "serde_yaml", + "sha2 0.10.9", + "testcontainers", + "testcontainers-modules", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", +] + [[package]] name = "ar_archive_writer" version = "0.5.1" @@ -272,6 +292,12 @@ dependencies = [ "rustversion", ] +[[package]] +name = "arcstr" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03918c3dbd7701a85c6b9887732e2921175f26c350b4563841d0958c21d57e6d" + [[package]] name = "arraydeque" version = "0.5.1" @@ -303,6 +329,55 @@ dependencies = [ "serde_json", ] +[[package]] +name = "astral-tokio-tar" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec179a06c1769b1e42e1e2cbe74c7dcdb3d6383c838454d063eaac5bbb7ebbe5" +dependencies = [ + "filetime", + "futures-core", + "libc", + "portable-atomic", + "rustc-hash", + "tokio", + "tokio-stream", + "xattr", +] + +[[package]] +name = "async-lock" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" +dependencies = [ + "event-listener", + "event-listener-strategy", + "pin-project-lite", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "async-trait" version = "0.1.89" @@ -348,6 +423,58 @@ dependencies = [ "fs_extra", ] +[[package]] +name = "axum" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" +dependencies = [ + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backon" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" +dependencies = [ + "fastrand", +] + [[package]] name = "base16ct" version = "0.2.0" @@ -401,8 +528,8 @@ dependencies = [ "nom", "p256", "pkcs8 0.9.0", - "prost", - "prost-types", + "prost 0.10.4", + "prost-types 0.10.1", "rand 0.8.6", "rand_core 0.6.4", "regex", @@ -491,6 +618,83 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "bollard" +version = "0.19.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87a52479c9237eb04047ddb94788c41ca0d26eaff8b697ecfbb4c32f7fdc3b1b" +dependencies = [ + "async-stream", + "base64 0.22.1", + "bitflags", + "bollard-buildkit-proto", + "bollard-stubs", + "bytes", + 
"chrono", + "futures-core", + "futures-util", + "hex", + "home", + "http", + "http-body-util", + "hyper", + "hyper-named-pipe", + "hyper-rustls", + "hyper-util", + "hyperlocal", + "log", + "num", + "pin-project-lite", + "rand 0.9.4", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "rustls-pki-types", + "serde", + "serde_derive", + "serde_json", + "serde_repr", + "serde_urlencoded", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tokio-util", + "tonic", + "tower-service", + "url", + "winapi", +] + +[[package]] +name = "bollard-buildkit-proto" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a885520bf6249ab931a764ffdb87b0ceef48e6e7d807cfdb21b751e086e1ad" +dependencies = [ + "prost 0.14.4", + "prost-types 0.14.4", + "tonic", + "tonic-prost", + "ureq", +] + +[[package]] +name = "bollard-stubs" +version = "1.49.1-rc.28.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5731fe885755e92beff1950774068e0cae67ea6ec7587381536fca84f1779623" +dependencies = [ + "base64 0.22.1", + "bollard-buildkit-proto", + "bytes", + "chrono", + "prost 0.14.4", + "serde", + "serde_json", + "serde_repr", + "serde_with", +] + [[package]] name = "borsh" version = "1.6.1" @@ -734,7 +938,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" dependencies = [ "bytes", + "futures-core", "memchr", + "pin-project-lite", + "tokio", + "tokio-util", +] + +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", ] [[package]] @@ -877,6 +1094,7 @@ dependencies = [ "apl-identity-jwt", "apl-pdp-cedar-direct", "apl-pii-scanner", + "apl-session-valkey", "async-trait", "cpex-core", "rmp-serde", @@ -1045,6 +1263,36 @@ dependencies = [ "syn 
2.0.117", ] +[[package]] +name = "deadpool" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883466cb8db62725aee5f4a6011e8a5d42912b42632df32aad57fc91127c6e04" +dependencies = [ + "deadpool-runtime", + "num_cpus", + "tokio", +] + +[[package]] +name = "deadpool-redis" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bafa30c49dafe086d10116074e422ad7fc1c3cf554697e744a3ab112599ebd09" +dependencies = [ + "deadpool", + "redis", +] + +[[package]] +name = "deadpool-runtime" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2657f61fb1dd8bf37a8d51093cc7cee4e77125b22f7753f49b289f831bec2bae" +dependencies = [ + "tokio", +] + [[package]] name = "deflate64" version = "0.1.12" @@ -1156,6 +1404,17 @@ dependencies = [ "const-random", ] +[[package]] +name = "docker_credential" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29547a1dc60885a552306986316bc9701ba120c1a8db6769fa68691529ad373d" +dependencies = [ + "base64 0.22.1", + "serde", + "serde_json", +] + [[package]] name = "dunce" version = "1.0.5" @@ -1303,6 +1562,54 @@ dependencies = [ "typeid", ] +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "etcetera" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26c7b13d0780cb82722fd59f6f57f925e143427e4a75313a6c77243bf5326ae6" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.59.0", +] + +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + 
"concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + [[package]] name = "ff" version = "0.13.1" @@ -1642,6 +1949,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" @@ -1666,6 +1979,15 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "http" version = "1.4.0" @@ -1753,6 +2075,21 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-named-pipe" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278" +dependencies = [ + "hex", + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", + "winapi", +] + [[package]] name = "hyper-rustls" version = "0.27.9" @@ -1769,6 +2106,19 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.20" @@ -1794,6 +2144,21 @@ dependencies = [ "windows-registry", ] +[[package]] +name = "hyperlocal" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7" +dependencies = [ + "hex", + "http-body-util", + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + [[package]] name = "iana-time-zone" version = "0.1.65" @@ -2199,6 +2564,12 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "litemap" version = "0.8.2" @@ -2267,6 +2638,12 @@ dependencies = [ "sha2 0.11.0", ] +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + [[package]] name = "memchr" version = "2.8.0" @@ -2388,6 +2765,20 @@ dependencies = [ "serde", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -2414,6 +2805,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name 
= "num-conv" version = "0.2.2" @@ -2440,6 +2840,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -2450,6 +2861,16 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "object" version = "0.37.3" @@ -2511,6 +2932,12 @@ dependencies = [ "sha2 0.10.9", ] +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.5" @@ -2534,6 +2961,31 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parse-display" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "914a1c2265c98e2446911282c6ac86d8524f495792c38c5bd884f80499c7538a" +dependencies = [ + "parse-display-derive", + "regex", + "regex-syntax", +] + +[[package]] +name = "parse-display-derive" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ae7800a4c974efd12df917266338e79a7a74415173caf7e70aa0a0707345281" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "regex-syntax", + "structmeta", + "syn 2.0.117", +] + [[package]] name = "pastey" version = "0.2.3" @@ -2639,6 +3091,26 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315" +[[package]] +name = "pin-project" +version = 
"1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -2682,6 +3154,12 @@ version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "potential_utf" version = "0.1.5" @@ -2786,7 +3264,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71adf41db68aa0daaefc69bb30bcd68ded9b9abaad5d1fbb6304c4fb390e083e" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.10.1", +] + +[[package]] +name = "prost" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" +dependencies = [ + "bytes", + "prost-derive 0.14.4", ] [[package]] @@ -2802,6 +3290,19 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "prost-derive" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "prost-types" version = "0.10.1" @@ -2809,7 +3310,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68" dependencies = [ "bytes", - "prost", + "prost 0.10.4", +] + +[[package]] +name = "prost-types" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" +dependencies = [ + "prost 0.14.4", ] [[package]] @@ -2975,6 +3485,35 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "redis" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fd510128eda94d1d49b9f81487744d5c451422431cce41238fe2853d29f4cc" +dependencies = [ + "arc-swap", + "arcstr", + "async-lock", + "backon", + "bytes", + "cfg-if", + "combine", + "futures-channel", + "futures-util", + "itoa", + "percent-encoding", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "ryu", + "socket2", + "tokio", + "tokio-rustls", + "tokio-util", + "url", + "xxhash-rust", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -3220,6 +3759,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + [[package]] name = "rustls" version = "0.23.40" @@ -3227,6 +3779,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", @@ -3247,6 +3800,15 @@ dependencies = [ "security-framework", ] +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "rustls-pki-types" version = "1.14.1" @@ -3470,6 +4032,17 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "serde_spanned" version = "1.1.1" @@ -3769,6 +4342,29 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "structmeta" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e1575d8d40908d70f6fd05537266b90ae71b15dbbe7a8b7dffa2b759306d329" +dependencies = [ + "proc-macro2", + "quote", + "structmeta-derive", + "syn 2.0.117", +] + +[[package]] +name = "structmeta-derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "strum" version = "0.28.0" @@ -3868,6 +4464,44 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "testcontainers" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f3ac71069f20ecfa60c396316c283fbf35e6833a53dff551a31b5458da05edc" +dependencies = [ + "astral-tokio-tar", + "async-trait", + "bollard", + "bytes", + "docker_credential", + "either", + "etcetera", + "futures", + "log", + "memchr", + "parse-display", + "pin-project-lite", + "serde", + "serde_json", + "serde_with", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tokio-util", + "ulid", + "url", +] + +[[package]] +name = "testcontainers-modules" +version = 
"0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1966329d5bb3f89d33602d2db2da971fb839f9297dad16527abf4564e2ae0a6d" +dependencies = [ + "testcontainers", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -4011,6 +4645,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.18" @@ -4056,6 +4701,46 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "axum", + "base64 0.22.1", + "bytes", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost 0.14.4", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -4064,11 +4749,15 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.14.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -4188,6 +4877,16 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "ulid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" +dependencies = [ + "rand 0.9.4", + "web-time", +] + [[package]] name = "unicode-ident" version = "1.0.24" @@ -4255,6 +4954,33 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" +dependencies = [ + "base64 0.22.1", + "log", + "percent-encoding", + "rustls", + "rustls-pki-types", + "ureq-proto", + "utf8-zero", +] + +[[package]] +name = "ureq-proto" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" +dependencies = [ + "base64 0.22.1", + "http", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.8" @@ -4265,8 +4991,15 @@ dependencies = [ "idna", "percent-encoding", "serde", + "serde_derive", ] +[[package]] +name = "utf8-zero" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -4489,6 +5222,22 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29333c3ea1ba8b17211763463ff24ee84e41c78224c16b001cd907e663a38c68" +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + 
"winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" @@ -4498,6 +5247,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" @@ -4577,6 +5332,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" @@ -4827,6 +5591,22 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + [[package]] name = "yaml-rust2" version = "0.11.0" diff --git a/Cargo.toml b/Cargo.toml index 2d87f66..01c6bc8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "crates/apl-delegator-biscuit", "crates/apl-pii-scanner", "crates/apl-audit-logger", + "crates/apl-session-valkey", "examples/go-demo/ffi", ] diff --git a/crates/apl-cpex/Cargo.toml 
b/crates/apl-cpex/Cargo.toml index b0b50f4..c859afe 100644 --- a/crates/apl-cpex/Cargo.toml +++ b/crates/apl-cpex/Cargo.toml @@ -29,6 +29,7 @@ apl-cmf = { path = "../apl-cmf" } cpex-core = { path = "../cpex-core" } async-trait = { workspace = true } chrono = { workspace = true } +thiserror = { workspace = true } serde_json = { workspace = true } serde_yaml = { workspace = true } tokio = { workspace = true } diff --git a/crates/apl-cpex/src/cmf_invoker.rs b/crates/apl-cpex/src/cmf_invoker.rs index 6abb3ca..b30a7fe 100644 --- a/crates/apl-cpex/src/cmf_invoker.rs +++ b/crates/apl-cpex/src/cmf_invoker.rs @@ -62,12 +62,10 @@ use cpex_core::manager::PluginManager; use apl_core::attributes::AttributeBag; use apl_core::evaluator::Decision; use apl_core::pipeline::{TaintEvent, TaintScope}; -use apl_core::step::{ - DispatchPhase, PluginError, PluginInvocation, PluginInvoker, PluginOutcome, -}; +use apl_core::step::{DispatchPhase, PluginError, PluginInvocation, PluginInvoker, PluginOutcome}; use crate::dispatch_plan::RouteDispatchPlan; -use crate::session_store::SessionStore; +use crate::session_store::{SessionStore, SessionStoreError}; /// Bridges APL plugin dispatch to CMF-family CPEX hooks. /// @@ -118,19 +116,23 @@ impl CmfPluginInvoker { payload: MessagePayload, plan: Arc, session_store: Arc, - ) -> Self { + ) -> Result { // Resolve session id via the 4-tier resolver (token claim → // header → identity-derived → none). Snapshotted before // hydration so the lookup is independent of the COW write // that hydration performs. - let session_id: Option = crate::session_resolver::resolve_session(&extensions) - .map(|(sid, _src)| sid); + let session_id: Option = + crate::session_resolver::resolve_session(&extensions).map(|(sid, _src)| sid); // Hydration: union the session's accumulated labels into the // request's security labels. Skipped when there's no session_id - // OR no stored labels (avoid the COW clone for nothing). 
+ // (anonymous/sessionless traffic has no state to load and is + // unaffected by a store outage). A load error propagates so the + // caller fails the request closed *before* any decision is made + // — a distributed store being unreachable must never silently + // present as "no accumulated labels". if let Some(sid) = &session_id { - let stored = session_store.load_labels(sid).await; + let stored = session_store.load_labels(sid).await?; if !stored.is_empty() { extensions = hydrate_labels(extensions, &stored); } @@ -138,7 +140,7 @@ impl CmfPluginInvoker { let initial_labels = snapshot_labels(&extensions); - Self { + Ok(Self { manager, extensions: Arc::new(Mutex::new(extensions)), payload: Arc::new(Mutex::new(payload)), @@ -146,7 +148,7 @@ impl CmfPluginInvoker { session_id, session_store, initial_labels, - } + }) } /// Snapshot the current payload. Call after route evaluation to @@ -218,13 +220,22 @@ impl CmfPluginInvoker { /// Persist session-scoped state added during this request. Diffs /// current `security.labels` against the post-hydration snapshot - /// and appends new labels to the session store. No-op when there - /// was no session ID. Host calls this exactly once after route - /// evaluation completes. - pub async fn persist_session(&self) { - let Some(sid) = &self.session_id else { return }; + /// and appends new labels to the session store. No-op (returns + /// `Ok`) when there was no session ID or no new labels. Host calls + /// this exactly once after route evaluation completes. + /// + /// An append error is returned so the caller can fail the request + /// closed (R18). Because this runs after the policy decision is + /// computed, the route handler converts an append error into a Deny + /// outcome rather than dropping the accumulated taint silently. 
+ pub async fn persist_session(&self) -> Result<(), SessionStoreError> { + let Some(sid) = &self.session_id else { + return Ok(()); + }; let current = self.extensions.lock().await; - let Some(security) = current.security.as_ref() else { return }; + let Some(security) = current.security.as_ref() else { + return Ok(()); + }; let new_labels: Vec = security .labels .iter() @@ -233,8 +244,9 @@ impl CmfPluginInvoker { .collect(); drop(current); // release the lock before the await if !new_labels.is_empty() { - self.session_store.append_labels(sid, &new_labels).await; + self.session_store.append_labels(sid, &new_labels).await?; } + Ok(()) } } @@ -305,7 +317,10 @@ impl PluginInvoker for CmfPluginInvoker { Some(v) => (Some(v.reason), v.code), None => (None, "policy.forbidden".to_string()), }; - Decision::Deny { reason, rule_source } + Decision::Deny { + reason, + rule_source, + } } else { Decision::Allow }; @@ -319,11 +334,9 @@ impl PluginInvoker for CmfPluginInvoker { Some(modified) => { *self.payload.lock().await = modified.clone(); match invocation { - PluginInvocation::Field { .. } => { - Some(serde_json::Value::String( - modified.message.get_text_content(), - )) - } + PluginInvocation::Field { .. } => Some(serde_json::Value::String( + modified.message.get_text_content(), + )), PluginInvocation::Step { .. } => None, } } @@ -347,10 +360,8 @@ impl PluginInvoker for CmfPluginInvoker { // already validated label monotonicity on the way out. 
let taints = if let Some(modified_ext) = result.modified_extensions { let after_labels = snapshot_labels(&modified_ext); - let new_labels: Vec = after_labels - .difference(&before_labels) - .cloned() - .collect(); + let new_labels: Vec = + after_labels.difference(&before_labels).cloned().collect(); *self.extensions.lock().await = modified_ext; new_labels .into_iter() @@ -407,4 +418,3 @@ fn hydrate_labels(mut extensions: Extensions, labels: &[String]) -> Extensions { extensions.security = Some(Arc::new(security)); extensions } - diff --git a/crates/apl-cpex/src/lib.rs b/crates/apl-cpex/src/lib.rs index b5f6aae..4afe5cf 100644 --- a/crates/apl-cpex/src/lib.rs +++ b/crates/apl-cpex/src/lib.rs @@ -48,5 +48,5 @@ pub use dispatch_plan::{DispatchCache, RouteDispatchPlan, RoutePluginEntry}; pub use pdp_router::PdpRouter; pub use register::{register_apl, AplOptions}; pub use route_handler::{AplRouteHandler, Phase}; -pub use session_store::{MemorySessionStore, SessionStore}; +pub use session_store::{MemorySessionStore, SessionStore, SessionStoreError, SessionStoreFactory}; pub use visitor::AplConfigVisitor; diff --git a/crates/apl-cpex/src/register.rs b/crates/apl-cpex/src/register.rs index ab6becf..13e0308 100644 --- a/crates/apl-cpex/src/register.rs +++ b/crates/apl-cpex/src/register.rs @@ -39,10 +39,9 @@ use cpex_core::visitor::ConfigVisitor; use apl_core::step::{PdpFactory, PdpResolver}; use crate::dispatch_plan::DispatchCache; -use crate::session_store::SessionStore; +use crate::session_store::{SessionStore, SessionStoreFactory}; use crate::visitor::AplConfigVisitor; - /// Configuration for [`register_apl`]. All runtime collaborators APL /// needs to do its work are funneled through here so the call site /// reads as a single block instead of a multi-step builder. @@ -75,6 +74,14 @@ pub struct AplOptions { /// `pdps`. pub pdp_factories: Vec>, + /// Session-store factories the visitor consults when it encounters a + /// `global.apl.session_store` block. 
Each factory advertises a + /// `kind()` string matching the block's `kind:` field — e.g. + /// `valkey`. An empty list keeps the constructor-supplied + /// `session_store` (the `MemorySessionStore` default) active, so + /// existing deployments are unaffected. + pub session_store_factories: Vec>, + /// Override the visitor's baseline capabilities for installed /// `AplRouteHandler`s. `None` uses the visitor's default /// (read-only across the common attribute namespaces); `Some(set)` @@ -96,6 +103,7 @@ impl AplOptions { session_store: Arc::new(crate::session_store::MemorySessionStore::new()), pdps: Vec::new(), pdp_factories: Vec::new(), + session_store_factories: Vec::new(), base_capabilities: None, } } @@ -142,15 +150,13 @@ impl AplOptions { /// mgr.load_config_yaml(&yaml_string)?; /// mgr.initialize().await?; /// ``` -pub fn register_apl( - mgr: &Arc, - opts: AplOptions, -) -> Arc { +pub fn register_apl(mgr: &Arc, opts: AplOptions) -> Arc { let AplOptions { dispatch_cache, session_store, pdps, pdp_factories, + session_store_factories, base_capabilities, } = opts; @@ -160,11 +166,7 @@ pub fn register_apl( // handle to the manager. Code-supplied PDPs go through // `register_pdp(&self, ...)` which uses interior mutability, so // they're registered after the `Arc` wrap. 
- let mut visitor = AplConfigVisitor::new( - dispatch_cache, - session_store, - Arc::downgrade(mgr), - ); + let mut visitor = AplConfigVisitor::new(dispatch_cache, session_store, Arc::downgrade(mgr)); if let Some(caps) = base_capabilities { visitor = visitor.with_base_capabilities(caps); @@ -174,6 +176,10 @@ pub fn register_apl( visitor.register_pdp_factory(factory); } + for factory in session_store_factories { + visitor.register_session_store_factory(factory); + } + let arc = Arc::new(visitor); for pdp in pdps { diff --git a/crates/apl-cpex/src/route_handler.rs b/crates/apl-cpex/src/route_handler.rs index d1d1b83..2ab2ce3 100644 --- a/crates/apl-cpex/src/route_handler.rs +++ b/crates/apl-cpex/src/route_handler.rs @@ -196,16 +196,41 @@ impl AnyHookHandler for AplRouteHandler { // so `dispatch_parallel` can clone an owned, 'static reference into // each spawned branch). Inherent-method calls on `CmfPluginInvoker` // (e.g. `extensions_arc`, `persist_session`) deref through the Arc. - let invoker = Arc::new( - CmfPluginInvoker::for_request( - Arc::clone(&manager), - extensions.clone(), - msg_payload.clone(), - plan, - Arc::clone(&self.session_store), - ) - .await, - ); + // Hydration loads accumulated session labels. A store failure + // here happens *before* any policy decision, so we fail the + // request closed immediately (R5/R18, F2): deny with a + // distinguished violation rather than proceeding as if the + // session carried no taint. Sessionless traffic never reaches + // the store, so this only denies session-bearing requests. 
+ let invoker = match CmfPluginInvoker::for_request( + Arc::clone(&manager), + extensions.clone(), + msg_payload.clone(), + plan, + Arc::clone(&self.session_store), + ) + .await + { + Ok(inv) => Arc::new(inv), + Err(e) => { + tracing::error!( + alarm = "session_store_failure", + op = "load", + route = %self.route.route_key, + error = %e, + "session label load failed; failing request closed" + ); + return Ok(Box::new(ErasedResultFields { + continue_processing: false, + modified_payload: None, + modified_extensions: None, + violation: Some(PluginViolation::new( + "session.load_failed", + "session state could not be loaded", + )), + })); + } + }; // Build the attribute bag. APL predicates read flat keys; the // BagBuilder bridges typed CPEX extensions into that namespace. @@ -319,8 +344,10 @@ impl AnyHookHandler for AplRouteHandler { invoker.apply_session_taints(&decision.taints).await; // Commit any session-scoped labels accumulated during this - // request. No-op when there was no session id. - invoker.persist_session().await; + // request. No-op when there was no session id. The result is + // folded into the decision below (R18) — captured here because + // `continue_processing`/`violation` are computed after persist. + let persist_result = invoker.persist_session().await; // Surface the final mutated payload + extensions back into the // PipelineResult the executor returns to the host. The host's @@ -341,41 +368,39 @@ impl AnyHookHandler for AplRouteHandler { Phase::Pre => None, Phase::Post => Some(extract_result_from_message(&msg_payload.message)), }; - let modified_payload: Option> = - if route_payload.args != pre_args { - // An args pipeline (Pre) rewrote a field. Fold the new - // args back into a fresh MessagePayload so downstream - // readers (the host's body re-serializer) see the - // change. 
- let mut updated = final_payload.clone(); - write_args_back_to_message(&mut updated.message, &route_payload.args); - Some(Box::new(updated) as Box) - } else if matches!(self.phase, Phase::Post) - && pre_result - .as_ref() - .zip(route_payload.result.as_ref()) - .map(|(prev, current)| prev != current) - .unwrap_or(false) - { - // A `result:` pipeline rewrote a field in the upstream - // response. Fold the new result back into the message - // so the host's response body re-serializer can write - // it out before forwarding downstream. - let mut updated = final_payload.clone(); - if let Some(result_value) = route_payload.result.as_ref() { - write_result_back_to_message(&mut updated.message, result_value); - } - Some(Box::new(updated) as Box) - } else if msg_payload.message.get_text_content() - != final_payload.message.get_text_content() - { - // A `policy:` plugin mutated the message directly via - // `modify_payload` (not through a field pipeline). Pass - // the invoker's view through unchanged. - Some(Box::new(final_payload) as Box) - } else { - None - }; + let modified_payload: Option> = if route_payload.args != pre_args { + // An args pipeline (Pre) rewrote a field. Fold the new + // args back into a fresh MessagePayload so downstream + // readers (the host's body re-serializer) see the + // change. + let mut updated = final_payload.clone(); + write_args_back_to_message(&mut updated.message, &route_payload.args); + Some(Box::new(updated) as Box) + } else if matches!(self.phase, Phase::Post) + && pre_result + .as_ref() + .zip(route_payload.result.as_ref()) + .map(|(prev, current)| prev != current) + .unwrap_or(false) + { + // A `result:` pipeline rewrote a field in the upstream + // response. Fold the new result back into the message + // so the host's response body re-serializer can write + // it out before forwarding downstream. 
+ let mut updated = final_payload.clone(); + if let Some(result_value) = route_payload.result.as_ref() { + write_result_back_to_message(&mut updated.message, result_value); + } + Some(Box::new(updated) as Box) + } else if msg_payload.message.get_text_content() != final_payload.message.get_text_content() + { + // A `policy:` plugin mutated the message directly via + // `modify_payload` (not through a field pipeline). Pass + // the invoker's view through unchanged. + Some(Box::new(final_payload) as Box) + } else { + None + }; let modified_extensions = if extensions_changed(extensions, &final_extensions) { Some(final_extensions.cow_copy()) @@ -383,9 +408,12 @@ impl AnyHookHandler for AplRouteHandler { None }; - let (continue_processing, violation) = match decision.decision { + let (mut continue_processing, mut violation) = match decision.decision { Decision::Allow => (true, None), - Decision::Deny { reason, rule_source } => { + Decision::Deny { + reason, + rule_source, + } => { let code = if rule_source.is_empty() { "policy.deny".to_string() } else { @@ -396,6 +424,33 @@ impl AnyHookHandler for AplRouteHandler { } }; + // Append fail-closed (R18) with merge precedence: + // - decision Allow + append Err → flip to Deny with a + // distinguished `session.persist_failed` violation. + // - decision Deny + append Err → keep the original policy + // violation (preserve attribution); the request is already + // denied. The append failure surfaces only as the alarm. + // The alarm/metric fires on every append failure regardless of + // decision, since the dangerous residual is a *selective* + // failure (append rejected while reads still succeed). 
+ if let Err(e) = persist_result { + tracing::error!( + alarm = "session_store_failure", + op = "append", + route = %self.route.route_key, + decision_was_allow = continue_processing, + error = %e, + "session label persist failed; failing request closed" + ); + if continue_processing { + continue_processing = false; + violation = Some(PluginViolation::new( + "session.persist_failed", + "session state could not be persisted", + )); + } + } + Ok(Box::new(ErasedResultFields { continue_processing, modified_payload, @@ -566,4 +621,3 @@ fn extensions_changed(before: &Extensions, after: &Extensions) -> bool { }; security_changed || delegation_changed || raw_creds_changed } - diff --git a/crates/apl-cpex/src/session_store.rs b/crates/apl-cpex/src/session_store.rs index 54f7037..b227a8c 100644 --- a/crates/apl-cpex/src/session_store.rs +++ b/crates/apl-cpex/src/session_store.rs @@ -27,10 +27,33 @@ // hydration/persistence into/out of `Extensions.security.labels`. use std::collections::{HashMap, HashSet}; -use std::sync::RwLock; +use std::sync::{Arc, RwLock}; use async_trait::async_trait; +/// Error returned by a `SessionStore` when the backing store could not +/// satisfy a request. Distributed backends (e.g. Valkey) surface +/// connectivity/timeout/protocol failures and undecodable responses +/// here so callers can **fail closed** rather than silently treating a +/// backend failure as "no accumulated labels". +/// +/// String-typed deliberately, matching the trait's own philosophy (see +/// the module header): the error stays free of backend-specific types so +/// non-CMF bridges and the cross-crate `apl-session-valkey` backend can +/// construct it without dragging dependencies into this surface. +/// +/// Note the distinction this enables: a **positively-confirmed key-miss** +/// (unknown session) is `Ok(empty)`, NOT an error — only a genuine +/// backend failure is an `Err`. 
+#[derive(Debug, thiserror::Error)] +pub enum SessionStoreError { + /// The backing store was unreachable, timed out, returned an error, + /// or returned a response that could not be decoded into the + /// expected representation. Callers fail closed on this. + #[error("session store backend error: {0}")] + Backend(String), +} + /// Pluggable session-state backend. Implementations must be `Send + Sync` /// — the same store is shared across all concurrent requests. /// @@ -38,19 +61,53 @@ use async_trait::async_trait; /// - `append_labels` is **monotonic** — labels added to a session never /// come back out. Removal (declassification) is a separate operation /// not covered by v0. -/// - Empty `load_labels` for an unknown `session_id` is the right -/// response — non-session traffic shouldn't fail, it just sees no -/// accumulated state. +/// - `load_labels` for an unknown `session_id` returns `Ok(empty)` — a +/// positively-confirmed key-miss is the right response for non-session +/// traffic, and is distinct from a backend failure (`Err`). +/// - Both methods return `Result` so a distributed backend can propagate +/// failures and the caller can fail the request closed. The in-process +/// [`MemorySessionStore`] is infallible and always returns `Ok`. #[async_trait] pub trait SessionStore: Send + Sync { - /// Load the union of labels accumulated for the session. Empty for - /// new or unknown sessions. - async fn load_labels(&self, session_id: &str) -> Vec; + /// Load the union of labels accumulated for the session. `Ok(empty)` + /// for new or unknown sessions (a confirmed key-miss); `Err` only on + /// a backend failure. + async fn load_labels(&self, session_id: &str) -> Result, SessionStoreError>; /// Append labels to the session. Existing labels are kept; new ones /// are unioned in. Caller has already deduped against `load_labels` - /// in the hot path, but the store re-dedups defensively. 
- async fn append_labels(&self, session_id: &str, labels: &[String]); + /// in the hot path, but the store re-dedups defensively. `Err` only + /// on a backend failure. + async fn append_labels( + &self, + session_id: &str, + labels: &[String], + ) -> Result<(), SessionStoreError>; +} + +/// Factory the visitor consults when it encounters a +/// `global.apl.session_store` block in the unified config. Mirrors +/// [`apl_core::step::PdpFactory`]: each factory advertises a `kind()` +/// string matching the YAML block's `kind:` field, and `build` turns the +/// block into a live store. Registered up front via +/// [`crate::AplOptions::session_store_factories`]; the visitor selects +/// the active store from config during its global-config walk, before +/// any route handler captures the store. +/// +/// `build` errors are construction-time (bad config, unresolvable +/// endpoint) and surface as a config-load failure — distinct from the +/// request-time [`SessionStoreError`] the trait methods return. +pub trait SessionStoreFactory: Send + Sync { + /// The `kind:` discriminator this factory builds (e.g. `"valkey"`). + fn kind(&self) -> &str; + + /// Build a store from its config block. The whole + /// `global.apl.session_store` mapping is passed so the factory can + /// read its own keys (endpoint, TLS, auth, prefix, TTL, …). + fn build( + &self, + config: &serde_yaml::Value, + ) -> Result, Box>; } /// In-process `SessionStore` backed by a `HashMap` of `HashSet`s. Suitable @@ -75,31 +132,33 @@ impl MemorySessionStore { /// callers should go through the trait so the backing implementation /// stays swappable. 
pub fn snapshot(&self) -> HashMap> { - self.inner - .read() - .unwrap_or_else(|p| p.into_inner()) - .clone() + self.inner.read().unwrap_or_else(|p| p.into_inner()).clone() } } #[async_trait] impl SessionStore for MemorySessionStore { - async fn load_labels(&self, session_id: &str) -> Vec { + async fn load_labels(&self, session_id: &str) -> Result, SessionStoreError> { let r = self.inner.read().unwrap_or_else(|p| p.into_inner()); - r.get(session_id) + Ok(r.get(session_id) .map(|s| s.iter().cloned().collect()) - .unwrap_or_default() + .unwrap_or_default()) } - async fn append_labels(&self, session_id: &str, labels: &[String]) { + async fn append_labels( + &self, + session_id: &str, + labels: &[String], + ) -> Result<(), SessionStoreError> { if labels.is_empty() { - return; + return Ok(()); } let mut w = self.inner.write().unwrap_or_else(|p| p.into_inner()); let entry = w.entry(session_id.to_string()).or_default(); for l in labels { entry.insert(l.clone()); } + Ok(()) } } @@ -111,7 +170,8 @@ mod tests { #[tokio::test] async fn load_for_unknown_session_is_empty() { let store = MemorySessionStore::new(); - assert!(store.load_labels("nonexistent").await.is_empty()); + // Unknown session is a confirmed key-miss: Ok(empty), not Err. 
+ assert!(store.load_labels("nonexistent").await.unwrap().is_empty()); } #[tokio::test] @@ -119,8 +179,9 @@ mod tests { let store = MemorySessionStore::new(); store .append_labels("sess-1", &["PII".to_string(), "INTERNAL".to_string()]) - .await; - let mut labels = store.load_labels("sess-1").await; + .await + .unwrap(); + let mut labels = store.load_labels("sess-1").await.unwrap(); labels.sort(); assert_eq!(labels, vec!["INTERNAL".to_string(), "PII".to_string()]); } @@ -128,11 +189,15 @@ mod tests { #[tokio::test] async fn append_is_monotonic_dedupes() { let store = MemorySessionStore::new(); - store.append_labels("sess-1", &["PII".to_string()]).await; + store + .append_labels("sess-1", &["PII".to_string()]) + .await + .unwrap(); store .append_labels("sess-1", &["PII".to_string(), "PII".to_string()]) - .await; - let labels = store.load_labels("sess-1").await; + .await + .unwrap(); + let labels = store.load_labels("sess-1").await.unwrap(); assert_eq!(labels.len(), 1); assert_eq!(labels[0], "PII"); } @@ -140,10 +205,10 @@ mod tests { #[tokio::test] async fn sessions_are_isolated() { let store = MemorySessionStore::new(); - store.append_labels("a", &["X".to_string()]).await; - store.append_labels("b", &["Y".to_string()]).await; - assert_eq!(store.load_labels("a").await, vec!["X".to_string()]); - assert_eq!(store.load_labels("b").await, vec!["Y".to_string()]); + store.append_labels("a", &["X".to_string()]).await.unwrap(); + store.append_labels("b", &["Y".to_string()]).await.unwrap(); + assert_eq!(store.load_labels("a").await.unwrap(), vec!["X".to_string()]); + assert_eq!(store.load_labels("b").await.unwrap(), vec!["Y".to_string()]); } #[tokio::test] @@ -151,7 +216,7 @@ mod tests { let store: Arc = Arc::new(MemorySessionStore::new()); let c1 = Arc::clone(&store); let c2 = Arc::clone(&store); - c1.append_labels("sess", &["Z".to_string()]).await; - assert_eq!(c2.load_labels("sess").await, vec!["Z".to_string()]); + c1.append_labels("sess", 
&["Z".to_string()]).await.unwrap(); + assert_eq!(c2.load_labels("sess").await.unwrap(), vec!["Z".to_string()]); } } diff --git a/crates/apl-cpex/src/visitor.rs b/crates/apl-cpex/src/visitor.rs index 349032f..6102ebc 100644 --- a/crates/apl-cpex/src/visitor.rs +++ b/crates/apl-cpex/src/visitor.rs @@ -69,7 +69,7 @@ use apl_core::step::{PdpFactory, PdpResolver}; use crate::dispatch_plan::DispatchCache; use crate::pdp_router::PdpRouter; use crate::route_handler::{AplRouteHandler, Phase}; -use crate::session_store::SessionStore; +use crate::session_store::{SessionStore, SessionStoreFactory}; /// Legacy alias for the tool-family pre hook. Kept exported for /// callers that wired against the v0 visitor constants — the @@ -130,7 +130,13 @@ struct VisitorState { pub struct AplConfigVisitor { state: RwLock, dispatch_cache: Arc, - session_store: Arc, + /// Active session store. Behind a `RwLock` because a + /// `global.apl.session_store` block can swap it during the + /// config walk (`visit_global`), which runs before route handlers + /// capture the store in `visit_route`. Only touched during the + /// single-threaded config walk — never on the request hot path, + /// where each handler holds its own cloned `Arc`. + session_store: RwLock>, manager: Weak, /// Baseline capabilities granted to every synthetic `AplRouteHandler` /// the visitor installs. Unioned with the per-route plugin @@ -143,6 +149,11 @@ pub struct AplConfigVisitor { /// `global.apl.pdp[]` entry. Keyed by the factory's `kind()` — /// matches the `kind:` field in the YAML block. pdp_factories: HashMap>, + /// Factories the visitor consults for a `global.apl.session_store` + /// block. Keyed by the factory's `kind()`. Empty by default, in + /// which case the constructor-supplied store (typically + /// `MemorySessionStore`) stays active. 
+ session_store_factories: HashMap>, } impl AplConfigVisitor { @@ -154,10 +165,11 @@ impl AplConfigVisitor { Self { state: RwLock::new(VisitorState::default()), dispatch_cache, - session_store, + session_store: RwLock::new(session_store), manager, base_capabilities: default_base_capabilities(), pdp_factories: HashMap::new(), + session_store_factories: HashMap::new(), } } @@ -175,7 +187,54 @@ impl AplConfigVisitor { /// `register_apl` setup; the visitor uses these to instantiate /// resolvers from `global.apl.pdp[]` config blocks. pub fn register_pdp_factory(&mut self, factory: Arc) { - self.pdp_factories.insert(factory.kind().to_string(), factory); + self.pdp_factories + .insert(factory.kind().to_string(), factory); + } + + /// Register a `SessionStoreFactory` by its `kind()`. Called during + /// `register_apl` setup; the visitor uses these to swap in the + /// config-selected session store when it sees a + /// `global.apl.session_store` block. + pub fn register_session_store_factory(&mut self, factory: Arc) { + self.session_store_factories + .insert(factory.kind().to_string(), factory); + } + + /// Parse the optional `global.apl.session_store` block and swap the + /// active store. Looks up the factory by `kind`, builds the store, + /// and replaces the constructor-supplied default. Runs during + /// `visit_global` — before `visit_route` clones the store into each + /// handler — so the selected store is the one handlers capture. + /// Absent block → no-op (the default store stays active). 
+ fn build_session_store_from_config( + &self, + block: &serde_yaml::Value, + ) -> Result<(), VisitorError> { + let map = block.as_mapping().ok_or_else(|| { + "global.apl.session_store must be a mapping with a `kind:` field".to_string() + })?; + let kind = map + .get(serde_yaml::Value::String("kind".to_string())) + .and_then(|v| v.as_str()) + .ok_or_else(|| "global.apl.session_store missing required `kind:` field".to_string())?; + let factory = self.session_store_factories.get(kind).ok_or_else(|| { + format!( + "global.apl.session_store declared kind='{}' but no factory is registered for that \ + kind — host must call register_session_store_factory(...) before load_config_yaml", + kind + ) + })?; + let store = factory.build(block).map_err(|e| { + format!( + "global.apl.session_store (kind='{}') failed to build: {}", + kind, e + ) + })?; + *self + .session_store + .write() + .unwrap_or_else(|p| p.into_inner()) = store; + Ok(()) } /// Replace the baseline capability set granted to every installed @@ -184,10 +243,7 @@ impl AplConfigVisitor { /// agent). Tighten this when the deployment's policy plugins /// don't need broad reads — every cap removed is one fewer /// extension slot a buggy predicate can leak through. 
- pub fn with_base_capabilities( - mut self, - caps: std::collections::HashSet, - ) -> Self { + pub fn with_base_capabilities(mut self, caps: std::collections::HashSet) -> Self { self.base_capabilities = caps; self } @@ -212,12 +268,7 @@ impl AplConfigVisitor { let kind = map .get(serde_yaml::Value::String("kind".to_string())) .and_then(|v| v.as_str()) - .ok_or_else(|| { - format!( - "global.apl.pdp[{}] missing required `kind:` field", - index - ) - })?; + .ok_or_else(|| format!("global.apl.pdp[{}] missing required `kind:` field", index))?; let factory = self.pdp_factories.get(kind).ok_or_else(|| { format!( "global.apl.pdp[{}] declared kind='{}' but no factory is registered for that kind — \ @@ -322,13 +373,19 @@ impl ConfigVisitor for AplConfigVisitor { } } - // The `pdp:` sub-key isn't an APL DSL field; strip it before - // handing the block to `compile_policy_block_value` so the - // compiler doesn't see an unknown key. `compile_policy_block_value` - // accepts maps with `policy:` / `post_policy:` / `args:` / - // `result:` / `plugins:` (and inert fields it ignores), so a - // shallow strip on a clone is enough. - let policy_only = strip_pdp_key(&apl_block); + // Process an optional `global.apl.session_store` block: swap the + // active store before `visit_route` clones it into handlers. + if let Some(block) = apl_block.get("session_store") { + self.build_session_store_from_config(block)?; + } + + // The `pdp:` / `session_store:` sub-keys aren't APL DSL fields; + // strip them before handing the block to + // `compile_policy_block_value` so the compiler doesn't see unknown + // keys. `compile_policy_block_value` accepts maps with `policy:` / + // `post_policy:` / `args:` / `result:` / `plugins:` (and inert + // fields it ignores), so a shallow strip on a clone is enough. 
+ let policy_only = strip_non_dsl_keys(&apl_block); let compiled = compile_policy_block_value("global.apl", &policy_only) .map_err(|e| Box::new(e) as VisitorError)?; self.state @@ -348,7 +405,7 @@ impl ConfigVisitor for AplConfigVisitor { return Ok(()); }; let source = format!("global.defaults.{}.apl", entity_type); - warn_if_pdp_at_nonglobal_scope(&source, &apl_block); + warn_if_global_only_key_at_nonglobal_scope(&source, &apl_block); let compiled = compile_policy_block_value(&source, &apl_block) .map_err(|e| Box::new(e) as VisitorError)?; self.state @@ -369,7 +426,7 @@ impl ConfigVisitor for AplConfigVisitor { return Ok(()); }; let source = format!("global.policies.{}.apl", tag); - warn_if_pdp_at_nonglobal_scope(&source, &apl_block); + warn_if_global_only_key_at_nonglobal_scope(&source, &apl_block); let compiled = compile_policy_block_value(&source, &apl_block) .map_err(|e| Box::new(e) as VisitorError)?; self.state @@ -400,7 +457,7 @@ impl ConfigVisitor for AplConfigVisitor { } }; if let Some(block) = &route_apl { - warn_if_pdp_at_nonglobal_scope(&format!("routes.{entity_type}"), block); + warn_if_global_only_key_at_nonglobal_scope(&format!("routes.{entity_type}"), block); } let scope = parsed.meta.as_ref().and_then(|m| m.scope.clone()); let tags: Vec = parsed @@ -490,10 +547,9 @@ impl ConfigVisitor for AplConfigVisitor { // the authoritative registration state). The lookup trait // is `parallel_safety::PluginModeLookup`, which // `PluginManager` implements. - if let Err(msg) = crate::parallel_safety::validate_parallel_plugin_modes( - &effective, - mgr.as_ref(), - ) { + if let Err(msg) = + crate::parallel_safety::validate_parallel_plugin_modes(&effective, mgr.as_ref()) + { let err_msg = format!("route '{}': parallel-safety: {}", route_key, msg); return Err(err_msg.into()); } @@ -516,6 +572,16 @@ impl ConfigVisitor for AplConfigVisitor { } }; + // Snapshot the active session store (a `global.apl.session_store` + // block in `visit_global` may have swapped it). 
Each handler + // captures its own clone, so request-time dispatch never touches + // the visitor's lock. + let session_store = self + .session_store + .read() + .unwrap_or_else(|p| p.into_inner()) + .clone(); + // Install Pre + Post handlers. Each handler instance is bound to // ONE phase so the executor can pick the right entry-point off // the (entity_type, entity_name, scope, hook_name) key. @@ -529,7 +595,7 @@ impl ConfigVisitor for AplConfigVisitor { Arc::clone(&route_arc), &plugin_registry, &self.dispatch_cache, - &self.session_store, + &session_store, &self.manager, Some(Arc::clone(&pdp_router_arc)), &self.base_capabilities, @@ -544,7 +610,7 @@ impl ConfigVisitor for AplConfigVisitor { route_arc, &plugin_registry, &self.dispatch_cache, - &self.session_store, + &session_store, &self.manager, Some(Arc::clone(&pdp_router_arc)), &self.base_capabilities, @@ -589,7 +655,10 @@ fn install_handler( // (`subject.*`, `role.*`, `delegated`, …) even when no plugins are // referenced. let mut capabilities = base_capabilities.clone(); - capabilities.extend(crate::dispatch_plan::route_capability_union(&route, plugin_registry)); + capabilities.extend(crate::dispatch_plan::route_capability_union( + &route, + plugin_registry, + )); let plugin_config = PluginConfig { name: format!( @@ -604,16 +673,15 @@ fn install_handler( capabilities, ..Default::default() }; - let mut handler = - AplRouteHandler::new( - plugin_config.clone(), - route, - phase, - Arc::clone(plugin_registry), - Arc::clone(dispatch_cache), - Arc::clone(session_store), - manager.clone(), - ); + let mut handler = AplRouteHandler::new( + plugin_config.clone(), + route, + phase, + Arc::clone(plugin_registry), + Arc::clone(dispatch_cache), + Arc::clone(session_store), + manager.clone(), + ); if let Some(pdp) = pdp { handler = handler.with_pdp(pdp); } @@ -654,21 +722,25 @@ fn names_of(sol: &cpex_core::config::StringOrList) -> Vec { } } -/// Warn when an APL block carries a `pdp:` declaration at a scope that +/// Warn 
when an APL block carries a global-only wiring key +/// ([`GLOBAL_ONLY_NON_DSL_KEYS`]: `pdp`, `session_store`) at a scope that /// cannot act on it. Only [`AplConfigVisitor::visit_global`] builds PDPs -/// (they are process-global CPEX wiring); a `pdp:` written under a -/// default / policy-bundle / route block is folded into the policy body -/// and silently discarded by `compile_policy_block_value`. Surfacing it -/// here turns that quiet no-op into an actionable signal. Applies to -/// both the flat and `apl:`-wrapped forms — neither is processed off the -/// global scope. -fn warn_if_pdp_at_nonglobal_scope(scope: &str, apl_block: &serde_yaml::Value) { - if apl_block.get("pdp").is_some() { - tracing::warn!( - scope, - "APL visitor: `pdp:` is only honored under the top-level `global:` block; \ - the declaration at this scope is ignored", - ); +/// and selects the session store (they are process-global CPEX wiring); a +/// `pdp:` / `session_store:` written under a default / policy-bundle / +/// route block is folded into the policy body and silently discarded by +/// `compile_policy_block_value`. Surfacing it here turns that quiet no-op +/// into an actionable signal. Applies to both the flat and `apl:`-wrapped +/// forms — neither is processed off the global scope. +fn warn_if_global_only_key_at_nonglobal_scope(scope: &str, apl_block: &serde_yaml::Value) { + for key in GLOBAL_ONLY_NON_DSL_KEYS { + if apl_block.get(key).is_some() { + tracing::warn!( + scope, + key, + "APL visitor: this key is only honored under the top-level `global:` block; \ + the declaration at this scope is ignored", + ); + } } } @@ -699,16 +771,27 @@ fn warn_unreferenced_plugin_overrides(route: &CompiledRoute) { } } -/// Strip the `pdp` sub-key from an `apl:` mapping so the remainder can -/// be handed to `compile_policy_block_value` (which doesn't model PDP -/// declarations — those are CPEX wiring concerns). 
Returns a clone of -/// the mapping with `pdp` removed; the original is left intact. -fn strip_pdp_key(apl_block: &serde_yaml::Value) -> serde_yaml::Value { +/// APL sub-keys that are CPEX *wiring*, not policy DSL: they are honored +/// only under the top-level `global:` block (where `visit_global` acts on +/// them) and are stripped before the remainder is handed to +/// `compile_policy_block_value`, which doesn't model them. Kept as a single +/// source of truth shared by [`strip_non_dsl_keys`] and +/// [`warn_if_global_only_key_at_nonglobal_scope`]. +const GLOBAL_ONLY_NON_DSL_KEYS: [&str; 2] = ["pdp", "session_store"]; + +/// Strip the global-only wiring sub-keys ([`GLOBAL_ONLY_NON_DSL_KEYS`]) +/// from an `apl:` mapping so the remainder can be handed to +/// `compile_policy_block_value` (which doesn't model PDP / session-store +/// declarations — those are CPEX wiring concerns). Returns a clone of the +/// mapping with those keys removed; the original is left intact. +fn strip_non_dsl_keys(apl_block: &serde_yaml::Value) -> serde_yaml::Value { let Some(map) = apl_block.as_mapping() else { return apl_block.clone(); }; let mut cloned = map.clone(); - cloned.remove(&serde_yaml::Value::String("pdp".to_string())); + for key in GLOBAL_ONLY_NON_DSL_KEYS { + cloned.remove(serde_yaml::Value::String(key.to_string())); + } serde_yaml::Value::Mapping(cloned) } @@ -728,12 +811,24 @@ fn on_error_to_string(on_err: &cpex_core::plugin::OnError) -> String { on_err.to_string() } -/// APL DSL keys recognized directly on a section (route / global / -/// defaults / policy-bundle) when the `apl:` wrapper is omitted. +/// APL keys recognized directly on a section (route / global / defaults / +/// policy-bundle) when the `apl:` wrapper is omitted. 
Includes the policy +/// DSL terms plus the global-only wiring keys ([`GLOBAL_ONLY_NON_DSL_KEYS`]): +/// `pdp` and `session_store` are accepted flat for parse symmetry with their +/// `apl:`-wrapped form, but only `visit_global` acts on them — at other +/// scopes they are inert and flagged by +/// [`warn_if_global_only_key_at_nonglobal_scope`]. /// `plugins` is intentionally absent here — it is shape-ambiguous (a /// structural plugin-ref *list* vs an apl-override *map*) and handled /// separately in [`apl_subblock`]. -const FLAT_APL_KEYS: [&str; 5] = ["policy", "post_policy", "args", "result", "pdp"]; +const FLAT_APL_KEYS: [&str; 6] = [ + "policy", + "post_policy", + "args", + "result", + "pdp", + "session_store", +]; /// Pull a section's APL block out of its raw YAML. /// @@ -815,6 +910,21 @@ mod tests { ); } + #[test] + fn flat_session_store_without_wrapper_is_collected() { + // A `session_store:` written directly on `global:` (no `apl:` + // wrapper) must be lifted into the block so `visit_global` can act + // on it — symmetric with the `apl:`-wrapped form and with `pdp:`. + let v = yaml("session_store:\n kind: valkey\n endpoint: localhost:6379\n"); + let block = apl_subblock(&v).expect("flat session_store recognized"); + let ss = block.get("session_store").expect("session_store lifted into the block"); + assert_eq!( + ss.get("kind").and_then(|k| k.as_str()), + Some("valkey"), + "the session_store mapping is preserved intact", + ); + } + #[test] fn flat_plugins_map_included_but_list_excluded() { // Map shape is the apl-override form → kept. @@ -854,15 +964,18 @@ mod tests { } #[test] - fn warn_if_pdp_at_nonglobal_scope_is_a_safe_noop() { - use super::warn_if_pdp_at_nonglobal_scope; - // The helper only emits a tracing event; it must never panic - // whether `pdp` is present or not. (The drop semantics are - // exercised end-to-end; here we just guard the helper's contract.) 
+ fn warn_if_global_only_key_at_nonglobal_scope_is_a_safe_noop() { + use super::warn_if_global_only_key_at_nonglobal_scope; + // The helper only emits a tracing event; it must never panic for + // either global-only wiring key (`pdp` / `session_store`), or for + // none present. (The drop semantics are exercised end-to-end; here + // we just guard the helper's contract.) let with_pdp = yaml("policy:\n - \"deny\"\npdp:\n - kind: cel\n"); - let without_pdp = yaml("policy:\n - \"deny\"\n"); - warn_if_pdp_at_nonglobal_scope("route", &with_pdp); - warn_if_pdp_at_nonglobal_scope("global.defaults.tool.apl", &without_pdp); + let with_session_store = yaml("policy:\n - \"deny\"\nsession_store:\n kind: valkey\n"); + let without = yaml("policy:\n - \"deny\"\n"); + warn_if_global_only_key_at_nonglobal_scope("route", &with_pdp); + warn_if_global_only_key_at_nonglobal_scope("routes.tool", &with_session_store); + warn_if_global_only_key_at_nonglobal_scope("global.defaults.tool.apl", &without); } #[test] diff --git a/crates/apl-cpex/tests/capability_gating.rs b/crates/apl-cpex/tests/capability_gating.rs index d03656e..eaaa36e 100644 --- a/crates/apl-cpex/tests/capability_gating.rs +++ b/crates/apl-cpex/tests/capability_gating.rs @@ -206,6 +206,7 @@ routes: session_store: Arc::new(MemorySessionStore::new()), pdps: Vec::new(), pdp_factories: Vec::new(), + session_store_factories: Vec::new(), base_capabilities: None, }, ); @@ -268,6 +269,7 @@ routes: session_store: Arc::new(MemorySessionStore::new()), pdps: Vec::new(), pdp_factories: Vec::new(), + session_store_factories: Vec::new(), base_capabilities: Some(std::collections::HashSet::new()), }, ); @@ -415,6 +417,7 @@ routes: session_store: Arc::new(MemorySessionStore::new()), pdps: Vec::new(), pdp_factories: Vec::new(), + session_store_factories: Vec::new(), base_capabilities: Some(std::collections::HashSet::new()), }, ); diff --git a/crates/apl-cpex/tests/cmf_invoker_dispatch.rs b/crates/apl-cpex/tests/cmf_invoker_dispatch.rs index 
c95788b..c6f5c0f 100644 --- a/crates/apl-cpex/tests/cmf_invoker_dispatch.rs +++ b/crates/apl-cpex/tests/cmf_invoker_dispatch.rs @@ -17,8 +17,8 @@ use std::sync::Arc; use async_trait::async_trait; -use cpex_core::cmf::{CmfHook, ContentPart, Message, MessagePayload}; use cpex_core::cmf::enums::Role; +use cpex_core::cmf::{CmfHook, ContentPart, Message, MessagePayload}; use cpex_core::context::PluginContext; use cpex_core::error::{PluginError as CoreError, PluginViolation}; use cpex_core::extensions::{SecurityExtension, SubjectExtension}; @@ -40,12 +40,18 @@ use apl_cpex::{CmfPluginInvoker, MemorySessionStore, RouteDispatchPlan}; /// registry — no APL CompiledRoute involved. Used by the invoker-primitive /// tests below to exercise the plan-based dispatch path without standing /// up a full route. -fn plan_for(manager: &cpex_core::manager::PluginManager, plugin_name: &str) -> Arc { +fn plan_for( + manager: &cpex_core::manager::PluginManager, + plugin_name: &str, +) -> Arc { let entry = RouteDispatchPlan::resolve_plugin(manager, plugin_name) .expect("plugin must be registered with the manager"); let mut plugins = std::collections::HashMap::new(); plugins.insert(plugin_name.to_string(), entry); - Arc::new(RouteDispatchPlan { plugins, delegation_entries: Default::default() }) + Arc::new(RouteDispatchPlan { + plugins, + delegation_entries: Default::default(), + }) } // --------------------------------------------------------------------- @@ -78,7 +84,9 @@ impl HookHandler for AllowPlugin { struct AllowPluginFactory; impl PluginFactory for AllowPluginFactory { fn create(&self, config: &PluginConfig) -> Result> { - let plugin = Arc::new(AllowPlugin { cfg: config.clone() }); + let plugin = Arc::new(AllowPlugin { + cfg: config.clone(), + }); Ok(PluginInstance { plugin: plugin.clone(), handlers: vec![( @@ -117,7 +125,9 @@ impl HookHandler for DenyPlugin { struct DenyPluginFactory; impl PluginFactory for DenyPluginFactory { fn create(&self, config: &PluginConfig) -> Result> { - 
let plugin = Arc::new(DenyPlugin { cfg: config.clone() }); + let plugin = Arc::new(DenyPlugin { + cfg: config.clone(), + }); Ok(PluginInstance { plugin: plugin.clone(), handlers: vec![( @@ -173,7 +183,9 @@ impl HookHandler for ModifyPlugin { struct ModifyPluginFactory; impl PluginFactory for ModifyPluginFactory { fn create(&self, config: &PluginConfig) -> Result> { - let plugin = Arc::new(ModifyPlugin { cfg: config.clone() }); + let plugin = Arc::new(ModifyPlugin { + cfg: config.clone(), + }); Ok(PluginInstance { plugin: plugin.clone(), handlers: vec![( @@ -200,17 +212,11 @@ fn empty_bag() -> AttributeBag { /// Build a manager, register one factory + one plugin under the given /// kind, and return the wired manager ready for invocation. -async fn build_manager( - factory_kind: &str, - factory: Box, -) -> Arc { +async fn build_manager(factory_kind: &str, factory: Box) -> Arc { let mgr = PluginManager::default(); mgr.register_factory(factory_kind, factory); - let yaml = format!( - "plugins:\n - name: {0}\n kind: {0}\n", - factory_kind - ); + let yaml = format!("plugins:\n - name: {0}\n kind: {0}\n", factory_kind); let cfg = cpex_core::config::parse_config(&yaml).expect("parse_config"); mgr.load_config(cfg).expect("load_config"); mgr.initialize().await.expect("initialize"); @@ -232,10 +238,17 @@ async fn step_invocation_allow_returns_decision_allow() { plan, Arc::new(MemorySessionStore::new()), ) - .await; + .await + .expect("for_request"); let outcome = invoker - .invoke("allow-plugin", &empty_bag(), PluginInvocation::Step { phase: apl_core::step::DispatchPhase::Pre }) + .invoke( + "allow-plugin", + &empty_bag(), + PluginInvocation::Step { + phase: apl_core::step::DispatchPhase::Pre, + }, + ) .await .expect("invoke"); @@ -254,15 +267,25 @@ async fn step_invocation_deny_surfaces_violation_reason_and_code() { plan, Arc::new(MemorySessionStore::new()), ) - .await; + .await + .expect("for_request"); let outcome = invoker - .invoke("deny-plugin", &empty_bag(), 
PluginInvocation::Step { phase: apl_core::step::DispatchPhase::Pre }) + .invoke( + "deny-plugin", + &empty_bag(), + PluginInvocation::Step { + phase: apl_core::step::DispatchPhase::Pre, + }, + ) .await .expect("invoke"); match outcome.decision { - Decision::Deny { reason, rule_source } => { + Decision::Deny { + reason, + rule_source, + } => { assert_eq!(reason.as_deref(), Some("test-fixture denied this call")); assert_eq!(rule_source, "policy.forbidden"); } @@ -281,7 +304,8 @@ async fn field_invocation_modify_surfaces_modified_value_and_persists_payload() plan, Arc::new(MemorySessionStore::new()), ) - .await; + .await + .expect("for_request"); let bag = empty_bag(); let value = serde_json::Value::String("hello".to_string()); @@ -337,7 +361,8 @@ async fn current_payload_reflects_accumulated_mutations() { plan, Arc::new(MemorySessionStore::new()), ) - .await; + .await + .expect("for_request"); let bag = empty_bag(); let value = serde_json::Value::String("ignored".to_string()); @@ -355,10 +380,7 @@ async fn current_payload_reflects_accumulated_mutations() { .expect("invoke"); let final_payload = invoker.current_payload().await; - assert_eq!( - final_payload.message.get_text_content(), - "hello [MODIFIED]" - ); + assert_eq!(final_payload.message.get_text_content(), "hello [MODIFIED]"); } // --------------------------------------------------------------------- @@ -490,7 +512,10 @@ fn plan_with_narrowed_caps( entries_by_hook, }, ); - Arc::new(apl_cpex::RouteDispatchPlan { plugins, delegation_entries: Default::default() }) + Arc::new(apl_cpex::RouteDispatchPlan { + plugins, + delegation_entries: Default::default(), + }) } #[tokio::test] @@ -519,10 +544,17 @@ async fn route_override_caps_narrow_what_plugin_sees() { plan, Arc::new(MemorySessionStore::new()), ) - .await; + .await + .expect("for_request"); let outcome = invoker - .invoke("capture-plugin", &empty_bag(), PluginInvocation::Step { phase: apl_core::step::DispatchPhase::Pre }) + .invoke( + "capture-plugin", + 
&empty_bag(), + PluginInvocation::Step { + phase: apl_core::step::DispatchPhase::Pre, + }, + ) .await .expect("invoke"); assert_eq!(outcome.decision, Decision::Allow); @@ -623,9 +655,15 @@ impl Plugin for MultiHookMarker { struct MultiHookPluginFactory; impl PluginFactory for MultiHookPluginFactory { fn create(&self, config: &PluginConfig) -> Result> { - let marker = Arc::new(MultiHookMarker { cfg: config.clone() }); - let pre = Arc::new(PreSideHandler { cfg: config.clone() }); - let post = Arc::new(PostSideHandler { cfg: config.clone() }); + let marker = Arc::new(MultiHookMarker { + cfg: config.clone(), + }); + let pre = Arc::new(PreSideHandler { + cfg: config.clone(), + }); + let post = Arc::new(PostSideHandler { + cfg: config.clone(), + }); Ok(PluginInstance { plugin: marker as Arc, handlers: vec![ @@ -659,7 +697,8 @@ async fn multi_hook_plugin_dispatches_per_phase_via_routing_table() { plan, Arc::new(MemorySessionStore::new()), ) - .await; + .await + .expect("for_request"); // Pre phase — should hit pre handler → Allow. let pre_outcome = invoker diff --git a/crates/apl-cpex/tests/config_override.rs b/crates/apl-cpex/tests/config_override.rs index 3bfb705..3101ad1 100644 --- a/crates/apl-cpex/tests/config_override.rs +++ b/crates/apl-cpex/tests/config_override.rs @@ -158,6 +158,7 @@ async fn build_manager(yaml: &str) -> (Arc, Arc Extensions { /// Build Extensions populated with a subject + label so cap-gating /// tests can verify what a delegate plugin actually sees after the /// executor's per-entry filter narrows the view to declared caps. 
-fn ext_with_subject_and_label( - token: &str, - subject_id: &str, - label: &str, -) -> Extensions { +fn ext_with_subject_and_label(token: &str, subject_id: &str, label: &str) -> Extensions { use cpex_core::extensions::{SecurityExtension, SubjectExtension}; let mut raw = RawCredentialsExtension::default(); @@ -255,7 +249,11 @@ fn ext_with_subject_and_label( async fn build_setup( yaml: &str, plugins: Vec<(String, Arc, PluginConfig)>, -) -> (Arc, apl_core::CompiledConfig, Arc) { +) -> ( + Arc, + apl_core::CompiledConfig, + Arc, +) { let mgr = Arc::new(PluginManager::default()); for (_, plugin, cfg) in plugins { mgr.register_handler::(plugin, cfg) @@ -319,19 +317,22 @@ routes: let extensions = ext_with_bearer("eyJ.fake.user-jwt"); let session_store: Arc = Arc::new(MemorySessionStore::new()); - let invoker = Arc::new(CmfPluginInvoker::for_request( - Arc::clone(&mgr), - extensions, - cpex_core::cmf::MessagePayload { - message: cpex_core::cmf::Message::text( - cpex_core::cmf::enums::Role::User, - "fetch compensation", - ), - }, - Arc::clone(&plan), - Arc::clone(&session_store), - ) - .await); + let invoker = Arc::new( + CmfPluginInvoker::for_request( + Arc::clone(&mgr), + extensions, + cpex_core::cmf::MessagePayload { + message: cpex_core::cmf::Message::text( + cpex_core::cmf::enums::Role::User, + "fetch compensation", + ), + }, + Arc::clone(&plan), + Arc::clone(&session_store), + ) + .await + .expect("for_request"), + ); let delegations = Arc::new(DelegationPluginInvoker::new( Arc::clone(&mgr), invoker.extensions_arc(), @@ -440,19 +441,22 @@ routes: let extensions = ext_with_bearer("eyJ.fake.user-jwt"); let session_store: Arc = Arc::new(MemorySessionStore::new()); - let invoker = Arc::new(CmfPluginInvoker::for_request( - Arc::clone(&mgr), - extensions, - cpex_core::cmf::MessagePayload { - message: cpex_core::cmf::Message::text( - cpex_core::cmf::enums::Role::User, - "fetch comp", - ), - }, - Arc::clone(&plan), - Arc::clone(&session_store), - ) - .await); + let invoker = 
Arc::new( + CmfPluginInvoker::for_request( + Arc::clone(&mgr), + extensions, + cpex_core::cmf::MessagePayload { + message: cpex_core::cmf::Message::text( + cpex_core::cmf::enums::Role::User, + "fetch comp", + ), + }, + Arc::clone(&plan), + Arc::clone(&session_store), + ) + .await + .expect("for_request"), + ); let delegations = Arc::new(DelegationPluginInvoker::new( Arc::clone(&mgr), invoker.extensions_arc(), @@ -532,19 +536,19 @@ routes: let extensions = ext_with_bearer("eyJ.fake.user-jwt"); let session_store: Arc = Arc::new(MemorySessionStore::new()); - let invoker = Arc::new(CmfPluginInvoker::for_request( - Arc::clone(&mgr), - extensions, - cpex_core::cmf::MessagePayload { - message: cpex_core::cmf::Message::text( - cpex_core::cmf::enums::Role::User, - "any", - ), - }, - Arc::clone(&plan), - Arc::clone(&session_store), - ) - .await); + let invoker = Arc::new( + CmfPluginInvoker::for_request( + Arc::clone(&mgr), + extensions, + cpex_core::cmf::MessagePayload { + message: cpex_core::cmf::Message::text(cpex_core::cmf::enums::Role::User, "any"), + }, + Arc::clone(&plan), + Arc::clone(&session_store), + ) + .await + .expect("for_request"), + ); let delegations = Arc::new(DelegationPluginInvoker::new( Arc::clone(&mgr), invoker.extensions_arc(), @@ -643,19 +647,19 @@ routes: let extensions = ext_with_bearer("eyJ.fake.user-jwt"); let session_store: Arc = Arc::new(MemorySessionStore::new()); - let invoker = Arc::new(CmfPluginInvoker::for_request( - Arc::clone(&mgr), - extensions, - cpex_core::cmf::MessagePayload { - message: cpex_core::cmf::Message::text( - cpex_core::cmf::enums::Role::User, - "fanout", - ), - }, - Arc::clone(&plan), - Arc::clone(&session_store), - ) - .await); + let invoker = Arc::new( + CmfPluginInvoker::for_request( + Arc::clone(&mgr), + extensions, + cpex_core::cmf::MessagePayload { + message: cpex_core::cmf::Message::text(cpex_core::cmf::enums::Role::User, "fanout"), + }, + Arc::clone(&plan), + Arc::clone(&session_store), + ) + .await + 
.expect("for_request"), + ); let delegations = Arc::new(DelegationPluginInvoker::new( Arc::clone(&mgr), invoker.extensions_arc(), @@ -745,7 +749,11 @@ routes: "#; let (mgr, cfg, cache) = build_setup( yaml, - vec![("scoped-delegate".to_string(), Arc::clone(&plugin), plugin_cfg)], + vec![( + "scoped-delegate".to_string(), + Arc::clone(&plugin), + plugin_cfg, + )], ) .await; @@ -757,19 +765,22 @@ routes: // proves the cap filter is selective. let extensions = ext_with_subject_and_label("eyJ.fake.jwt", "alice", "pii"); let session_store: Arc = Arc::new(MemorySessionStore::new()); - let invoker = Arc::new(CmfPluginInvoker::for_request( - Arc::clone(&mgr), - extensions, - cpex_core::cmf::MessagePayload { - message: cpex_core::cmf::Message::text( - cpex_core::cmf::enums::Role::User, - "fetch compensation", - ), - }, - Arc::clone(&plan), - Arc::clone(&session_store), - ) - .await); + let invoker = Arc::new( + CmfPluginInvoker::for_request( + Arc::clone(&mgr), + extensions, + cpex_core::cmf::MessagePayload { + message: cpex_core::cmf::Message::text( + cpex_core::cmf::enums::Role::User, + "fetch compensation", + ), + }, + Arc::clone(&plan), + Arc::clone(&session_store), + ) + .await + .expect("for_request"), + ); let delegations = Arc::new(DelegationPluginInvoker::new( Arc::clone(&mgr), invoker.extensions_arc(), @@ -845,7 +856,11 @@ routes: "#; let (mgr, cfg, cache) = build_setup( yaml, - vec![("capless-delegate".to_string(), Arc::clone(&plugin), plugin_cfg)], + vec![( + "capless-delegate".to_string(), + Arc::clone(&plugin), + plugin_cfg, + )], ) .await; @@ -855,19 +870,19 @@ routes: let extensions = ext_with_subject_and_label("eyJ.fake.jwt", "alice", "pii"); let session_store: Arc = Arc::new(MemorySessionStore::new()); - let invoker = Arc::new(CmfPluginInvoker::for_request( - Arc::clone(&mgr), - extensions, - cpex_core::cmf::MessagePayload { - message: cpex_core::cmf::Message::text( - cpex_core::cmf::enums::Role::User, - "any", - ), - }, - Arc::clone(&plan), - 
Arc::clone(&session_store), - ) - .await); + let invoker = Arc::new( + CmfPluginInvoker::for_request( + Arc::clone(&mgr), + extensions, + cpex_core::cmf::MessagePayload { + message: cpex_core::cmf::Message::text(cpex_core::cmf::enums::Role::User, "any"), + }, + Arc::clone(&plan), + Arc::clone(&session_store), + ) + .await + .expect("for_request"), + ); let delegations = Arc::new(DelegationPluginInvoker::new( Arc::clone(&mgr), invoker.extensions_arc(), @@ -910,4 +925,3 @@ routes: "without read_inbound_credentials, inbound token must be hidden", ); } - diff --git a/crates/apl-cpex/tests/end_to_end_route.rs b/crates/apl-cpex/tests/end_to_end_route.rs index b06c64f..6df3831 100644 --- a/crates/apl-cpex/tests/end_to_end_route.rs +++ b/crates/apl-cpex/tests/end_to_end_route.rs @@ -36,7 +36,10 @@ use apl_core::{ PdpDecision, PdpDialect, PdpError, PdpResolver, RoutePayload, }; -use apl_cpex::{CmfPluginInvoker, DispatchCache, MemorySessionStore, SessionStore}; +use apl_cpex::{ + register_apl, AplOptions, CmfPluginInvoker, DispatchCache, MemorySessionStore, SessionStore, + SessionStoreError, +}; // Build Extensions carrying a client/upstream session id (tier-0) AND an // authenticated subject, and return the session-store key the resolver @@ -232,7 +235,8 @@ routes: plan, Arc::new(MemorySessionStore::new()), ) - .await, + .await + .expect("for_request"), ); let mut bag = AttributeBag::new(); @@ -282,7 +286,8 @@ routes: plan, Arc::new(MemorySessionStore::new()), ) - .await, + .await + .expect("for_request"), ); let mut bag = AttributeBag::new(); @@ -407,7 +412,8 @@ routes: let session_store = Arc::new(MemorySessionStore::new()); let invoker = Arc::new( CmfPluginInvoker::for_request(mgr, extensions, cmf_payload(), plan, session_store.clone()) - .await, + .await + .expect("for_request"), ); let mut bag = AttributeBag::new(); @@ -446,8 +452,11 @@ routes: // SessionStore persistence — host calls persist_session after route // evaluation; new labels (vs the post-hydration 
snapshot) land in // the store under the request's session_id. - invoker.persist_session().await; - let stored = session_store.load_labels(&session_key).await; + invoker.persist_session().await.expect("persist_session"); + let stored = session_store + .load_labels(&session_key) + .await + .expect("load_labels"); assert_eq!(stored, vec!["PII".to_string()]); } @@ -461,7 +470,8 @@ async fn session_store_hydrates_labels_at_request_start() { let session_store = Arc::new(MemorySessionStore::new()); session_store .append_labels(&session_key, &["PRIOR".to_string()]) - .await; + .await + .expect("append_labels"); let mgr = tainting_manager().await; let yaml = r#" @@ -483,7 +493,8 @@ routes: let invoker = Arc::new( CmfPluginInvoker::for_request(mgr, extensions, cmf_payload(), plan, session_store.clone()) - .await, + .await + .expect("for_request"), ); // Hydrated labels should be observable on the invoker's extensions. @@ -516,8 +527,11 @@ routes: assert_eq!(decision.taints.len(), 1); assert_eq!(decision.taints[0].label, "PII"); - invoker.persist_session().await; - let mut stored = session_store.load_labels(&session_key).await; + invoker.persist_session().await.expect("persist_session"); + let mut stored = session_store + .load_labels(&session_key) + .await + .expect("load_labels"); stored.sort(); assert_eq!(stored, vec!["PII".to_string(), "PRIOR".to_string()]); } @@ -550,7 +564,8 @@ routes: let session_store = Arc::new(MemorySessionStore::new()); let invoker = Arc::new( CmfPluginInvoker::for_request(mgr, extensions, cmf_payload(), plan, session_store.clone()) - .await, + .await + .expect("for_request"), ); let mut bag = AttributeBag::new(); @@ -591,7 +606,375 @@ routes: // And `persist_session` should pick up the label via the diff // against `initial_labels` (which was empty here). 
- invoker.persist_session().await; - let stored = session_store.load_labels(&session_key).await; + invoker.persist_session().await.expect("persist_session"); + let stored = session_store + .load_labels(&session_key) + .await + .expect("load_labels"); assert_eq!(stored, vec!["audit".to_string()]); } + +// --------------------------------------------------------------------- +// Fail-closed semantics (U2 / R4, R5, R18; AE1, AE6). +// +// A distributed SessionStore can fail. These tests use an erroring +// test-double to prove the request fails *closed* — a store error +// becomes a Deny, never a silent "no labels" Allow. +// --------------------------------------------------------------------- + +/// Test-double store that fails load and/or append on demand. +struct ErrorSessionStore { + fail_load: bool, + fail_append: bool, +} + +#[async_trait] +impl SessionStore for ErrorSessionStore { + async fn load_labels(&self, _session_id: &str) -> Result, SessionStoreError> { + if self.fail_load { + Err(SessionStoreError::Backend("simulated load failure".into())) + } else { + Ok(Vec::new()) + } + } + + async fn append_labels( + &self, + _session_id: &str, + _labels: &[String], + ) -> Result<(), SessionStoreError> { + if self.fail_append { + Err(SessionStoreError::Backend( + "simulated append failure".into(), + )) + } else { + Ok(()) + } + } +} + +// Tagger route wired through `register_apl` so requests flow through the +// real `AplRouteHandler::invoke` path (where the fail-closed logic lives). +const TAGGER_ROUTE_YAML: &str = r#" +plugins: + - name: tagger + kind: tagger + hooks: [cmf.tool_pre_invoke] + capabilities: [append_labels, read_labels] +routes: + - tool: get_weather + apl: + policy: + - "plugin(tagger)" +"#; + +// Route matching keys on the request's `meta` (entity type + name), so a +// request must carry tool meta for the `tool: get_weather` handler to fire. 
+fn set_tool_meta(ext: &mut Extensions, tool: &str) { + let mut meta = cpex_core::extensions::MetaExtension::default(); + meta.entity_type = Some("tool".to_string()); + meta.entity_name = Some(tool.to_string()); + ext.meta = Some(Arc::new(meta)); +} + +async fn tagger_manager_with_store(store: Arc) -> Arc { + let mgr = Arc::new(PluginManager::default()); + mgr.register_factory("tagger", Box::new(TaintingPluginFactory)); + register_apl( + &mgr, + AplOptions { + dispatch_cache: Arc::new(DispatchCache::new()), + session_store: store, + pdps: Vec::new(), + pdp_factories: Vec::new(), + session_store_factories: Vec::new(), + base_capabilities: None, + }, + ); + mgr.load_config_yaml(TAGGER_ROUTE_YAML) + .expect("load_config_yaml"); + mgr.initialize().await.expect("initialize"); + mgr +} + +/// AE1: a load failure during hydration fails the request closed *before* +/// any decision, with the distinguished `session.load_failed` violation. +#[tokio::test] +async fn load_failure_fails_request_closed() { + let store: Arc = Arc::new(ErrorSessionStore { + fail_load: true, + fail_append: false, + }); + let mgr = tagger_manager_with_store(store).await; + let (mut ext, _key) = session_ext_and_key("sess-load-fail", "alice"); + set_tool_meta(&mut ext, "get_weather"); + + let (result, _bg) = mgr + .invoke_named::("cmf.tool_pre_invoke", cmf_payload(), ext, None) + .await; + + assert!( + !result.continue_processing, + "a load failure must fail the request closed (Deny)" + ); + assert_eq!( + result.violation.as_ref().map(|v| v.code.as_str()), + Some("session.load_failed"), + ); +} + +/// AE6: an append failure after the (Allow) decision flips the request to +/// Deny with the distinguished `session.persist_failed` violation — the +/// accumulated taint is never silently dropped. 
+#[tokio::test] +async fn append_failure_fails_request_closed() { + let store: Arc = Arc::new(ErrorSessionStore { + fail_load: false, + fail_append: true, + }); + let mgr = tagger_manager_with_store(store).await; + let (mut ext, _key) = session_ext_and_key("sess-append-fail", "alice"); + set_tool_meta(&mut ext, "get_weather"); + + // The tagger emits a session-scoped label, so persist_session has a + // new label to append — which the store rejects. + let (result, _bg) = mgr + .invoke_named::("cmf.tool_pre_invoke", cmf_payload(), ext, None) + .await; + + assert!( + !result.continue_processing, + "an append failure must flip the Allow decision to Deny" + ); + assert_eq!( + result.violation.as_ref().map(|v| v.code.as_str()), + Some("session.persist_failed"), + ); +} + +/// R18 merge precedence: when the policy already Denies AND the append +/// fails, the original policy violation is preserved (not overwritten by +/// `session.persist_failed`) — the request is already denied, so the +/// append failure surfaces only as the alarm. 
+#[tokio::test] +async fn deny_plus_append_failure_preserves_policy_violation() { + const YAML: &str = r#" +plugins: + - name: tagger + kind: tagger + hooks: [cmf.tool_pre_invoke] + capabilities: [append_labels, read_labels] + - name: scope-gate + kind: scope-gate + hooks: [cmf.tool_pre_invoke] +routes: + - tool: get_weather + apl: + policy: + - "plugin(tagger)" + - "plugin(scope-gate)" +"#; + let store: Arc = Arc::new(ErrorSessionStore { + fail_load: false, + fail_append: true, + }); + let mgr = Arc::new(PluginManager::default()); + mgr.register_factory("tagger", Box::new(TaintingPluginFactory)); + mgr.register_factory("scope-gate", Box::new(DenyPluginFactory)); + register_apl( + &mgr, + AplOptions { + dispatch_cache: Arc::new(DispatchCache::new()), + session_store: store, + pdps: Vec::new(), + pdp_factories: Vec::new(), + session_store_factories: Vec::new(), + base_capabilities: None, + }, + ); + mgr.load_config_yaml(YAML).expect("load_config_yaml"); + mgr.initialize().await.expect("initialize"); + + let (mut ext, _key) = session_ext_and_key("sess-deny-append", "alice"); + set_tool_meta(&mut ext, "get_weather"); + let (result, _bg) = mgr + .invoke_named::("cmf.tool_pre_invoke", cmf_payload(), ext, None) + .await; + + assert!( + !result.continue_processing, + "policy denied → request blocked" + ); + // The original policy violation is preserved; the append failure does + // NOT overwrite it with session.persist_failed. + assert_eq!( + result.violation.as_ref().map(|v| v.code.as_str()), + Some("policy.forbidden"), + "Deny+append-err must keep the policy violation, not session.persist_failed" + ); +} + +/// Sessionless/anonymous traffic carries no session_id, so it never +/// touches the store and is unaffected by a store outage. 
+#[tokio::test] +async fn sessionless_request_unaffected_by_store_failure() { + let store: Arc = Arc::new(ErrorSessionStore { + fail_load: true, + fail_append: true, + }); + let mgr = tagger_manager_with_store(store).await; + + // Tool meta so the route handler fires, but no session/subject — so + // the request resolves to no session id and never touches the store. + let mut ext = Extensions::default(); + set_tool_meta(&mut ext, "get_weather"); + let (result, _bg) = mgr + .invoke_named::("cmf.tool_pre_invoke", cmf_payload(), ext, None) + .await; + + assert!( + result.continue_processing, + "sessionless traffic should not be denied by a store outage: {:?}", + result.violation + ); +} + +// --------------------------------------------------------------------- +// Config-driven backend selection (U3 / R2, R3; AE3, AE5). +// --------------------------------------------------------------------- + +/// Records every load/append so a test can prove which store was active. +#[derive(Default)] +struct RecordingSessionStore { + loads: std::sync::Mutex>, + appends: std::sync::Mutex)>>, +} + +#[async_trait] +impl SessionStore for RecordingSessionStore { + async fn load_labels(&self, session_id: &str) -> Result, SessionStoreError> { + self.loads.lock().unwrap().push(session_id.to_string()); + Ok(Vec::new()) + } + async fn append_labels( + &self, + session_id: &str, + labels: &[String], + ) -> Result<(), SessionStoreError> { + self.appends + .lock() + .unwrap() + .push((session_id.to_string(), labels.to_vec())); + Ok(()) + } +} + +/// Factory that hands back a specific recording store so the test can +/// inspect it after the config walk selected it. 
+struct RecordingFactory { + store: Arc, +} + +impl apl_cpex::SessionStoreFactory for RecordingFactory { + fn kind(&self) -> &str { + "recording-fake" + } + fn build( + &self, + _config: &serde_yaml::Value, + ) -> Result, Box> { + Ok(self.store.clone()) + } +} + +/// AE5: a `global.apl.session_store { kind: recording-fake }` block makes +/// the factory-built store the active one — the default `MemorySessionStore` +/// passed to `AplOptions` is overridden by config. +#[tokio::test] +async fn config_selects_session_store_via_factory() { + const YAML: &str = r#" +plugins: + - name: tagger + kind: tagger + hooks: [cmf.tool_pre_invoke] + capabilities: [append_labels, read_labels] +global: + apl: + session_store: + kind: recording-fake +routes: + - tool: get_weather + apl: + policy: + - "plugin(tagger)" +"#; + + let recording = Arc::new(RecordingSessionStore::default()); + let mgr = Arc::new(PluginManager::default()); + mgr.register_factory("tagger", Box::new(TaintingPluginFactory)); + register_apl( + &mgr, + AplOptions { + dispatch_cache: Arc::new(DispatchCache::new()), + // Default store that config should override: + session_store: Arc::new(MemorySessionStore::new()), + pdps: Vec::new(), + pdp_factories: Vec::new(), + session_store_factories: vec![Arc::new(RecordingFactory { + store: Arc::clone(&recording), + })], + base_capabilities: None, + }, + ); + mgr.load_config_yaml(YAML).expect("load_config_yaml"); + mgr.initialize().await.expect("initialize"); + + let (mut ext, _key) = session_ext_and_key("sess-cfg", "alice"); + set_tool_meta(&mut ext, "get_weather"); + let (result, _bg) = mgr + .invoke_named::("cmf.tool_pre_invoke", cmf_payload(), ext, None) + .await; + assert!(result.continue_processing, "tagger route allows"); + + // The config-selected recording store — NOT the default memory store — + // received the hydration load and the taint append. 
+ assert!( + !recording.loads.lock().unwrap().is_empty(), + "config-selected store should receive the hydration load" + ); + assert_eq!( + recording.appends.lock().unwrap().len(), + 1, + "config-selected store should receive the taint append" + ); +} + +/// Unknown `kind` in a session_store block fails config load loudly. +#[tokio::test] +async fn unknown_session_store_kind_fails_config_load() { + const YAML: &str = r#" +global: + apl: + session_store: + kind: nonexistent-backend +"#; + let mgr = Arc::new(PluginManager::default()); + register_apl( + &mgr, + AplOptions { + dispatch_cache: Arc::new(DispatchCache::new()), + session_store: Arc::new(MemorySessionStore::new()), + pdps: Vec::new(), + pdp_factories: Vec::new(), + session_store_factories: Vec::new(), + base_capabilities: None, + }, + ); + let err = mgr + .load_config_yaml(YAML) + .expect_err("unknown kind must fail load"); + assert!( + format!("{err}").contains("nonexistent-backend"), + "error should name the unresolved kind: {err}" + ); +} diff --git a/crates/apl-cpex/tests/visitor_e2e.rs b/crates/apl-cpex/tests/visitor_e2e.rs index b4e0a5c..2eacfd4 100644 --- a/crates/apl-cpex/tests/visitor_e2e.rs +++ b/crates/apl-cpex/tests/visitor_e2e.rs @@ -76,10 +76,7 @@ impl PluginFactory for AllowGateFactory { // in `hooks: [...]`. Lets tests pin the plugin to llm / prompt // / resource hooks via YAML without per-entity factory copies. 
let handlers = hooks_for(config, plugin.clone()); - Ok(PluginInstance { - plugin, - handlers, - }) + Ok(PluginInstance { plugin, handlers }) } } @@ -89,10 +86,7 @@ impl PluginFactory for AllowGateFactory { fn hooks_for( config: &PluginConfig, plugin: Arc, -) -> Vec<( - &'static str, - Arc, -)> +) -> Vec<(&'static str, Arc)> where H: HookHandler + Plugin + 'static, { @@ -108,9 +102,8 @@ where hook_names .into_iter() .map(|name| { - let adapter: Arc = Arc::new( - TypedHandlerAdapter::::new(Arc::clone(&plugin)), - ); + let adapter: Arc = + Arc::new(TypedHandlerAdapter::::new(Arc::clone(&plugin))); (name, adapter) }) .collect() @@ -134,10 +127,7 @@ impl HookHandler for DenyGate { _extensions: &Extensions, _ctx: &mut PluginContext, ) -> PluginResult { - PluginResult::deny(PluginViolation::new( - "policy.forbidden", - "deny-gate fired", - )) + PluginResult::deny(PluginViolation::new("policy.forbidden", "deny-gate fired")) } } @@ -148,10 +138,7 @@ impl PluginFactory for DenyGateFactory { cfg: config.clone(), }); let handlers = hooks_for(config, plugin.clone()); - Ok(PluginInstance { - plugin, - handlers, - }) + Ok(PluginInstance { plugin, handlers }) } } @@ -190,6 +177,7 @@ async fn build_manager_with_visitor(yaml: &str) -> Arc { session_store: Arc::new(MemorySessionStore::new()), pdps: Vec::new(), pdp_factories: Vec::new(), + session_store_factories: Vec::new(), base_capabilities: None, }, ); @@ -227,12 +215,7 @@ routes: ..Default::default() }; let (result, _bg) = mgr - .invoke_named::( - "cmf.tool_pre_invoke", - cmf_payload("hi"), - ext, - None, - ) + .invoke_named::("cmf.tool_pre_invoke", cmf_payload("hi"), ext, None) .await; assert!( @@ -266,16 +249,13 @@ routes: ..Default::default() }; let (result, _bg) = mgr - .invoke_named::( - "cmf.tool_pre_invoke", - cmf_payload("hi"), - ext, - None, - ) + .invoke_named::("cmf.tool_pre_invoke", cmf_payload("hi"), ext, None) .await; assert!(!result.continue_processing, "deny path should halt"); - let violation = 
result.violation.expect("deny path must surface a violation"); + let violation = result + .violation + .expect("deny path must surface a violation"); assert_eq!( violation.reason, "deny-gate fired", "violation reason must propagate from the plugin through the handler" @@ -316,12 +296,7 @@ routes: ..Default::default() }; let (result, _bg) = mgr - .invoke_named::( - "cmf.tool_pre_invoke", - cmf_payload("hi"), - ext, - None, - ) + .invoke_named::("cmf.tool_pre_invoke", cmf_payload("hi"), ext, None) .await; let violation = result.violation.expect("route-level deny must fire"); @@ -357,12 +332,7 @@ routes: ..Default::default() }; let (result, _bg) = mgr - .invoke_named::( - "cmf.tool_pre_invoke", - cmf_payload("hi"), - ext, - None, - ) + .invoke_named::("cmf.tool_pre_invoke", cmf_payload("hi"), ext, None) .await; let violation = result @@ -455,12 +425,7 @@ routes: ..Default::default() }; let (result, _bg) = mgr - .invoke_named::( - "cmf.tool_pre_invoke", - cmf_payload("hi"), - ext, - None, - ) + .invoke_named::("cmf.tool_pre_invoke", cmf_payload("hi"), ext, None) .await; // Without APL annotations the route resolves through the legacy @@ -604,7 +569,10 @@ routes: let mgr = build_manager_with_visitor(YAML).await; let ext = Extensions { - meta: Some(Arc::new(meta_for_entity("resource", "hr://employees/E001234"))), + meta: Some(Arc::new(meta_for_entity( + "resource", + "hr://employees/E001234", + ))), ..Default::default() }; let (result, _bg) = mgr @@ -652,12 +620,7 @@ routes: // APL annotation. With no annotation AND no plugin registered on // cmf.tool_pre_invoke, dispatch returns continue. 
let (tool_result, _bg) = mgr - .invoke_named::( - "cmf.tool_pre_invoke", - cmf_payload("hi"), - ext.clone(), - None, - ) + .invoke_named::("cmf.tool_pre_invoke", cmf_payload("hi"), ext.clone(), None) .await; assert!( tool_result.continue_processing, @@ -695,7 +658,9 @@ routes: mgr.register_factory("allow-gate", Box::new(AllowGateFactory)); register_apl(&mgr, AplOptions::in_process()); - let err = mgr.load_config_yaml(YAML).expect_err("malformed APL block must error"); + let err = mgr + .load_config_yaml(YAML) + .expect_err("malformed APL block must error"); let msg = format!("{}", err); assert!( msg.contains("visitor 'apl'"), diff --git a/crates/apl-pdp-cedar-direct/tests/visitor_pdp_config.rs b/crates/apl-pdp-cedar-direct/tests/visitor_pdp_config.rs index 76ec2c1..9434639 100644 --- a/crates/apl-pdp-cedar-direct/tests/visitor_pdp_config.rs +++ b/crates/apl-pdp-cedar-direct/tests/visitor_pdp_config.rs @@ -27,9 +27,7 @@ use std::sync::Arc; use cpex_core::cmf::enums::Role; use cpex_core::cmf::{CmfHook, Message, MessagePayload}; -use cpex_core::extensions::{ - MetaExtension, SecurityExtension, SubjectExtension, SubjectType, -}; +use cpex_core::extensions::{MetaExtension, SecurityExtension, SubjectExtension, SubjectType}; use cpex_core::hooks::payload::Extensions; use cpex_core::manager::PluginManager; @@ -94,6 +92,7 @@ async fn build_manager() -> Arc { // visitor sees `kind: cedar-direct` in YAML and finds this // factory by key. 
pdp_factories: vec![Arc::new(CedarDirectPdpFactory::new())], + session_store_factories: Vec::new(), base_capabilities: None, }, ); @@ -157,7 +156,9 @@ async fn config_declared_cedar_pdp_denies_non_reader() { !result.continue_processing, "missing reader role should default-deny", ); - let v = result.violation.expect("deny path must surface a violation"); + let v = result + .violation + .expect("deny path must surface a violation"); assert_eq!( v.code, "cedar.default_deny", "default-deny path should use the cedar-direct sentinel code; got {}", diff --git a/crates/apl-pdp-cel/tests/visitor_cel_config.rs b/crates/apl-pdp-cel/tests/visitor_cel_config.rs index 57eeb4d..89b72c8 100644 --- a/crates/apl-pdp-cel/tests/visitor_cel_config.rs +++ b/crates/apl-pdp-cel/tests/visitor_cel_config.rs @@ -93,15 +93,15 @@ async fn build_manager_with_yaml( // The factory is the load-bearing wiring under test: the visitor // sees `kind: cel` in YAML and finds this factory by key. pdp_factories: vec![Arc::new(CelPdpFactory::new())], + session_store_factories: Vec::new(), base_capabilities: None, }, ); - mgr.load_config_yaml(yaml).map_err(|e| -> Box { - format!("{e}").into() - })?; - mgr.initialize().await.map_err(|e| -> Box { - format!("{e}").into() - })?; + mgr.load_config_yaml(yaml) + .map_err(|e| -> Box { format!("{e}").into() })?; + mgr.initialize() + .await + .map_err(|e| -> Box { format!("{e}").into() })?; Ok(mgr) } diff --git a/crates/apl-session-valkey/Cargo.toml b/crates/apl-session-valkey/Cargo.toml new file mode 100644 index 0000000..270a7db --- /dev/null +++ b/crates/apl-session-valkey/Cargo.toml @@ -0,0 +1,62 @@ +# Location: ./crates/apl-session-valkey/Cargo.toml +# Copyright 2026 +# SPDX-License-Identifier: Apache-2.0 +# Authors: Fred Araujo +# +# apl-session-valkey — a Valkey-backed `SessionStore` for distributed, +# cross-restart persistence of session security labels. 
+# +# # Dependency discipline +# +# This crate is OPTIONAL and feature-gated into cpex-ffi (`valkey` feature) +# and excluded from the workspace `default-members`, so the Valkey client + +# its TLS/async stack never land in the default FFI artifact or everyday +# `cargo build` (mirrors apl-cedarling). The redis client is pinned with +# `default-features = false` and a rustls TLS path to stay openssl-free, +# matching the `reqwest = { features = ["rustls-tls"] }` discipline in +# apl-identity-jwt / apl-delegator-oauth. + +[package] +name = "apl-session-valkey" +version.workspace = true +edition.workspace = true +license.workspace = true +authors.workspace = true + +[dependencies] +apl-cpex = { path = "../apl-cpex" } +async-trait = { workspace = true } +serde = { workspace = true } +serde_yaml = { workspace = true } +thiserror = { workspace = true } +tracing = { workspace = true } +tokio = { workspace = true } +sha2 = "0.10" +# URL building/parsing so connection credentials are percent-encoded and +# the wire scheme is consistent with the TLS setting (already in the tree +# via redis-rs). +url = "2" +# Async Valkey/Redis client. `default-features = false` drops the command +# groups we don't use (acl/geo/script/streams/json). `tokio-rustls-comp` +# selects the rustls TLS path (tokio-rustls + rustls), keeping the tree +# openssl-free; `connection-manager` adds reconnect-with-backoff. +redis = { version = "1.2", default-features = false, features = [ + "aio", + "tokio-comp", + "tokio-rustls-comp", + "connection-manager", +] } +# External async connection pool over redis-rs. Forward the rustls TLS +# feature explicitly so the rediss:// path is robust even if feature +# unification with the redis dep ever changes. 
+deadpool-redis = { version = "0.23", default-features = false, features = [ + "rt_tokio_1", + "tokio-rustls-comp", +] } + +[dev-dependencies] +tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread"] } +# Spins a real `valkey/valkey` container for integration tests; these are +# `#[ignore]`d by default and run via a dedicated job (see tests/). +testcontainers-modules = { version = "0.13", features = ["valkey"] } +testcontainers = "0.25" diff --git a/crates/apl-session-valkey/src/config.rs b/crates/apl-session-valkey/src/config.rs new file mode 100644 index 0000000..c103c47 --- /dev/null +++ b/crates/apl-session-valkey/src/config.rs @@ -0,0 +1,388 @@ +// Location: ./crates/apl-session-valkey/src/config.rs +// Copyright 2026 +// SPDX-License-Identifier: Apache-2.0 +// Authors: Fred Araujo +// +// Parses and validates the `global.apl.session_store` block for the +// Valkey backend. Deliberately minimal (R11): a single endpoint, TLS, +// auth, key prefix, optional sliding TTL, and fail-closed timeout/retry +// knobs with committed safe defaults. Sentinel/Cluster fields are NOT +// present — they are out of scope and would be dead config surface. + +use serde::Deserialize; + +use crate::error::BuildError; + +/// Default key prefix/namespace for the label keyspace. The `v1` segment +/// lets a future value-schema change bump the namespace cleanly. +fn default_key_prefix() -> String { + "taint:v1".to_string() +} + +// Committed fail-closed defaults (see plan Key Technical Decisions). They +// ship in code so behavior and tests are deterministic; operators tune +// from this baseline. +fn default_connect_timeout_ms() -> u64 { + 250 +} +fn default_command_timeout_ms() -> u64 { + 500 +} + +/// Parsed `global.apl.session_store` config for `kind: valkey`. +/// +/// Unknown keys (including `kind`, consumed by the factory dispatch) are +/// ignored so the same block can carry the discriminator. 
+#[derive(Debug, Clone, Deserialize)]
+pub struct ValkeyConfig {
+    /// Endpoint: a `redis://` / `rediss://` URL or a bare `host:port`.
+    pub endpoint: String,
+
+    /// Whether to use TLS. Implied `true` for a `rediss://` endpoint.
+    /// Required for any non-localhost endpoint (validated).
+    #[serde(default)]
+    pub tls: bool,
+
+    /// Optional ACL username (Valkey 6+ ACLs). Paired with `password`.
+    #[serde(default)]
+    pub username: Option<String>,
+
+    /// Optional auth password / ACL secret. Sourced from config/env by
+    /// the operator; never hard-coded.
+    #[serde(default)]
+    pub password: Option<String>,
+
+    /// Key prefix/namespace for label keys (R9).
+    #[serde(default = "default_key_prefix")]
+    pub key_prefix: String,
+
+    /// Sliding TTL in seconds, refreshed on load and append. `None`
+    /// (default) means no expiry (R7). When set it must be at least 1 and
+    /// fit in an `i64` (validated): `EXPIRE` with a non-positive TTL
+    /// deletes the key instead of expiring it later.
+    #[serde(default)]
+    pub ttl_seconds: Option<u64>,
+
+    /// Declared maximum session-identity lifetime, used only to emit the
+    /// TTL-soundness warning (R17) when `ttl_seconds` is shorter.
+    #[serde(default)]
+    pub max_session_lifetime_seconds: Option<u64>,
+
+    /// Connection acquisition timeout (ms).
+    #[serde(default = "default_connect_timeout_ms")]
+    pub connect_timeout_ms: u64,
+
+    /// Per-command response timeout (ms) — the fail-closed hot-path knob.
+    #[serde(default = "default_command_timeout_ms")]
+    pub command_timeout_ms: u64,
+    // NOTE: bounded retry + circuit-breaker are deliberately NOT implemented
+    // in v0 (deferred follow-up). The store fails closed on the first
+    // backend error, which is safe — it just fails faster. A `max_retries`
+    // knob is intentionally absent rather than present-but-dead, so config
+    // never advertises behavior the code doesn't have.
+}
+
+impl ValkeyConfig {
+    /// Parse from the YAML config block, then validate.
+    ///
+    /// Returns `BuildError::Config` when the block does not deserialize,
+    /// or whatever error `validate` reports for a well-formed but
+    /// unacceptable block.
+    pub fn from_value(value: &serde_yaml::Value) -> Result<Self, BuildError> {
+        let cfg: ValkeyConfig =
+            serde_yaml::from_value(value.clone()).map_err(|e| BuildError::Config(e.to_string()))?;
+        cfg.validate()?;
+        Ok(cfg)
+    }
+
+    /// Enforce the non-negotiable invariants. TLS is mandatory off
+    /// localhost (R10); a `tls: true` + plaintext `redis://` scheme is a
+    /// contradiction (would connect in cleartext); the connection URL
+    /// must build; `ttl_seconds`, when set, must be a positive value that
+    /// fits in an `i64`; the TTL-soundness warning (R17) is emitted here.
+    ///
+    /// All error text routes the endpoint through [`redact_endpoint`] so
+    /// embedded credentials never leak into errors or logs.
+    fn validate(&self) -> Result<(), BuildError> {
+        // A fully-formed plaintext `redis://` endpoint with `tls: true`
+        // is contradictory: tls_enabled() would say "secure" while the
+        // explicit scheme forces cleartext. Reject rather than silently
+        // connecting in the clear.
+        if self.tls && self.endpoint.starts_with("redis://") {
+            return Err(BuildError::Config(format!(
+                "`tls: true` conflicts with the plaintext `redis://` scheme in endpoint '{}'; \
+                 use a `rediss://` URL or a bare host:port",
+                redact_endpoint(&self.endpoint)
+            )));
+        }
+
+        if !self.tls_enabled() && !endpoint_is_localhost(&self.endpoint) {
+            return Err(BuildError::TlsRequired(redact_endpoint(&self.endpoint)));
+        }
+
+        // Credential-consistency checks, rejected loud at config-load rather
+        // than silently mis-connecting on first request.
+        //
+        // 1. A full `redis://`/`rediss://` endpoint carries its own
+        //    credentials; the separate `username`/`password` fields are
+        //    ignored for URL endpoints (connection_url returns early). Setting
+        //    both is ambiguous — force credentials into one place.
+        let endpoint_is_url =
+            self.endpoint.starts_with("redis://") || self.endpoint.starts_with("rediss://");
+        if endpoint_is_url && (self.username.is_some() || self.password.is_some()) {
+            return Err(BuildError::Config(format!(
+                "endpoint '{}' is a full URL; put credentials in the URL userinfo \
+                 (rediss://user:pass@host) or use a bare host:port — the separate \
+                 `username`/`password` fields are ignored for URL endpoints",
+                redact_endpoint(&self.endpoint)
+            )));
+        }
+
+        // 2. A `username` with no `password` would silently connect as the
+        //    default user with the username dropped. Reject the ambiguity.
+        if self.username.is_some() && self.password.is_none() {
+            return Err(BuildError::Config(
+                "`username` is set without a `password`; supply a `password` for the ACL \
+                 user, or remove `username` to connect as the default user"
+                    .to_string(),
+            ));
+        }
+
+        // Build the URL now so a malformed endpoint / unencodable
+        // credential fails at config-load, not on first request.
+        self.connection_url()?;
+
+        // The store hands this value to EXPIRE as a signed integer, and a
+        // zero or negative TTL makes the server delete the key right away.
+        // A `ttl_seconds` of 0, or one too large for an i64 (which would
+        // wrap negative), would therefore erase session labels as soon as
+        // they are written. Refuse both instead of silently dropping taint.
+        if let Some(ttl) = self.ttl_seconds {
+            if ttl == 0 || i64::try_from(ttl).is_err() {
+                return Err(BuildError::Config(format!(
+                    "`ttl_seconds` must be between 1 and {} (got {ttl}); \
+                     omit it to disable expiry",
+                    i64::MAX
+                )));
+            }
+        }
+
+        if let (Some(ttl), Some(life)) = (self.ttl_seconds, self.max_session_lifetime_seconds) {
+            if ttl < life {
+                tracing::warn!(
+                    alarm = "session_store_ttl_unsound",
+                    ttl_seconds = ttl,
+                    max_session_lifetime_seconds = life,
+                    "valkey session_store TTL is shorter than the declared max session lifetime; \
+                     accumulated taint can silently expire (downgrade-by-waiting) — see R8"
+                );
+            }
+        }
+        Ok(())
+    }
+
+    /// TLS is on when explicitly set or implied by a `rediss://` scheme.
+    pub fn tls_enabled(&self) -> bool {
+        self.tls || self.endpoint.starts_with("rediss://")
+    }
+
+    /// Build the `redis`/`rediss` connection URL deadpool consumes.
+    ///
+    /// Credentials are percent-encoded via the `url` crate (never naive
+    /// string interpolation), and the wire scheme always reflects
+    /// [`Self::tls_enabled`] so it cannot disagree with the validated TLS
+    /// intent. A fully-formed endpoint URL is parsed (and trusted for its
+    /// own embedded credentials); a bare `host:port` is assembled with
+    /// the configured scheme and any separate `username`/`password`.
+    pub fn connection_url(&self) -> Result<String, BuildError> {
+        if self.endpoint.starts_with("redis://") || self.endpoint.starts_with("rediss://") {
+            // Validate it parses; trust the operator's embedded scheme +
+            // credentials. (validate() has already rejected the
+            // tls:true + redis:// contradiction.)
+            let url = url::Url::parse(&self.endpoint).map_err(|e| {
+                BuildError::Config(format!(
+                    "invalid endpoint URL '{}': {e}",
+                    redact_endpoint(&self.endpoint)
+                ))
+            })?;
+            return Ok(url.to_string());
+        }
+
+        let scheme = if self.tls_enabled() {
+            "rediss"
+        } else {
+            "redis"
+        };
+        let mut url = url::Url::parse(&format!("{scheme}://{}", self.endpoint)).map_err(|e| {
+            BuildError::Config(format!(
+                "invalid endpoint '{}': {e}",
+                redact_endpoint(&self.endpoint)
+            ))
+        })?;
+        // Apply credentials when either is present. `validate()` guarantees a
+        // `username` is always paired with a `password`; a lone `password`
+        // (default-user AUTH) stays valid and sets an empty username.
+        if self.username.is_some() || self.password.is_some() {
+            // set_username/set_password percent-encode and reject hosts
+            // that cannot carry userinfo (e.g. cannot-be-a-base URLs).
+            url.set_username(self.username.as_deref().unwrap_or(""))
+                .map_err(|_| BuildError::Config("endpoint cannot carry credentials".to_string()))?;
+            if let Some(password) = &self.password {
+                url.set_password(Some(password)).map_err(|_| {
+                    BuildError::Config("endpoint cannot carry credentials".to_string())
+                })?;
+            }
+        }
+        Ok(url.to_string())
+    }
+}
+
+/// Strip any `userinfo` (`user:pass@`) from an endpoint before it appears
+/// in an error message or log line, so credentials are never disclosed.
+fn redact_endpoint(endpoint: &str) -> String {
+    // Everything up to the LAST `@` is masked, even when that `@` sits in a
+    // path or query rather than the userinfo. Over-redacting is the safe
+    // direction for text that ends up in errors and logs.
+    if let Some(scheme_end) = endpoint.find("://") {
+        let (scheme, after) = (&endpoint[..scheme_end], &endpoint[scheme_end + 3..]);
+        if let Some(at) = after.rfind('@') {
+            return format!("{scheme}://***@{}", &after[at + 1..]);
+        }
+        return endpoint.to_string();
+    }
+    // Bare host:port may still carry userinfo if misconfigured.
+    if let Some(at) = endpoint.rfind('@') {
+        return format!("***@{}", &endpoint[at + 1..]);
+    }
+    endpoint.to_string()
+}
+
+/// Localhost check for the TLS-required rule.
+///
+/// The host is taken from the same URL parser `connection_url` uses, with a
+/// bare `host:port` given a `redis://` scheme first. Slicing the string at
+/// the last `@` is not equivalent: an `@` in the path or query
+/// (`redis://remote:6379/0?x=@localhost`) would make a remote endpoint look
+/// like loopback and let it through without TLS.
+///
+/// Returns `true` for `localhost` (ASCII case-insensitive) and for any
+/// loopback IP literal. An endpoint that does not parse is never loopback,
+/// so the caller fails closed.
+fn endpoint_is_localhost(endpoint: &str) -> bool {
+    let has_scheme = endpoint.starts_with("redis://") || endpoint.starts_with("rediss://");
+    let parsed = if has_scheme {
+        url::Url::parse(endpoint)
+    } else {
+        url::Url::parse(&format!("redis://{endpoint}"))
+    };
+    let url = match parsed {
+        Ok(url) => url,
+        Err(_) => return false,
+    };
+    let host = match url.host_str() {
+        // IPv6 literals are serialized in brackets; drop them before parsing.
+        Some(host) => host.trim_start_matches('[').trim_end_matches(']'),
+        None => return false,
+    };
+    host.eq_ignore_ascii_case("localhost")
+        || host
+            .parse::<std::net::IpAddr>()
+            .map(|ip| ip.is_loopback())
+            .unwrap_or(false)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn parse(yaml: &str) -> Result<ValkeyConfig, BuildError> {
+        let v: serde_yaml::Value = serde_yaml::from_str(yaml).unwrap();
+        ValkeyConfig::from_value(&v)
+    }
+
+    #[test]
+    fn localhost_without_tls_is_allowed() {
+        let cfg = parse("kind: valkey\nendpoint: localhost:6379\n").unwrap();
+        assert_eq!(cfg.key_prefix, "taint:v1");
+        assert_eq!(cfg.connect_timeout_ms, 250);
+        assert_eq!(cfg.command_timeout_ms, 500);
+        assert!(cfg
+            .connection_url()
+            .unwrap()
+            .starts_with("redis://localhost:6379"));
+    }
+
+    #[test]
+    fn non_localhost_without_tls_is_rejected() {
+        let err = parse("kind: valkey\nendpoint: valkey.prod.internal:6379\n").unwrap_err();
+        assert!(matches!(err, BuildError::TlsRequired(_)), "got {err:?}");
+    }
+
+    /// Regression: an `@` outside the userinfo must not make a remote host
+    /// pass the loopback check and connect in cleartext.
+    #[test]
+    fn at_sign_outside_userinfo_does_not_spoof_localhost() {
+        let err = parse(
+            "kind: valkey\nendpoint: \"redis://valkey.prod.internal:6379/0?x=@localhost\"\n",
+        )
+        .unwrap_err();
+        assert!(matches!(err, BuildError::TlsRequired(_)), "got {err:?}");
+    }
+
+    #[test]
+    fn non_localhost_with_tls_uses_rediss_scheme() {
+        let cfg = parse("kind: valkey\nendpoint: valkey.prod.internal:6379\ntls: true\n").unwrap();
+        assert!(cfg.tls_enabled());
+        assert!(cfg
+            .connection_url()
+            .unwrap()
+            .starts_with("rediss://valkey.prod.internal:6379"));
+    }
+
+    #[test]
+    fn rediss_scheme_implies_tls() {
+        let cfg = parse("kind: valkey\nendpoint: rediss://valkey.prod.internal:6379\n").unwrap();
+        assert!(cfg.tls_enabled());
+        assert!(cfg.connection_url().unwrap().starts_with("rediss://"));
+    }
+
+    /// Regression for the TLS-bypass finding: `tls: true` with an explicit
+    /// plaintext `redis://` scheme must be rejected, not silently connect
+    /// in the clear.
+    #[test]
+    fn tls_true_with_plaintext_scheme_is_rejected() {
+        let err = parse("kind: valkey\nendpoint: redis://valkey.prod.internal:6379\ntls: true\n")
+            .unwrap_err();
+        assert!(matches!(err, BuildError::Config(_)), "got {err:?}");
+    }
+
+    #[test]
+    fn credentials_are_percent_encoded_in_url() {
+        // A password with URL-significant characters must be encoded, not
+        // interpolated raw (which would corrupt the URL).
+        let cfg = parse(
+            "kind: valkey\nendpoint: valkey.prod.internal:6379\ntls: true\nusername: gw\npassword: \"p@ss:w/rd\"\n",
+        )
+        .unwrap();
+        let url = cfg.connection_url().unwrap();
+        assert!(url.starts_with("rediss://gw:"), "url: {url}");
+        assert!(url.contains("@valkey.prod.internal:6379"), "url: {url}");
+        // The raw special chars must NOT appear unencoded in the userinfo.
+        assert!(
+            url.contains("p%40ss"),
+            "password '@' must be encoded: {url}"
+        );
+    }
+
+    /// Nit 1: a `username` with no `password` is ambiguous (would silently
+    /// connect as the default user). Reject it at config-load.
+    #[test]
+    fn username_without_password_is_rejected() {
+        let err = parse(
+            "kind: valkey\nendpoint: valkey.prod.internal:6379\ntls: true\nusername: gateway\n",
+        )
+        .unwrap_err();
+        assert!(matches!(err, BuildError::Config(_)), "got {err:?}");
+    }
+
+    /// Nit 2: a full URL endpoint carries its own credentials; separate
+    /// `username`/`password` fields are ignored, so supplying both is rejected.
+    #[test]
+    fn url_endpoint_with_separate_credentials_is_rejected() {
+        let err = parse(
+            "kind: valkey\nendpoint: rediss://valkey.prod.internal:6379\nusername: gw\npassword: s3cret\n",
+        )
+        .unwrap_err();
+        assert!(matches!(err, BuildError::Config(_)), "got {err:?}");
+    }
+
+    /// A lone `password` (no username) is the default-user AUTH case and stays
+    /// valid, producing `redis://:pass@host` (empty username).
+    #[test]
+    fn password_without_username_uses_default_user() {
+        let cfg = parse("kind: valkey\nendpoint: localhost:6379\npassword: s3cret\n").unwrap();
+        let url = cfg.connection_url().unwrap();
+        assert!(url.starts_with("redis://:s3cret@localhost:6379"), "url: {url}");
+    }
+
+    #[test]
+    fn missing_endpoint_is_config_error() {
+        let err = parse("kind: valkey\n").unwrap_err();
+        assert!(matches!(err, BuildError::Config(_)), "got {err:?}");
+    }
+
+    #[test]
+    fn ipv6_loopback_without_tls_is_allowed() {
+        let cfg = parse("kind: valkey\nendpoint: \"[::1]:6379\"\n").unwrap();
+        assert!(!cfg.tls_enabled());
+    }
+
+    #[test]
+    fn redact_endpoint_strips_userinfo() {
+        assert_eq!(
+            redact_endpoint("rediss://user:secret@host:6379"),
+            "rediss://***@host:6379"
+        );
+        assert_eq!(redact_endpoint("host:6379"), "host:6379");
+    }
+
+    /// Credentials must never leak into the TLS-required error.
+    #[test]
+    fn tls_required_error_redacts_credentials() {
+        // rediss-less, non-localhost, with embedded creds, tls off → error.
+        let err =
+            parse("kind: valkey\nendpoint: redis://user:topsecret@prod.host:6379\n").unwrap_err();
+        let msg = format!("{err}");
+        assert!(
+            !msg.contains("topsecret"),
+            "error leaked credentials: {msg}"
+        );
+    }
+}
diff --git a/crates/apl-session-valkey/src/connection.rs b/crates/apl-session-valkey/src/connection.rs
new file mode 100644
index 0000000..526a484
--- /dev/null
+++ b/crates/apl-session-valkey/src/connection.rs
@@ -0,0 +1,30 @@
+// Location: ./crates/apl-session-valkey/src/connection.rs
+// Copyright 2026
+// SPDX-License-Identifier: Apache-2.0
+// Authors: Fred Araujo
+//
+// Internal connection layer (R14): builds and holds the deadpool-redis
+// pool for the Valkey backend. Kept private to this crate — it is NOT a
+// public reusable API. When a second consumer (the planned OAuth token
+// cache) is actually scheduled, extract a shared layer then
+// (refactor-then-reuse), shaped by two real consumers.
+
+use deadpool_redis::{Config as PoolConfig, Pool, Runtime};
+
+use crate::config::ValkeyConfig;
+use crate::error::BuildError;
+
+/// Build the connection pool from validated config. The pool is created
+/// lazily — `create_pool` does not dial Valkey, so a bad endpoint surfaces
+/// on first use (where it correctly fails the request closed) rather than
+/// blocking `load_config_yaml`.
+pub(crate) fn build_pool(cfg: &ValkeyConfig) -> Result<Pool, BuildError> {
+    let url = cfg.connection_url()?;
+    let pool_cfg = PoolConfig::from_url(url);
+    // Note: the pool-create error is intentionally not interpolated with
+    // the URL — that string carries credentials. `connection_url()` has
+    // already validated the URL parses, so failures here are rare.
+    pool_cfg
+        .create_pool(Some(Runtime::Tokio1))
+        .map_err(|e| BuildError::Pool(e.to_string()))
+}
diff --git a/crates/apl-session-valkey/src/error.rs b/crates/apl-session-valkey/src/error.rs
new file mode 100644
index 0000000..38169e9
--- /dev/null
+++ b/crates/apl-session-valkey/src/error.rs
@@ -0,0 +1,31 @@
+// Location: ./crates/apl-session-valkey/src/error.rs
+// Copyright 2026
+// SPDX-License-Identifier: Apache-2.0
+// Authors: Fred Araujo
+//
+// Construction-time errors for the Valkey session-store backend. These
+// surface when the `global.apl.session_store` config block is malformed
+// or the connection pool cannot be built — i.e. at `load_config_yaml`
+// time, NOT on the request hot path. Request-time failures flow through
+// `apl_cpex::SessionStoreError` (the trait's return type) so callers can
+// fail closed.
+
+/// Error returned while building a `ValkeySessionStore` from config.
+#[derive(Debug, thiserror::Error)]
+pub enum BuildError {
+    /// The config block was structurally invalid (missing or wrongly-typed
+    /// fields).
+    #[error("invalid valkey session_store config: {0}")]
+    Config(String),
+
+    /// TLS is mandatory for any non-localhost endpoint (R10): session
+    /// security labels must not transit a network segment in plaintext.
+    #[error(
+        "valkey session_store requires TLS for non-localhost endpoint '{0}' \
+         — set `tls: true` or use a `rediss://` URL"
+    )]
+    TlsRequired(String),
+
+    /// The connection pool could not be constructed (bad URL, etc.).
+    #[error("failed to build valkey connection pool: {0}")]
+    Pool(String),
+}
diff --git a/crates/apl-session-valkey/src/factory.rs b/crates/apl-session-valkey/src/factory.rs
new file mode 100644
index 0000000..576918c
--- /dev/null
+++ b/crates/apl-session-valkey/src/factory.rs
@@ -0,0 +1,45 @@
+// Location: ./crates/apl-session-valkey/src/factory.rs
+// Copyright 2026
+// SPDX-License-Identifier: Apache-2.0
+// Authors: Fred Araujo
+//
+// `ValkeySessionStoreFactory` — the `SessionStoreFactory` that lets the
+// apl-cpex visitor build a `ValkeySessionStore` from a
+// `global.apl.session_store: { kind: valkey, ... }` block. Mirrors the
+// PDP factories (CelPdpFactory, CedarDirectPdpFactory).
+
+use std::sync::Arc;
+
+use apl_cpex::{SessionStore, SessionStoreFactory};
+
+use crate::config::ValkeyConfig;
+use crate::store::ValkeySessionStore;
+
+/// The `kind:` discriminator this factory builds. Part of the public
+/// surface — it is the string operators write in their config.
+pub const KIND: &str = "valkey";
+
+/// Factory the host registers via `AplOptions.session_store_factories`.
+#[derive(Default)] +pub struct ValkeySessionStoreFactory; + +impl ValkeySessionStoreFactory { + pub fn new() -> Self { + Self + } +} + +impl SessionStoreFactory for ValkeySessionStoreFactory { + fn kind(&self) -> &str { + KIND + } + + fn build( + &self, + config: &serde_yaml::Value, + ) -> Result, Box> { + let cfg = ValkeyConfig::from_value(config)?; + let store = ValkeySessionStore::from_config(&cfg)?; + Ok(Arc::new(store)) + } +} diff --git a/crates/apl-session-valkey/src/lib.rs b/crates/apl-session-valkey/src/lib.rs new file mode 100644 index 0000000..5449f73 --- /dev/null +++ b/crates/apl-session-valkey/src/lib.rs @@ -0,0 +1,44 @@ +// Location: ./crates/apl-session-valkey/src/lib.rs +// Copyright 2026 +// SPDX-License-Identifier: Apache-2.0 +// Authors: Fred Araujo +// +// apl-session-valkey — a Valkey-backed `apl_cpex::SessionStore` for +// distributed, cross-restart persistence of session security labels. +// +// # Where this sits +// +// apl-cpex (SessionStore trait, SessionStoreFactory) +// ▲ +// │ implements +// apl-session-valkey ──uses──▶ redis-rs + deadpool-redis (rustls) +// +// The host registers `ValkeySessionStoreFactory` via +// `AplOptions.session_store_factories`; a `global.apl.session_store: +// { kind: valkey, ... }` block then selects it during config load. When +// no such block is present, apl-cpex keeps its default in-process +// `MemorySessionStore`, so this crate is entirely opt-in. +// +// # Design invariants (carried from the requirements/plan) +// +// - Fail-closed: any backend error (unreachable, timeout, undecodable) +// becomes `SessionStoreError`; only a confirmed key-miss is empty. +// - Atomic union: `append_labels` is a single server-side `SADD`. +// - Primary-only: a single endpoint, no replica read-splitting. +// - TLS required off-localhost; `noeviction` is an operator runbook +// concern the client can only warn about. 
+// +// The connection layer is kept internal (no public reusable API): the +// planned OAuth token cache is the trigger to extract a shared layer +// later, shaped by two real consumers. + +mod config; +mod connection; +mod error; +mod factory; +mod store; + +pub use config::ValkeyConfig; +pub use error::BuildError; +pub use factory::{ValkeySessionStoreFactory, KIND}; +pub use store::ValkeySessionStore; diff --git a/crates/apl-session-valkey/src/store.rs b/crates/apl-session-valkey/src/store.rs new file mode 100644 index 0000000..9f80d4e --- /dev/null +++ b/crates/apl-session-valkey/src/store.rs @@ -0,0 +1,167 @@ +// Location: ./crates/apl-session-valkey/src/store.rs +// Copyright 2026 +// SPDX-License-Identifier: Apache-2.0 +// Authors: Fred Araujo +// +// `ValkeySessionStore` — the Valkey-backed `SessionStore`. Labels live in +// a Redis SET per session so `append_labels` is a single atomic +// server-side union (`SADD`), never a client-side read-modify-write that +// would lose labels under concurrent cross-node appends (R16). +// +// # Fail-closed mapping (R5/R15) +// +// - `SMEMBERS` on a missing key returns an empty set → `Ok(empty)` +// (unknown session, R15). It is NOT an error. +// - connection/timeout/protocol/decode failures → `Err(Backend)` so the +// caller fails the request closed. +// +// # Sliding TTL (R7) +// +// `append_labels` issues `SADD` + `EXPIRE` in one atomic pipeline. +// `load_labels` refreshes the TTL fail-open: the read already succeeded, +// so a refresh failure is alarmed but the labels are still returned. + +use std::fmt::Write as _; +use std::time::Duration; + +use apl_cpex::{SessionStore, SessionStoreError}; +use async_trait::async_trait; +use deadpool_redis::{Connection, Pool}; +use redis::AsyncCommands; +use sha2::{Digest, Sha256}; + +use crate::config::ValkeyConfig; +use crate::connection::build_pool; +use crate::error::BuildError; + +/// Valkey-backed session label store. 
+pub struct ValkeySessionStore {
+    pool: Pool,
+    key_prefix: String,
+    ttl_seconds: Option<u64>,
+    connect_timeout: Duration,
+    command_timeout: Duration,
+}
+
+impl ValkeySessionStore {
+    /// Construct the store from an already-validated config.
+    ///
+    /// Only the pool construction can fail. The pool is lazy, so nothing
+    /// is dialled here; an unreachable endpoint shows up on first use and
+    /// fails that request closed.
+    pub fn from_config(cfg: &ValkeyConfig) -> Result<Self, BuildError> {
+        let pool = build_pool(cfg)?;
+        let connect_timeout = Duration::from_millis(cfg.connect_timeout_ms);
+        let command_timeout = Duration::from_millis(cfg.command_timeout_ms);
+        Ok(Self {
+            pool,
+            key_prefix: cfg.key_prefix.clone(),
+            ttl_seconds: cfg.ttl_seconds,
+            connect_timeout,
+            command_timeout,
+        })
+    }
+
+    /// Key schema: `<key_prefix>:<sha256-hex(session_id)>`. Using the
+    /// full-width digest keeps the Valkey keyspace collision-free and
+    /// keeps raw session ids out of it.
+    fn key(&self, session_id: &str) -> String {
+        let digest = Sha256::digest(session_id.as_bytes());
+        // Lowercase hex, two characters per digest byte.
+        let hex = digest
+            .iter()
+            .fold(String::with_capacity(digest.len() * 2), |mut out, byte| {
+                let _ = write!(out, "{byte:02x}");
+                out
+            });
+        format!("{}:{}", self.key_prefix, hex)
+    }
+
+    /// Check a connection out of the pool within `connect_timeout`. This
+    /// is the fail-fast bound for a dead or slow endpoint and is separate
+    /// from the per-command timeout the callers apply afterwards.
+    async fn conn(&self) -> Result<Connection, SessionStoreError> {
+        let acquired = tokio::time::timeout(self.connect_timeout, self.pool.get())
+            .await
+            .map_err(|_| {
+                SessionStoreError::Backend("valkey connection acquire timed out".to_string())
+            })?;
+        acquired.map_err(backend)
+    }
+}
+
+/// Map any backend failure to the fail-closed `SessionStoreError`.
+fn backend(e: impl std::fmt::Display) -> SessionStoreError {
+    SessionStoreError::Backend(e.to_string())
+}
+
+/// Convert the configured TTL into the signed seconds `EXPIRE` takes,
+/// saturating at `i64::MAX` instead of wrapping. A plain `as i64` cast of
+/// an oversized value yields a negative number, and `EXPIRE` with a
+/// non-positive timeout deletes the key — silent taint loss.
+fn expire_seconds<T: std::convert::TryInto<i64>>(ttl: T) -> i64 {
+    std::convert::TryInto::try_into(ttl).unwrap_or(i64::MAX)
+}
+
+#[async_trait]
+impl SessionStore for ValkeySessionStore {
+    /// Return the labels stored for `session_id`.
+    ///
+    /// An unknown session yields `Ok(empty)`; a backend error or a command
+    /// timeout yields `Err(SessionStoreError::Backend)`. When a TTL is
+    /// configured the key's expiry is refreshed afterwards; a refresh
+    /// failure (error *or* timeout) is alarmed but does not fail the load.
+    async fn load_labels(&self, session_id: &str) -> Result<Vec<String>, SessionStoreError> {
+        let key = self.key(session_id);
+        let mut conn = self.conn().await?;
+
+        // SMEMBERS on a missing key returns an empty set (Ok), so an
+        // unknown session naturally maps to Ok(empty) (R15). Only a real
+        // backend failure becomes Err (R5).
+        let labels: Vec<String> =
+            match tokio::time::timeout(self.command_timeout, conn.smembers(&key)).await {
+                Ok(res) => res.map_err(backend)?,
+                Err(_) => {
+                    return Err(SessionStoreError::Backend(
+                        "valkey SMEMBERS timed out".to_string(),
+                    ))
+                }
+            };
+
+        // Sliding-TTL refresh is fail-open for the read: the labels were
+        // read successfully, so a refresh failure is alarmed, not failed
+        // closed (R7). A persistently-failing refresh risks silent key
+        // expiry across requests — see the operator runbook.
+        if let Some(ttl) = self.ttl_seconds {
+            // Both failure modes are folded into `Err` so the alarm below
+            // fires for a timeout as well as for a command error. Mapping
+            // a timeout to `Ok(false)` would skip the alarm entirely.
+            let refresh: Result<bool, String> = match tokio::time::timeout(
+                self.command_timeout,
+                conn.expire(&key, expire_seconds(ttl)),
+            )
+            .await
+            {
+                Ok(res) => res.map_err(|e| e.to_string()),
+                Err(_) => Err("valkey EXPIRE timed out".to_string()),
+            };
+            if let Err(e) = refresh {
+                tracing::warn!(
+                    alarm = "session_store_ttl_refresh_failed",
+                    error = %e,
+                    "valkey TTL refresh on load failed; returning read labels (fail-open)"
+                );
+            }
+        }
+
+        Ok(labels)
+    }
+
+    /// Union `labels` into the set stored for `session_id`.
+    ///
+    /// An empty `labels` slice is a no-op that never touches Valkey. Any
+    /// backend error or pipeline timeout yields
+    /// `Err(SessionStoreError::Backend)`.
+    async fn append_labels(
+        &self,
+        session_id: &str,
+        labels: &[String],
+    ) -> Result<(), SessionStoreError> {
+        if labels.is_empty() {
+            return Ok(());
+        }
+        let key = self.key(session_id);
+        let mut conn = self.conn().await?;
+
+        // Atomic server-side union + optional TTL refresh in one round
+        // trip (MULTI/EXEC). SADD is a commutative merge, so concurrent
+        // cross-node appends never lose labels (R16).
+        let mut pipe = redis::pipe();
+        pipe.atomic();
+        pipe.sadd(&key, labels).ignore();
+        if let Some(ttl) = self.ttl_seconds {
+            pipe.expire(&key, expire_seconds(ttl)).ignore();
+        }
+
+        match tokio::time::timeout(self.command_timeout, pipe.query_async::<()>(&mut conn)).await {
+            Ok(res) => res.map_err(backend)?,
+            Err(_) => {
+                return Err(SessionStoreError::Backend(
+                    "valkey append (SADD+EXPIRE) timed out".to_string(),
+                ))
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/crates/apl-session-valkey/tests/valkey_store_integration.rs b/crates/apl-session-valkey/tests/valkey_store_integration.rs
new file mode 100644
index 0000000..f3e2286
--- /dev/null
+++ b/crates/apl-session-valkey/tests/valkey_store_integration.rs
@@ -0,0 +1,222 @@
+// Location: ./crates/apl-session-valkey/tests/valkey_store_integration.rs
+// Copyright 2026
+// SPDX-License-Identifier: Apache-2.0
+// Authors: Fred Araujo
+//
+// Integration tests for ValkeySessionStore against a real `valkey/valkey`
+// container (testcontainers). These are `#[ignore]`d by default so unit
+// runs don't require Docker; run them with:
+//
+// cargo test -p apl-session-valkey -- --ignored
+//
+// Skip discipline (learning from PR #67's silent no-op tests):
+// - If `VALKEY_TEST_URL` is set, run against that endpoint (a CI service
+// container or a locally-run `valkey/valkey`) — no testcontainers.
+// - Else start a testcontainers `valkey/valkey`. If that can't start AND
+// `REQUIRE_VALKEY_TESTS=1` is set (CI), that is a hard failure (panic)
+// — the test genuinely ran.
+// - Otherwise (local, no Docker) the helper prints a loud SKIPPED line
+// and the test returns without asserting. The visible line is what
+// stops a silent green.
+
+use apl_cpex::{SessionStore, SessionStoreError};
+use apl_session_valkey::{ValkeyConfig, ValkeySessionStore};
+use sha2::{Digest, Sha256};
+use testcontainers_modules::testcontainers::runners::AsyncRunner;
+use testcontainers_modules::testcontainers::ContainerAsync;
+use testcontainers_modules::valkey::{Valkey, VALKEY_PORT};
+
+/// A Valkey endpoint to test against, plus the container handle when one
+/// was started (kept alive for the test's duration).
+struct Target {
+    /// Connection URL handed to the store and to `raw_conn`.
+    url: String,
+    /// `Some` only when this helper started the container itself; holding
+    /// it keeps the container alive until the `Target` is dropped.
+    _container: Option<ContainerAsync<Valkey>>,
+}
+
+/// Resolve a Valkey target, or skip loudly when none is available.
+/// Returns `None` to signal the caller should return early (skip).
+///
+/// Resolution order: `VALKEY_TEST_URL` if set, otherwise a freshly started
+/// testcontainers Valkey. If the container cannot start, this panics when
+/// `REQUIRE_VALKEY_TESTS=1` and otherwise prints a SKIPPED line.
+async fn valkey_target() -> Option<Target> {
+    if let Ok(url) = std::env::var("VALKEY_TEST_URL") {
+        return Some(Target {
+            url,
+            _container: None,
+        });
+    }
+    match Valkey::default().start().await {
+        Ok(node) => {
+            let host = node.get_host().await.expect("container host");
+            let port = node
+                .get_host_port_ipv4(VALKEY_PORT)
+                .await
+                .expect("container port");
+            Some(Target {
+                url: format!("redis://{host}:{port}"),
+                _container: Some(node),
+            })
+        }
+        Err(e) => {
+            if std::env::var("REQUIRE_VALKEY_TESTS").as_deref() == Ok("1") {
+                panic!("REQUIRE_VALKEY_TESTS=1 but no Valkey available: {e} (set VALKEY_TEST_URL or start Docker)");
+            }
+            eprintln!(
+                "SKIPPED: no Valkey available ({e}); set VALKEY_TEST_URL or REQUIRE_VALKEY_TESTS=1"
+            );
+            None
+        }
+    }
+}
+
+/// Build a store pointed at the target.
+///
+/// `ttl_seconds` is written into the YAML config only when it is `Some`,
+/// so `None` exercises the store's no-expiry default.
+fn store_for(target: &Target, ttl_seconds: Option<u64>) -> ValkeySessionStore {
+    let mut yaml = format!("kind: valkey\nendpoint: {}\n", target.url);
+    if let Some(ttl) = ttl_seconds {
+        yaml.push_str(&format!("ttl_seconds: {ttl}\n"));
+    }
+    let value: serde_yaml::Value = serde_yaml::from_str(&yaml).unwrap();
+    let cfg = ValkeyConfig::from_value(&value).expect("valid config");
+    ValkeySessionStore::from_config(&cfg).expect("build store")
+}
+
+/// Raw connection for white-box assertions (TTL, seeding a wrong-typed key).
+async fn raw_conn(target: &Target) -> redis::aio::MultiplexedConnection {
+    let client = redis::Client::open(target.url.clone()).unwrap();
+    client.get_multiplexed_async_connection().await.unwrap()
+}
+
+/// Mirror of the store's key derivation, so white-box tests can address
+/// the exact key the store writes (and so the schema is written down in
+/// one more place).
+fn store_key(session_id: &str) -> String {
+    // Lowercase hex of the SHA-256 digest, two characters per byte.
+    let hex = Sha256::digest(session_id.as_bytes())
+        .iter()
+        .fold(String::new(), |mut acc, b| {
+            acc.push_str(&format!("{b:02x}"));
+            acc
+        });
+    format!("taint:v1:{hex}")
+}
+
+/// AE4 / R16: two "nodes" (independent store instances sharing one
+/// Valkey) append concurrently; nothing is lost and a third instance
+/// reads back the full union.
+#[tokio::test]
+#[ignore]
+async fn cross_node_concurrent_append_unions() {
+    let target = match valkey_target().await {
+        Some(target) => target,
+        None => return,
+    };
+    let sid = "sess-union";
+    let (writer_a, writer_b) = (store_for(&target, None), store_for(&target, None));
+    let (pii, internal) = (vec!["PII".to_string()], vec!["INTERNAL".to_string()]);
+
+    // Both appends are polled together so they overlap on the server.
+    let (outcome_a, outcome_b) = tokio::join!(
+        writer_a.append_labels(sid, &pii),
+        writer_b.append_labels(sid, &internal),
+    );
+    outcome_a.expect("node A append");
+    outcome_b.expect("node B append");
+
+    // Sort before comparing: the label order of the read is not relied on.
+    let mut seen = store_for(&target, None)
+        .load_labels(sid)
+        .await
+        .expect("load");
+    seen.sort();
+    assert_eq!(seen, vec!["INTERNAL".to_string(), "PII".to_string()]);
+}
+
+/// R15: a session that was never written is a confirmed key-miss and
+/// must come back as Ok(empty) rather than Err.
+#[tokio::test]
+#[ignore]
+async fn unknown_session_returns_empty_ok() {
+    let target = match valkey_target().await {
+        Some(target) => target,
+        None => return,
+    };
+    let outcome = store_for(&target, None).load_labels("never-written").await;
+    let labels = outcome.expect("unknown session must be Ok(empty), not Err");
+    assert!(labels.is_empty());
+}
+
+/// R5: a reachable but undecodable reply (key holds a string, not a SET)
+/// fails closed (Err) rather than returning Ok(empty).
+#[tokio::test]
+#[ignore]
+async fn wrongtype_reply_fails_closed() {
+    let Some(target) = valkey_target().await else {
+        return;
+    };
+    let store = store_for(&target, None);
+
+    // Seed the exact key as a plain string so SMEMBERS returns WRONGTYPE.
+    let mut conn = raw_conn(&target).await;
+    let sid = "sess-wrongtype";
+    let _: () = redis::cmd("SET")
+        .arg(store_key(sid))
+        .arg("not-a-set")
+        .query_async(&mut conn)
+        .await
+        .unwrap();
+
+    // The store must surface the server error, not swallow it as "no labels".
+    let result = store.load_labels(sid).await;
+    assert!(
+        matches!(result, Err(SessionStoreError::Backend(_))),
+        "WRONGTYPE must fail closed, got {result:?}"
+    );
+}
+
+/// R5: an unreachable endpoint fails closed. `load_labels` fails while
+/// acquiring a connection, which the store bounds by its *connect*
+/// timeout; no command is ever sent, so the `command_timeout_ms` set
+/// below is not the bound that applies here. No container needed, but
+/// kept with the suite.
+// NOTE(review): `connect_timeout_ms` is not set in the YAML, so presumably
+// the config default applies — confirm against `ValkeyConfig`.
+#[tokio::test]
+#[ignore]
+async fn unreachable_endpoint_fails_closed() {
+    // Port 1 is not listening; localhost so TLS is not required.
+    let value: serde_yaml::Value =
+        serde_yaml::from_str("kind: valkey\nendpoint: 127.0.0.1:1\ncommand_timeout_ms: 300\n")
+            .unwrap();
+    let cfg = ValkeyConfig::from_value(&value).unwrap();
+    let store = ValkeySessionStore::from_config(&cfg).unwrap();
+
+    let result = store.load_labels("sess-x").await;
+    assert!(
+        matches!(result, Err(SessionStoreError::Backend(_))),
+        "unreachable endpoint must fail closed, got {result:?}"
+    );
+}
+
+/// AE2 / R7: a configured TTL is set on append and refreshed on load.
+#[tokio::test]
+#[ignore]
+async fn ttl_set_on_append_and_refreshed_on_load() {
+    let Some(target) = valkey_target().await else {
+        return;
+    };
+    let store = store_for(&target, Some(100));
+    let sid = "sess-ttl";
+    store
+        .append_labels(sid, &["PII".to_string()])
+        .await
+        .expect("append");
+
+    let mut conn = raw_conn(&target).await;
+    let ttl_after_append: i64 = redis::cmd("TTL")
+        .arg(store_key(sid))
+        .query_async(&mut conn)
+        .await
+        .unwrap();
+    assert!(
+        ttl_after_append > 0 && ttl_after_append <= 100,
+        "append should set a positive TTL, got {ttl_after_append}"
+    );
+
+    // Shrink the remaining TTL out-of-band. Without this, the TTL left by
+    // the append is still ~100s, so "positive after load" would pass even
+    // if the load never issued a refresh.
+    let shrunk: bool = redis::cmd("EXPIRE")
+        .arg(store_key(sid))
+        .arg(10)
+        .query_async(&mut conn)
+        .await
+        .unwrap();
+    assert!(shrunk, "test setup: EXPIRE must apply to the existing key");
+
+    // A load refreshes the sliding TTL back toward the configured window.
+    let _ = store.load_labels(sid).await.expect("load");
+    let ttl_after_load: i64 = redis::cmd("TTL")
+        .arg(store_key(sid))
+        .query_async(&mut conn)
+        .await
+        .unwrap();
+    assert!(
+        ttl_after_load > 10 && ttl_after_load <= 100,
+        "load should refresh the TTL above the shrunk 10s toward the 100s window, got {ttl_after_load}"
+    );
+}
diff --git a/crates/cpex-ffi/Cargo.toml b/crates/cpex-ffi/Cargo.toml index 8adcb8c..d77180f 100644 --- a/crates/cpex-ffi/Cargo.toml +++ b/crates/cpex-ffi/Cargo.toml @@ -33,6 +33,11 @@ apl-pdp-cedar-direct = { path = "../apl-pdp-cedar-direct" } # Heavy (~200 transitive deps via the Cedarling git dep); kept out of the # default `.a` and behind the `cedarling` feature. apl-cedarling = { path = "../apl-cedarling", optional = true } +# Valkey-backed SessionStore (redis client + rustls TLS stack). Optional +# and behind the `valkey` feature so the default `.a` artifact size is +# unaffected; default-members exclusion alone does NOT keep its object +# code out of a `-p cpex-ffi` build — the feature gate does. +apl-session-valkey = { path = "../apl-session-valkey", optional = true } tokio = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } @@ -45,6 +50,9 @@ default = [] # Opt-in Cedarling-backed identity + PDP. Build with # `cargo build -p cpex-ffi --features cedarling`. 
cedarling = ["dep:apl-cedarling"] +# Opt-in Valkey-backed SessionStore. Build with +# `cargo build -p cpex-ffi --features valkey`. +valkey = ["dep:apl-session-valkey"] [dev-dependencies] async-trait = { workspace = true } diff --git a/crates/cpex-ffi/src/apl.rs b/crates/cpex-ffi/src/apl.rs index d930046..cfa1d07 100644 --- a/crates/cpex-ffi/src/apl.rs +++ b/crates/cpex-ffi/src/apl.rs @@ -85,8 +85,19 @@ pub unsafe extern "C" fn cpex_apl_install(mgr: *const CpexManagerInner) -> c_int // default. The visitor keeps a Weak (see // CpexManagerInner) that upgrades during load_config_yaml. let mut opts = apl_cpex::AplOptions::in_process(); - opts.pdp_factories = - vec![Arc::new(apl_pdp_cedar_direct::CedarDirectPdpFactory::new())]; + opts.pdp_factories = vec![Arc::new(apl_pdp_cedar_direct::CedarDirectPdpFactory::new())]; + + // With the `valkey` cargo feature, register the Valkey + // SessionStore factory so a `global.apl.session_store: + // { kind: valkey, ... }` config block selects it. Without the + // feature, the default in-process MemorySessionStore stays active + // and no Valkey object code is linked. + #[cfg(feature = "valkey")] + { + opts.session_store_factories = vec![Arc::new( + apl_session_valkey::ValkeySessionStoreFactory::new(), + )]; + } apl_cpex::register_apl(&inner.manager, opts); })); diff --git a/deploy/valkey-compose.yml b/deploy/valkey-compose.yml new file mode 100644 index 0000000..8aab271 --- /dev/null +++ b/deploy/valkey-compose.yml @@ -0,0 +1,36 @@ +# Location: ./deploy/valkey-compose.yml +# Copyright 2026 +# SPDX-License-Identifier: Apache-2.0 +# +# Local development / integration Valkey for the apl-session-valkey +# backend. Brings up a single Valkey primary configured the way the +# security model requires (see docs/operations/valkey-session-store.md): +# +# - maxmemory-policy noeviction: a full instance fails writes closed +# rather than silently evicting accumulated taint labels (R9). 
+# +# Usage: +# docker compose -f deploy/valkey-compose.yml up -d +# VALKEY_TEST_URL=redis://127.0.0.1:6379 \ +# cargo test -p apl-session-valkey --test valkey_store_integration -- --ignored +# +# This is a DEV/TEST topology only — no TLS, no ACL. Production deployments +# must add TLS (mTLS recommended), a least-privilege ACL, and HA via a +# fronting endpoint. See the operator runbook. + +services: + valkey: + image: valkey/valkey:8 + command: + - valkey-server + - --maxmemory + - 256mb + - --maxmemory-policy + - noeviction + ports: + - "6379:6379" + healthcheck: + test: ["CMD", "valkey-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 diff --git a/docs/brainstorms/valkey-session-store-requirements.md b/docs/brainstorms/valkey-session-store-requirements.md new file mode 100644 index 0000000..e0934c7 --- /dev/null +++ b/docs/brainstorms/valkey-session-store-requirements.md @@ -0,0 +1,158 @@ +--- +date: 2026-06-17 +topic: valkey-session-store +--- + +# Valkey-backed Session Store + +## Summary + +Add a config-selectable, Valkey-backed `SessionStore` alongside the in-process `MemorySessionStore`, so session security labels persist across application restarts and are shared across gateway nodes. It is **fail-closed**, serves **primary-only reads**, and supports an optional **sliding TTL** that is sound only when set ≥ the gateway's session-identity lifetime. To make fail-closed possible, the `SessionStore` trait gains an error channel. + +--- + +## Problem Frame + +CPEX embeds in an AI gateway that mediates A2A / MCP interactions. The only session-scoped state today is `extensions.security.labels` — a monotonic accumulation of taint/security labels per `session_id` that feed information-flow authorization decisions. The single built-in backend, `MemorySessionStore` (`crates/apl-cpex/src/session_store.rs`), holds this in a process-local `HashMap`. + +Two deployment realities break that model. 
When a gateway process restarts, every session's accumulated taint is lost — a session that was exposed to PII before the restart looks clean afterward. And in a multi-node deployment, a session pinned to node A carries no taint when its next request lands on node B, because the nodes share no state. In both cases the failure is silent and security-relevant: labels that should constrain a downstream operation simply aren't there. The store's own design comments already anticipate a distributed backend ("Redis or DynamoDB for distributed ones"), but none exists, and the FFI host currently hardcodes the memory store with no way to swap it. That comment predates this work; the design here is Valkey-specific (the config `kind:` is `valkey`), and broader Redis-compatibility or multi-store support is not a goal. + +--- + +## Actors + +- A1. Gateway node (CPEX-embedded): hydrates session labels at request start and appends newly-accumulated labels at request end. Multiple nodes share one store. +- A2. Operator: provisions and manages Valkey, supplies the YAML config (endpoint, TLS, auth, TTL, key prefix), and owns capacity planning and the eviction policy. +- A3. Valkey instance/endpoint: the shared, persistent KV backing the session labels. + +--- + +## Key Flows + +- F1. Cross-node / cross-restart label propagation + - **Trigger:** A session request arrives at any gateway node. + - **Actors:** A1, A3 + - **Steps:** Node loads the session's labels from the Valkey primary → evaluates policy → appends any newly-accumulated labels back (union), refreshing TTL if configured. A later request for the same session on a different node (or after a restart) loads the same unioned labels. + - **Outcome:** Accumulated taint is consistent across nodes and survives restarts. + - **Covered by:** R1, R6, R7, R15 + +- F2. Valkey store error (fail-closed) + - **Trigger:** A load or append errors (unreachable, timeout, or a reachable-but-invalid response). 
+ - **Actors:** A1, A3 + - **Steps:** The store returns an error rather than empty labels or a dropped append. A load error fails the request closed before any decision is made. An append error also fails the request closed: `continue_processing` is computed after `persist_session` in `route_handler.rs`, so the error flips the outcome to Deny — in the Pre phase this blocks the mediated action before it is externalized (write-ahead); in the Post phase it blocks the tainted response from returning. + - **Outcome:** No silent taint loss; a store error degrades to denial, never to under-labeling. During a full outage this is self-covering — the next request's load also fails closed, so a lost append cannot cause downstream under-labeling. The residual case (append fails while reads still succeed) is alarmed. + - **Covered by:** R4, R5, R18 + +```mermaid +sequenceDiagram + participant N1 as Gateway node 1 + participant V as Valkey (primary) + participant N2 as Gateway node 2 + N1->>V: load_labels(sess) → {} + N1->>V: append_labels(sess, [PII]) (refresh TTL) + N2->>V: load_labels(sess) → {PII} + Note over N2,V: cross-node read-your-writes + N2--xV: append_labels(sess, [INTERNAL]) (unreachable) + N2-->>N2: error → request fails closed +``` + +--- + +## Requirements + +**Backend and selection** +- R1. Provide a Valkey-backed `SessionStore` implementing the existing trait surface (`load_labels` / `append_labels`), usable alongside `MemorySessionStore`. +- R2. The backend is selectable through the unified YAML config via a `SessionStore` factory/registry that mirrors the existing `PdpFactory` pattern (a `kind:`-tagged block under `global.apl`). The FFI/gateway host registers the factory so Valkey can be enabled without recompiling. +- R3. When no session-store config block is present, the default remains `MemorySessionStore` — existing deployments are unaffected. + +**Trait change and compatibility** +- R4. 
The `SessionStore` trait methods return a `Result` so failures propagate to callers. `MemorySessionStore` adapts (it is infallible, returning `Ok`). All call sites are updated to propagate: in the CMF (CmfPluginInvoker) invoker, `persist_session` (which calls `append_labels`) and `for_request` (which calls `load_labels` and currently returns `Self`, not `Result`) must both become fallible, and `route_handler.rs` must act on the propagated error. The trait's error type is a crate-local enum (e.g., via `thiserror`, already a workspace dependency) — not `anyhow` — so `apl-cpex` gains no new dependency. Note: this makes error-propagation part of the shared `SessionStore` contract that future bridges (apl-mcp, apl-langgraph) inherit, not a Valkey-only detail; that is intended. +- R15. Preserve monotonic union semantics across both backends: `append_labels` unions labels into the session's set, `load_labels` returns the union, and an unknown session returns empty (not an error). "Unknown session" means a positively-confirmed key-miss (the session id has no stored labels — never seen or already expired), which is distinct from a store error (R5). Within a configured TTL window, accumulation is monotonic; TTL expiry (R7) is the only sanctioned time-based removal, and explicit declassification remains out of scope. +- R16. `append_labels` MUST be implemented as a single atomic server-side set-union operation (so concurrent appends from different nodes for the same session are race-free and monotonic). Client-side read-modify-write of the label set is forbidden — it loses labels under concurrent cross-node appends. +- R18. An `append_labels` error fails the request closed, uniformly with a load error. 
Because `continue_processing` is computed after `persist_session` (`route_handler.rs`), the handler flips the outcome to Deny on append error: in the Pre phase this prevents the mediated action (write-ahead — taint is durably committed before the side effect is externalized); in the Post phase it blocks the tainted response. The backend MUST emit a distinguished alarm/metric on append failure, since the dangerous residual is a *selective* failure (append rejected while reads still succeed), where a subsequent load would otherwise return a stale, smaller label set. A full outage is self-covering (the next load also fails closed). + +**Failure and consistency semantics** +- R5. When Valkey is unreachable, times out, or returns an error on load or append, the store returns an error (fail-closed). It must not return empty labels or silently drop an append. A reachable-but-invalid response — a value that cannot be decoded into the expected label representation, or a partial/truncated result — is also treated as a store error (fail-closed), never as an empty or partial label set. This is distinct from the positively-confirmed key-miss of R15, which returns empty. +- R6. Reads are served from the primary only (read-your-writes consistency). No replica read-splitting. +- R19. The label keyspace is a security system-of-record, not a cache, so Valkey persistence must be configured for durability. Document as an operator runbook note (peer to the R9 `noeviction` contract) that AOF must be enabled with `appendfsync everysec` as the floor (`appendfsync always` where the threat model cannot tolerate the ~1s crash-loss window). A `SADD` acknowledged but lost to a crash before fsync makes the next read return a positively-confirmed `Ok(empty)` (R15) — *not* an error — so fail-closed (R5) never trips and the request proceeds under-labeled: a silent downgrade, and one invisible to all alarms because nothing errors. 
This interacts with R6: asynchronous replication means a failover can promote a replica missing the most recent un-replicated appends — the same downgrade by a different path. Like `noeviction`, the client cannot enforce this server setting; an optional best-effort `CONFIG GET appendonly`/`appendfsync` startup self-check is deferred (it would require dialing at config-load, which the lazy pool does not do today). + +**Expiry and lifecycle** +- R7. Support a configurable sliding TTL on session keys, refreshed on every load and append. Default is no expiry (TTL off). Note that refresh-on-load makes a read also issue a write (e.g., `EXPIRE`) to the primary; the design must define what happens when that refresh write fails on an otherwise-successful load (in particular under `noeviction` at capacity, see R9): a failed TTL refresh must not corrupt the load result, and the load/refresh failure semantics must be stated rather than left implicit. +- R8. Document the soundness rule: a TTL may be enabled only when set ≥ the maximum session-identity lifetime; a shorter TTL silently expires taint (downgrade-by-waiting) and is unsound. +- R17. When a TTL is configured, emit a startup WARNING (or structured audit event) if it is shorter than the configured/declared maximum session-identity lifetime. The soundness rule (R8) is otherwise enforced by nothing; a best-effort comparison catches the most common misconfiguration before it silently downgrades taint. +- R9. Provide a configurable key prefix/namespace (software requirement). Separately, document as an operator runbook note that the label keyspace must run under `maxmemory-policy noeviction`, so a full instance fails-closed on write rather than silently evicting taint — the client cannot enforce this server setting. Optionally, the backend issues a `CONFIG GET maxmemory-policy` check at startup and warns if it is not `noeviction`, making the durability property self-auditing. + +**Connection and deployment** +- R10. 
Connect to a single Valkey endpoint (URL or host:port) with optional password/ACL auth. TLS is **required** for any non-localhost endpoint: security labels reveal session sensitivity and must not transit a network segment in plaintext, where passive interception discloses taint state and active MITM can inject or suppress labels. The minimum auth posture for production deployments is documented in operator guidance. +- R11. The connection config specifies a single endpoint, TLS settings, and auth. Keep it minimal; if Sentinel or Cluster support is added later, the config schema is versioned at that time. (Do not pre-add unused topology fields now — that is dead config surface with no current consumer; see Scope Boundaries.) +- R12. Provide a Valkey container / compose setup for local development and integration tests. + +**Crate structure and reuse** +- R13. Ship the Valkey backend in its own crate and wire it into `cpex-ffi` as an **optional dependency behind a cargo feature** (e.g. `valkey = ["dep:apl-session-valkey"]`), mirroring `cedarling = ["dep:apl-cedarling"]`. `default-members` exclusion alone does not keep object code out of a `-p cpex-ffi` build — only feature-gating keeps the default FFI artifact (`libcpex_ffi.a`) size unaffected. Also exclude the crate from `default-members` so everyday `cargo build` stays lean. +- R14. Implement the connection/client logic (endpoint, TLS, auth, pooling, key prefix) inside the Valkey crate as an internal module. Extract a shared connection layer only when a second consumer (e.g. the OAuth token cache) is actually scheduled — at that point the interface is shaped by two real consumers (refactor-then-reuse) rather than speculatively designed for one. + +--- + +## Acceptance Examples + +- AE1. **Covers R4, R5.** Given Valkey is unreachable, when `load_labels` is called during hydration, the store returns an error and the request fails closed before any decision is made — it does not return an empty label set. +- AE6. 
**Covers R18.** Given a request that accumulated a new label and an `append_labels` that errors, when `persist_session` runs, the handler fails the request closed (Deny) and emits an append-failure alarm — it does not silently drop the append. +- AE2. **Covers R7.** Given a sliding TTL of 24h is configured, when a session is loaded or appended at the 23h mark, its key TTL is refreshed to 24h from that access. +- AE3. **Covers R3, R15.** Given no session-store config block, when APL is installed, `MemorySessionStore` is used and label load/append behavior is unchanged from today. +- AE4. **Covers R1, R6, R15.** Given an append on node A followed by a load on node B for the same `session_id`, node B observes the unioned labels via the shared primary. +- AE5. **Covers R2.** Given a `kind: valkey` config block, when APL is installed, the Valkey-backed store is selected as the active `SessionStore`. + +--- + +## Success Criteria + +- Session labels survive a gateway process restart and are visible across nodes sharing one Valkey endpoint. +- A Valkey outage produces explicit fail-closed errors at the call sites, never silent taint loss or under-labeling. +- Deployments that do not configure Valkey see no change to default build contents or FFI artifact size. +- `ce-plan` can implement the backend without inventing product behavior: the failure posture (fail-closed), TTL policy, and selection approach are decided here. The few genuinely technical or behavioral unknowns are enumerated explicitly under Outstanding Questions (the append-path fail-closed semantics being the one that must be resolved before planning). + +--- + +## Scope Boundaries + +- OAuth/exchanged token cache (the planned `TokenCacheControl`) — the most likely next consumer of the same Valkey connection, and the trigger for extracting a shared connection layer (R14). The token cache itself is not built here, and the connection layer is not pre-factored for it. 
+- Client-side Sentinel discovery and Valkey Cluster support — deferred; an infra-fronted single endpoint covers HA. +- Replica read-splitting — rejected; replication lag would silently downgrade taint. +- Local in-process fallback during outages — rejected in favor of fail-closed. +- Declassification / label removal — out of scope; the surface stays monotonic. +- Broader session KV surface (delegation hops, conversation history) beyond labels — deferred until those consumers exist. +- Sharing the CEL program cache, route/dispatch caches, or JWKS `KeyStore` through the KV store — these are per-node compute derived from config; a network store would be a pessimization, not a benefit. + +--- + +## Key Decisions + +- Config-driven selection via a `SessionStore` factory mirroring `PdpFactory`: the embedded FFI/gateway host cannot recompile to swap stores, and this reuses an established, understood pattern in the codebase. +- Fail-closed on store errors: labels drive information-flow authorization, so unavailability must degrade to denial, never to silent under-labeling. +- Append failure fails the request closed, uniformly with load failure (R18): `continue_processing` is computed after `persist_session`, so an append error flips the outcome to Deny — Pre-phase blocks the action (write-ahead), Post-phase blocks the tainted response. Chosen over best-effort+alarm because it is consistent with the never-silently-under-label thesis and the availability tradeoff already accepted; a full outage is self-covering (next load also fails closed), and the selective-failure residual is alarmed. +- `SessionStore` trait returns `Result`: the current `Vec` / `()` signatures have no error channel, which fail-closed requires. The memory store adapts trivially. +- Sliding TTL, default off: time-based expiry is time-based declassification and is sound only when the window ≥ session-identity lifetime. 
Gateway sessions are bounded, so a sliding TTL is available and recommended for production, but off is the safe default. +- `noeviction` + primary-only reads: eviction under memory pressure and replica replication lag are each independent silent-downgrade vectors; both are closed off so fail-closed stays honest. +- Separate crate + feature-gated FFI wiring: keeps the lean default build and FFI artifact size unchanged. The connection layer is kept internal for now and extracted only when a second consumer materializes (R14). +- Availability tradeoff (accepted): fail-closed + single-endpoint + primary-only + no local fallback means a Valkey outage or failover converts directly into correlated, fleet-wide request denial on the auth hot path. This is the deliberate price of never silently under-labeling; operators own HA via a fronting endpoint, and a latency/timeout budget bounds the blast radius (deferred to planning). + +--- + +## Dependencies / Assumptions + +- The gateway issues bounded-lifetime `session_id`s (tied to auth/conversation lifetime). This is the precondition that makes a sliding TTL sound; if it ceases to hold, TTL must be disabled. +- The operator provisions and manages Valkey, including HA via a fronting endpoint (K8s Service / VIP / proxy) and the `noeviction` memory policy. +- A Rust Valkey/Redis async client and connection pool will be selected during planning. + +--- + +## Outstanding Questions + +### Deferred to Planning + +- [Affects R2][Technical] **Config-selection seam.** "Mirror the `PdpFactory` pattern" is not a drop-in: PDPs are built *during* the config walk and threaded into handlers, but `session_store` is a single `Arc` captured into the visitor at construction (`register.rs`) before YAML parses, and the FFI `cpex_apl_install` hardcodes `in_process()` with no YAML. Planning must choose: defer store construction to the config walk, or parse the session-store block before `register_apl`. 
*(Raised by feasibility.)* +- [Affects R5, R6, R10][Technical] Connection/operation timeout, retry budget, and a stated latency/availability target (p99, thundering-herd control) for putting a synchronous primary round-trip on every request's critical path. *(Raised by product-lens, adversarial.)* +- [Affects R5, R16][Security] Label integrity against a compromised/writable Valkey — whether stored values need an HMAC/signature, or Valkey is accepted as a fully trusted component. Weigh the cost against the trust boundary. *(Raised by security.)* +- [Affects R1, R14][Technical] Rust client + pooling choice (e.g., `redis-rs` + `deadpool` vs `fred`), weighed against the workspace's lean-deps discipline. +- [Affects R1, R15, R16][Technical] Key/value representation and exact key schema (e.g., a Valkey SET per session keyed by prefix + `session_id`, with `SADD`/`SMEMBERS` giving the atomic union R16 requires). +- [Affects R10][Needs research] Whether the chosen client's TLS stack aligns with the workspace's existing `rustls` preference (as used by `reqwest` in `apl-identity-jwt` / `apl-delegator-oauth`). diff --git a/docs/operations/valkey-session-store.md b/docs/operations/valkey-session-store.md new file mode 100644 index 0000000..fd8f689 --- /dev/null +++ b/docs/operations/valkey-session-store.md @@ -0,0 +1,238 @@ +# Operating the Valkey Session Store + +The Valkey-backed `SessionStore` (`apl-session-valkey`) persists per-session +security **taint labels** across process restarts and shares them across +gateway nodes. Those labels drive information-flow authorization, so the +backend is **fail-closed**: any store error denies the request rather than +letting it proceed with missing taint. + +This runbook covers the operator-owned controls the backend depends on but +**cannot enforce from the client**. Getting them wrong silently weakens the +security guarantee, so treat this as part of the deployment contract. 
+ +> Build note: the backend is compiled into the FFI artifact only with the +> `valkey` cargo feature (`cargo build -p cpex-ffi --features valkey`). +> Without it, the default in-process memory store is used and nothing here +> applies. + +--- + +## 1. Enabling it + +Add a `session_store` block under `global.apl` in the unified config: + +```yaml +global: + apl: + session_store: + kind: valkey + endpoint: valkey.internal:6379 # or rediss://valkey.internal:6379 + tls: true + username: gateway # ACL user (see §3) + password: ${VALKEY_PASSWORD} # inject from a secrets manager + key_prefix: taint:v1 # default; bump only on schema change + ttl_seconds: 86400 # optional sliding TTL (see §4) + max_session_lifetime_seconds: 86400 # enables the TTL-soundness warning + command_timeout_ms: 500 # fail-closed hot-path budget (default) + connect_timeout_ms: 250 # default +``` + +When no `session_store` block is present, the gateway keeps its in-process +memory store and none of this applies. + +--- + +## 2. `maxmemory-policy noeviction` (required) + +Run the Valkey instance backing the label keyspace with: + +``` +maxmemory-policy noeviction +``` + +Why: with any `*-lru` / `*-lfu` / `*-random` / `volatile-*` policy, Valkey +can **silently evict** a live session's taint set under memory pressure. A +later read then returns an empty set, the gateway under-labels, and may +**over-authorize** — the exact fail-open this store exists to prevent. With +`noeviction`, a full instance instead fails *writes* with an error, which the +backend converts into a denied request (fail-closed). The client cannot set +or enforce this — it is a server config you own. + +Note the volatile policies are **not** safe here even though they sound +scoped: the label keys carry a TTL (§4), so `volatile-lru`/`volatile-ttl` +would happily evict live keys. Use `noeviction` unconditionally. 
+ +**Verify and monitor:** + +``` +valkey-cli CONFIG GET maxmemory-policy # must be "noeviction" +valkey-cli CONFIG GET maxmemory # must be a non-zero bound +``` + +- Alert if `evicted_keys` in `INFO stats` is ever non-zero — it must stay `0`. +- Watch `used_memory` vs `maxmemory` and the OOM write-error rate so you scale + before the instance fills. + +This is an operator-owned contract — the backend does **not** verify it for you. +A best-effort startup `CONFIG GET maxmemory-policy` self-check that warns when +the policy is not `noeviction` is a deferred follow-up (it would require the +connection pool to dial at config-load, which today it does not). Until then, +the authoritative control — and its monitoring — is yours. + +--- + +## 3. TLS and least-privilege ACL + +**TLS is required for any non-localhost endpoint** — the backend rejects a +plaintext, non-localhost config at load. Security labels reveal which sessions +carry sensitive taint; in plaintext they are exposed to passive interception +and active MITM (label injection/suppression). Prefer **mTLS** so a stolen ACL +password alone cannot connect. + +``` +# valkey.conf (sketch) +port 0 +tls-port 6379 +tls-cert-file /etc/valkey/tls/server.crt +tls-key-file /etc/valkey/tls/server.key +tls-ca-cert-file /etc/valkey/tls/ca.crt +# tls-auth-clients yes # require client certs (mTLS) +``` + +**Minimum ACL** for the gateway user — it only needs `SADD`, `SMEMBERS`, +`EXPIRE`, and (only for the startup self-check, which §2 notes is still a +deferred follow-up) `CONFIG|GET`, scoped to the key prefix: + +``` +ACL SETUSER gateway on >$STRONG_SECRET resetchannels -@all \ + ~taint:v1:* \ + +sadd +smembers +expire +config|get +``` + +- `~taint:v1:*` confines key access to the label namespace. +- Grant `+config|get` (the subcommand) — never bare `+config`. +- Consider giving `CONFIG|GET` to a separate health/admin user so the hot-path + writer's surface stays minimal. + +**Credentials:** never hard-code the secret; inject from a secrets manager.
+Valkey ACL users support multiple password hashes, enabling overlap rotation +(add new, roll clients, drop old) with no downtime; with mTLS, rotate the +client cert via `tls-auto-reload-interval`. + +--- + +## 4. Sliding TTL and the soundness rule + +The TTL (`ttl_seconds`) is optional and **off by default**. When set, it is a +sliding TTL: refreshed on every load and append. + +**Soundness rule (R8):** a TTL is sound **only if it is ≥ the maximum lifetime +of a session identity.** A shorter TTL lets accumulated taint expire while the +session is still usable — a "downgrade-by-waiting": an adversary holds a +tainted session, waits out the TTL, and resumes it clean. If your gateway's +session identities are not bounded (e.g. header- or identity-derived ids with +no expiry), **leave the TTL off.** + +Set `max_session_lifetime_seconds` to your gateway's bound and the backend will +emit a startup warning (`alarm = "session_store_ttl_unsound"`) when the +configured TTL is shorter. This is best-effort; the operator owns the invariant. + +**TTL-refresh failures are fail-open for the read** (the labels were read +successfully). A *persistently* failing refresh, though, lets a sliding-TTL key +expire between requests and silently drop taint. Alert on +`alarm = "session_store_ttl_refresh_failed"`. + +--- + +## 5. Persistence and durability (required) + +The label keyspace is a **security system-of-record**, not a cache. Promoting +Valkey to hold an authorization input inverts its default durability +assumptions, so its on-disk persistence is part of the deployment contract — +alongside `noeviction` (§2) and the TTL rule (§4), this closes the third way a +label can silently vanish. + +**The failure mode this closes.** A `SADD` is acknowledged to the gateway, the +node crashes before the write is fsync'd to disk, and the label is **gone**. On +restart (or replica failover) the next read returns a normal `Ok(empty)` — +*not* an error — so fail-closed never trips. 
The request proceeds with **less +taint than actually accumulated**: a silent downgrade. Critically, this is +**invisible to every alarm in §7** because nothing errors, which is exactly why +it has to be closed at the server-config layer rather than detected at runtime. + +**The fsync options and their crash-loss windows:** + +| Setting | On crash | Notes | +|---------|----------|-------| +| `appendonly no` (RDB only) | Lose everything since the last snapshot (minutes) | Cache-shaped; **unsafe** for the label keyspace | +| `appendfsync everysec` | ~1s loss window | Recommended floor | +| `appendfsync always` | Effectively no loss | Per-write latency cost | + +**Recommended baseline:** AOF on with `appendfsync everysec` as the floor; use +`appendfsync always` where the threat model cannot tolerate the ~1s window. + +``` +# valkey.conf (sketch) +appendonly yes +appendfsync everysec # or: always +``` + +**Failover interaction with §6.** The "fail over to a healthy primary" guidance +inherits Valkey's **asynchronous** replication: a failover can promote a replica +that is missing the most recent un-replicated appends — the same downgrade by a +different path. Tighten replication durability (e.g. `min-replicas-to-write` / +`min-replicas-max-lag`, or `WAIT`-aware fronting) if your failover budget +demands it. + +Like `noeviction`, this is operator-owned contract: the client cannot set or +enforce it, and the backend does **not** self-check it today. A best-effort +startup `CONFIG GET appendonly` / `appendfsync` warning is a deferred follow-up +(same dial-at-config-load constraint as the `noeviction` self-check in §2). + +--- + +## 6. Topology, availability, and blast radius + +- **Single endpoint, primary-only reads.** The backend reads and writes one + endpoint and never read-splits to replicas — replica replication lag would + return stale (smaller) label sets, a silent downgrade. 
Achieve HA by pointing + `endpoint` at a fronting address (K8s Service, VIP, or proxy) that fails over + to a healthy primary. Client-side Sentinel/Cluster are not supported in v0. + +- **Availability tradeoff (accepted).** Because the store is fail-closed, + single-endpoint, and has no local fallback, a Valkey outage or failover + denies **session-bearing** requests across all nodes until it recovers. This + is the deliberate price of never silently under-labeling. The + `command_timeout_ms` / `connect_timeout_ms` budgets bound how long a request + waits before failing closed. + +- **Anonymous/sessionless traffic is unaffected** — requests with no resolved + session id never touch the store, so a Valkey outage does not deny them. + +- **No live-reload (v0).** Changing the `session_store` config requires a + reload/restart of the gateway to take effect for newly-installed routes; the + store is selected during config load and captured by route handlers. + +--- + +## 7. Alarms to wire up + +| Signal | Meaning | Action | +|--------|---------|--------| +| `alarm = "session_store_failure"` (op=load/append) | A store load/append failed; request was denied | Investigate Valkey health/connectivity; sustained → outage | +| `alarm = "session_store_ttl_refresh_failed"` | Sliding-TTL refresh failed on an otherwise-successful read | Risk of silent key expiry; check ACL grants `+expire`, instance health | +| `alarm = "session_store_ttl_unsound"` | Configured TTL < declared session lifetime | Raise the TTL or disable it | +| `evicted_keys > 0` (Valkey `INFO`) | Eviction is dropping taint keys | Fix `maxmemory-policy` to `noeviction`; scale memory | + +--- + +## 8. Local development + +``` +docker compose -f deploy/valkey-compose.yml up -d +VALKEY_TEST_URL=redis://127.0.0.1:6379 \ + cargo test -p apl-session-valkey --test valkey_store_integration -- --ignored +``` + +The compose file runs a `noeviction`-configured Valkey. 
It has no TLS/ACL and +runs RDB-default (non-durable, no AOF) — those are dev-only conveniences; +production must add TLS/ACL per §3 and AOF persistence per §5. diff --git a/docs/plans/2026-06-17-001-feat-valkey-session-store-plan.md b/docs/plans/2026-06-17-001-feat-valkey-session-store-plan.md new file mode 100644 index 0000000..3aac6b9 --- /dev/null +++ b/docs/plans/2026-06-17-001-feat-valkey-session-store-plan.md @@ -0,0 +1,487 @@ +--- +title: "feat: Valkey-backed SessionStore for CPEX" +type: feat +status: completed +date: 2026-06-17 +deepened: 2026-06-17 +origin: docs/brainstorms/valkey-session-store-requirements.md +--- + +# feat: Valkey-backed SessionStore for CPEX + +## Summary + +Add a config-selectable, Valkey-backed `SessionStore` alongside the in-process `MemorySessionStore`, so session security labels persist across restarts and are shared across gateway nodes. The work lands in eight dependency-ordered units: make the `SessionStore` trait fallible, propagate fail-closed semantics through the CMF invoker and route handler, add a `SessionStoreFactory` config seam, build a new feature-gated `apl-session-valkey` crate (redis-rs + deadpool-redis over rustls) with an atomic-SADD store, wire the factory through the FFI, add container-backed integration tests, and write operator docs. + +--- + +## Problem Frame + +CPEX embeds in an AI gateway mediating A2A/MCP interactions. Session security labels (`extensions.security.labels` — a monotonic taint set per `session_id` driving information-flow authorization) live only in a process-local `HashMap` (`MemorySessionStore`), so they vanish on restart and are invisible across nodes — both silent, security-relevant failures. See origin for the full frame and the resolved fail-closed/TTL/selection decisions. + +--- + +## Requirements + +Carried from origin (`docs/brainstorms/valkey-session-store-requirements.md`); R-IDs trace to it. + +- R1. 
Valkey-backed `SessionStore` implementing the trait surface, usable alongside `MemorySessionStore`. +- R2. Config-driven backend selection via a `SessionStoreFactory` (mirrors `PdpFactory`), `kind:`-tagged block under `global.apl`; host registers the factory. +- R3. Default remains `MemorySessionStore` when no session-store config block is present. +- R4. `SessionStore` trait methods return `Result`; `MemorySessionStore` adapts; all call sites (`for_request`, `persist_session`, `route_handler.rs`) propagate. Crate-local `thiserror` error type, not `anyhow`. +- R5. Unreachable/timeout/error, **and** reachable-but-undecodable/partial responses → store error (fail-closed). Distinct from a positively-confirmed key-miss (R15). +- R6. Primary-only reads (read-your-writes); no replica read-splitting. +- R7. Configurable sliding TTL refreshed on load and append; default off. Refresh-on-load is a write; define load/refresh-failure semantics. +- R8. Document the TTL soundness rule (TTL ≥ max session-identity lifetime). +- R9. Configurable key prefix/namespace (software); `noeviction` as operator runbook note + optional startup `CONFIG GET maxmemory-policy` self-check. +- R10. Single endpoint with optional password/ACL auth; **TLS required for non-localhost**. +- R11. Minimal connection config (single endpoint, TLS, auth); no pre-added Sentinel/Cluster fields. +- R12. Valkey container/compose setup for local dev and integration tests. +- R13. Own crate, feature-gated into `cpex-ffi` (`valkey = ["dep:apl-session-valkey"]`) **and** excluded from `default-members`. +- R14. Connection/client logic internal to the crate; no pre-factored shared layer for the deferred token cache. +- R15. Monotonic union semantics; unknown session (confirmed key-miss) returns empty, not error; monotonic within the TTL window. +- R16. `append_labels` is a single atomic server-side set-union; no client-side read-modify-write. +- R17. 
Startup WARNING when configured TTL < declared max session-identity lifetime. +- R18. Append error fails the request closed uniformly with load error (via `continue_processing` computed after `persist_session`); distinguished append-failure alarm on the selective-failure residual. + +**Origin actors:** A1 (gateway node), A2 (operator), A3 (Valkey endpoint). +**Origin flows:** F1 (cross-node/cross-restart propagation), F2 (store error → fail-closed). +**Origin acceptance examples:** AE1 (R4,R5 load fail-closed), AE2 (R7 TTL refresh), AE3 (R3,R15 default unchanged), AE4 (R1,R6,R15 cross-node union), AE5 (R2 config selection), AE6 (R18 append fail-closed + alarm). + +--- + +## Scope Boundaries + +- Sentinel/Cluster support, replica read-splitting, local in-process fallback, declassification/label removal, broader session KV surface, sharing per-node compute caches through the store — all out of scope (see origin). +- OAuth/exchanged token cache (`TokenCacheControl`) — the eventual second consumer and the trigger to extract a shared connection layer; not built here, connection layer not pre-factored. +- Application-level HMAC/signing of stored labels — out of scope for v0 (see Key Technical Decisions); the trust model is TLS/ACL/`noeviction`/network isolation. + +### Deferred to Follow-Up Work + +- HMAC-of-stored-values: revisit only if a deployment's threat model includes a Valkey writable by a party who cannot reach the gateway's signing key. + +--- + +## Context & Research + +### Relevant Code and Patterns + +- `crates/apl-cpex/src/session_store.rs` — `SessionStore` trait (`#[async_trait]`, `load_labels -> Vec`, `append_labels -> ()`), `MemorySessionStore`, unit-test shape (sort-before-assert, `Arc`). +- `crates/apl-cpex/src/cmf_invoker.rs` — `for_request` (`-> Self`, calls `load_labels` during hydration) and `persist_session` (`-> ()`, calls `append_labels`). The only two trait call sites. 
+- `crates/apl-cpex/src/route_handler.rs` — builds invoker via `for_request().await`, calls `persist_session().await` after evaluation; `continue_processing` derived from `decision.decision` **after** persist. Handler returns `Result<_, Box>`. +- `crates/apl-core/src/step.rs` — `PdpFactory` trait (`kind()` + `build(&serde_yaml::Value) -> Result, Box>`): the model to mirror. +- `crates/apl-cpex/src/visitor.rs` — `visit_global` walks `global.apl.pdp[]` and consults factories during the config walk; `build_pdp_from_config`. Session store, by contrast, is captured at visitor construction (`register.rs`), not built during the walk — the seam to bridge. +- `crates/apl-cpex/src/register.rs` — `AplOptions { session_store, pdp_factories, ... }`, `register_apl`; `AplOptions::in_process()` defaults to `MemorySessionStore`. +- `crates/cpex-ffi/src/apl.rs` — `cpex_apl_install` hardcodes `in_process()`, registers `pdp_factories`, receives no YAML (config arrives later via `cpex_load_config`). +- `crates/cpex-ffi/Cargo.toml` + root `Cargo.toml` — `apl-cedarling` optional-dep + `cedarling = ["dep:apl-cedarling"]` feature, plus `default-members` exclusion: the exact pattern R13 mirrors. +- `crates/apl-pdp-cel/` — reference leaf-crate layout (`Cargo.toml`, `lib.rs`, `factory.rs`, `error.rs`, `resolver.rs`, `tests/visitor_cel_config.rs`), `pub const KIND`, `thiserror` `BuildError`. +- `crates/apl-pdp-cel/tests/visitor_cel_config.rs` — canonical config-driven-backend integration-test harness (register_apl → load_config_yaml → initialize → invoke_named). +- `crates/apl-delegator-oauth/Cargo.toml`, `crates/apl-identity-jwt/Cargo.toml` — `reqwest = { default-features = false, features = ["json","rustls-tls"] }`: the rustls-over-native-tls discipline. +- `crates/cpex-core/src/error.rs` — `PluginError` (`Config { message }`, `Denied { violation }`, …) and host propagation. 
+ +### Institutional Learnings + +- No `docs/solutions/` knowledge base exists; the origin requirements doc is the authoritative spec. After this lands, capture the trait-change and client decisions somewhere durable. +- PR #67 lesson: external-dependency tests silently passed as no-ops when the native module was absent. Integration tests here must skip **loudly** and be **CI-enforced** (env gate) so they cannot green-wash zero coverage. +- File-header convention (Location/Copyright/SPDX/Authors) is `make`-checked and mandatory on every new file. + +### External References + +- redis-rs (`redis` 1.x) docs.rs — `aio`/`tokio-comp`/`tokio-rustls-comp` features (tokio-rustls 0.26 + rustls 0.23, no native-tls), `AsyncConnectionConfig` timeouts, `ConnectionManager` reconnect/backoff, `pipe().atomic()` for MULTI/EXEC, `RedisError` predicates (`is_timeout`/`is_io_error`/`is_connection_dropped`, `ErrorKind::Parse`/`UnexpectedReturnType`). +- `deadpool-redis` 0.23 — external pool forwarding `tokio-rustls-comp`. +- Valkey docs — Transactions (MULTI/EXEC isolation), EXPIRE (refresh updates TTL; SADD leaves TTL untouched; overwrite commands clear it), Eviction (`noeviction`, `evicted_keys`), Replication (async → stale-read fail-open), ACL (least-privilege `~taint:v1:* +sadd +smembers +expire +config|get`), TLS/mTLS. +- AWS Builders' Library — timeouts/retries/backoff-with-jitter, circuit breaker, retry-storm avoidance (token-bucket budget). +- `testcontainers-modules` valkey feature. + +--- + +## Key Technical Decisions + +- **Config seam = factory + in-visitor store swap.** Add a `SessionStoreFactory` trait mirroring `PdpFactory`; the visitor consults it on a `global.apl.session_store` block during `visit_global`. Because `visit_global` runs before `install_handler` in the config walk, it can swap the visitor's own `session_store` field before any handler captures it — no per-request indirection, no handler-signature change, no new FFI entry point. 
(`ArcSwap` is reserved for a future live-reload need, not v0; see U3.) Rationale: truest fit for R2/AE5 and host-agnostic; the FFI-boundary alternative would not satisfy "selected via the walked YAML." +- **Client = `redis` (redis-rs) 1.x + `deadpool-redis` 0.23, `default-features=false`, `tokio-rustls-comp`.** Leanest deps, no forced crypto provider, tokio-minimal, CI-tested against Valkey 7+, `pipe().atomic()` = SADD+EXPIRE in one MULTI/EXEC. fred is the heavier batteries-included alternative; rejected on dep weight + older release cadence. +- **Key schema = `taint:v1:<sha256(session_id)>` SET.** Full-width SHA-256 keeps the Valkey keyspace itself collision-free and removes raw ids from the keyspace (charset-safe). It does **not** restore entropy lost upstream: `session_id` is already a 64-bit truncated digest from `session_resolver.rs` (`short_hash`), so two subjects colliding there already share one logical session and will deterministically share one Valkey key — closing that upstream collision (a wider `short_hash`) is out of scope here (cross-referenced as a residual). `SADD` for atomic union, `SMEMBERS` for load. +- **No application-level HMAC in v0.** Signing protects against altered labels but not deletion/under-labeling (the primary risk) and only helps if the attacker can't reach the gateway's signing key. Trust Valkey within the boundary (TLS/mTLS + least-privilege ACL + `noeviction` + network isolation); document the residual. +- **Fail-closed error mapping:** `Ok(empty set)` → empty labels (unknown session, R15); `Err` where `is_timeout|is_io_error|is_connection_dropped` or `ErrorKind::Parse|UnexpectedReturnType` → store error (R5). Empty-set is never an error (SMEMBERS on a missing key returns `[]`). +- **Append fail-closed mechanism:** `persist_session` returns `Result`; `route_handler.rs` converts an append `Err` into `continue_processing = false` + a `PluginViolation` (e.g.
`session.persist_failed`) **before** building `ErasedResultFields`, since that struct is constructed after persist. Emit a distinguished append-failure metric/log (R18). +- **TTL via atomic pipeline:** `pipe().atomic().sadd(...).ignore().expire(...).ignore()` for append+refresh in one round trip. On load, a separate `EXPIRE` refresh is **fail-open for the current request**: the successfully-read labels are returned `Ok`, and a refresh failure is alarmed (not failed-closed) — the read itself succeeded. Cross-request consequence (a persistently-failing refresh lets a sliding-TTL key expire → a later load returns `Ok(empty)`, silently dropping taint) is covered by the alarm and documented in U8; this is the deliberate trade for not denying a request whose labels were read correctly. +- **Timeouts/retries (committed defaults, configurable):** ship concrete defaults so behavior and tests are deterministic — connect timeout **250ms**, per-command response timeout **500ms** (never `None`), **1** jittered retry behind a token-bucket budget at a single layer, circuit-breaker opens after **N consecutive failures** (default e.g. 5) → immediate fail-closed. Operators tune these from the committed baseline; they are not left undecided. + +--- + +## Open Questions + +### Resolved During Planning + +- Config-selection seam (origin deferred): resolved → `SessionStoreFactory` + in-visitor store swap during `visit_global`; an `ArcSwap` late-bound handle is reserved for a future live-reload need, not v0 (see Key Technical Decisions, U3). +- Client + pooling choice (origin deferred): resolved → redis-rs + deadpool-redis, rustls. +- Key/value representation (origin deferred): resolved → prefixed SHA-256 SET, SADD/SMEMBERS, atomic SADD+EXPIRE. +- rustls alignment (origin deferred): resolved → `tokio-rustls-comp` (tokio-rustls 0.26 + rustls 0.23), no native-tls/openssl. +- Label integrity/HMAC (origin deferred): resolved → no HMAC in v0; trust-boundary controls instead.
+- Timeout/retry budget (origin deferred): resolved as configurable defaults (see Key Technical Decisions); exact production values are operator-tuned. + +### Deferred to Implementation + +- **Config live-reload behavior** — if `load_config_yaml` re-walks the *same* `AplConfigVisitor` on reload, the v0 in-visitor swap re-targets the field but already-installed handlers captured the prior `Arc` by value, and in-flight requests hold the prior store. v0 does **not** support session-store live-reload; if/when required, this is the one case that justifies the `ArcSwap` handle (U3). State the v0 limitation in U8. +- Whether the `CONFIG GET maxmemory-policy` self-check and the TTL-vs-lifetime warning run at factory `build()` time or at first connection — depends on when a live connection is first available. +- Exact `PluginError`/`PluginViolation` variant names and `SessionStoreError` variants — finalize against the surrounding code. The committed timeout/retry defaults (250ms/500ms/1-retry/5-failure-breaker) are tuning baselines, not open questions. +- **R14 verification** — "no public connection-layer API" is enforced by inspection (the `connection` module stays non-`pub` in `lib.rs`); there is no runtime test for it. Flag for reviewer check rather than a test assertion. 
+ +--- + +## Output Structure + + crates/apl-session-valkey/ + Cargo.toml + src/ + lib.rs # module docs, pub use surface, pub const KIND + config.rs # YAML config parse: endpoint, TLS, auth, prefix, TTL, timeouts + error.rs # thiserror BuildError (config/connection construction) + connection.rs # internal: redis-rs + deadpool-redis pool, rustls, timeouts, reconnect + store.rs # ValkeySessionStore: SADD/SMEMBERS, atomic pipeline, key schema, error mapping + factory.rs # ValkeySessionStoreFactory: kind()="valkey", build(&serde_yaml::Value) + tests/ + valkey_store_integration.rs # testcontainers valkey: union/TTL/noeviction/ACL/fail-closed + docs/ + operations/valkey-session-store.md # operator runbook (R8/R9/R10) + deploy/ + valkey-compose.yml # local dev / integration container + +--- + +## High-Level Technical Design + +> *This illustrates the intended approach and is directional guidance for review, not implementation specification. The implementing agent should treat it as context, not code to reproduce.* + +Config-selection seam — how a `kind: valkey` block becomes the active store without changing the FFI's no-YAML install contract: + +```mermaid +sequenceDiagram + participant Host as Host / cpex-ffi + participant Reg as register_apl + participant Vis as AplConfigVisitor + participant Walk as load_config_yaml (visit_global) + participant H as AplRouteHandler + + Host->>Reg: AplOptions { session_store_factories:[valkey], default=Memory } + Reg->>Vis: new(.. session_store field = Memory ..) + Note over Host,Walk: later — config arrives via cpex_load_config + Walk->>Vis: visit_global sees global.apl.session_store { kind: valkey, .. 
} + Vis->>Vis: factory.build(cfg) yields a SessionStore Arc, then swaps visitor.session_store + Walk->>H: visit_route → install_handler clones the (already-swapped) Arc by value + Note over H: request time — handler uses the config-selected (valkey) store +``` + +Fail-closed wiring across load (pre-decision) and append (post-decision): + +```mermaid +flowchart TD + A[for_request: load_labels] -->|Err| D[fail closed before decision] + A -->|Ok labels| B[evaluate policy -> decision] + B --> C[persist_session: append_labels] + C -->|Err| E[continue_processing=false + violation + alarm] + C -->|Ok| F[continue_processing from decision] + E --> G[ErasedResultFields] + F --> G +``` + +--- + +## Implementation Units + +- U1. **Make `SessionStore` fallible** + +**Goal:** Change the trait to return `Result` with a crate-local error type; adapt `MemorySessionStore` and its tests. + +**Requirements:** R4, R15 + +**Dependencies:** None + +**Files:** +- Modify: `crates/apl-cpex/src/session_store.rs` +- Create: `crates/apl-cpex/src/session_store_error.rs` (or an inline `SessionStoreError` in `session_store.rs`) +- Test: `crates/apl-cpex/src/session_store.rs` (existing `#[cfg(test)] mod tests`) + +**Approach:** +- Define `SessionStoreError` via `thiserror` (workspace dep), variants covering connection/timeout, decode/protocol, and a generic backend message. Keep it string-friendly so non-CMF bridges can map it. +- Change `load_labels -> Result<Vec<…>, SessionStoreError>` (same element type as today's `Vec` return) and `append_labels -> Result<(), SessionStoreError>`. Preserve the R15 contract in the doc comment: unknown session → `Ok(empty)`, never `Err`. +- `MemorySessionStore` returns `Ok(...)`; update the four unit tests to `.await.unwrap()`. + +**Patterns to follow:** `crates/apl-pdp-cel/src/error.rs` (thiserror `BuildError`); existing async-trait usage in `session_store.rs`. + +**Test scenarios:** +- Happy path: `append_then_load` round-trips, returns `Ok`. Covers AE3. +- Edge case: unknown session → `Ok(empty)`, not `Err`.
Covers AE3. +- Edge case: monotonic dedupe across appends still holds under the `Result` signature. + +**Verification:** `apl-cpex` compiles; memory-store unit tests pass; trait doc states the unknown-session = `Ok(empty)` invariant. + +--- + +- U2. **Propagate fail-closed through CMF invoker + route handler** + +**Goal:** Thread the `Result` through `for_request`/`persist_session` and make `route_handler.rs` fail the request closed on load (pre-decision) and append (post-decision) errors, with a distinguished append-failure alarm. + +**Requirements:** R4, R5, R18; F2; AE1, AE6 + +**Dependencies:** U1 + +**Files:** +- Modify: `crates/apl-cpex/src/cmf_invoker.rs` (`for_request`, `persist_session`) +- Modify: `crates/apl-cpex/src/route_handler.rs` (`invoke`) +- Create: a test-double `SessionStore` (erroring + call-recording) — none exists today (all test sites use `MemorySessionStore`); place under `crates/apl-cpex/src/session_store.rs` `#[cfg(test)]` or a shared `tests/support/` module. +- Test: `crates/apl-cpex/tests/cmf_invoker_dispatch.rs`, `crates/apl-cpex/tests/end_to_end_route.rs` + +**Approach:** +- `for_request` → `Result`; a load error `?`-propagates as `Box` so the request fails closed before any decision (load runs pre-evaluation). Note: `load_labels` only runs when `session_id` is `Some` — sessionless/anonymous traffic has no state to load and is unaffected by a store outage (see blast-radius note in U8/R5). +- `persist_session` → `Result<(), …>`. In `invoke`, capture its `Result` (today it is a bare `.await;` discarding `()`), then apply this explicit **merge precedence** when building the `(continue_processing, violation)` tuple that feeds `ErasedResultFields` (a single `Option` slot): + - decision = **Allow** + append `Ok` → `(true, None)` (unchanged). + - decision = **Allow** + append `Err` → flip to `(false, Some(session.persist_failed violation))`. 
+ - decision = **Deny** + append `Err` → keep the original policy violation (preserve attribution); `continue_processing` is already `false`. The append failure surfaces **only** as the distinguished alarm/metric, not in the violation slot. +- Emit a distinguished `tracing` error + metric on append failure regardless of decision (the selective-failure residual). Note `persist_session` no-ops when no new labels were added, so the append-fail path is only reachable on label-producing requests. + +**Execution note:** Start with a failing integration test asserting append-error → Deny (AE6), then wire the handler. + +**Patterns to follow:** `PluginError::Config`/`Denied` construction in `route_handler.rs`; existing `end_to_end_route.rs` harness. + +**Test scenarios:** +- Error path: `load_labels` returns `Err` during hydration → `invoke` fails closed, no decision computed. Covers AE1. +- Error path: decision Allow + `append_labels` `Err` → `continue_processing=false` + `session.persist_failed` violation + alarm. Covers AE6. +- Error path: decision Deny + `append_labels` `Err` → original policy violation preserved, append failure only alarmed (merge precedence). +- Happy path: both `Ok` → behavior identical to today (Allow still allows, Deny still denies). Covers AE3. +- Integration: a fake store erroring only on append (reads succeed) → request denied and alarm fired (selective-failure residual). +- Edge case: sessionless request (no `session_id`) during a simulated store outage → unaffected (no load, no append). + +**Verification:** New tests pass; a store error never yields an Allow with dropped labels; existing route tests still pass with the memory store. + +--- + +- U3. **`SessionStoreFactory` trait + config-selection seam** + +**Goal:** Add config-driven backend selection mirroring `PdpFactory`, with the config walk swapping the visitor's own store field so the selected store is in place before any route handler captures it.
+ +**Requirements:** R2, R3; AE3, AE5 + +**Dependencies:** U1 + +**Files:** +- Modify: `crates/apl-core/src/step.rs` (or a new `apl-core` module) — define `SessionStoreFactory` +- Modify: `crates/apl-cpex/src/register.rs` (`AplOptions.session_store_factories` + the exhaustive `AplOptions { .. }` destructure at the top of `register_apl`) +- Modify: `crates/apl-cpex/src/visitor.rs` (`visit_global` consults `global.apl.session_store`; swap the visitor's own store field) +- Modify the exhaustive `AplOptions { .. }` struct-literal sites that will otherwise fail to compile (no `..Default::default()` today): `crates/apl-cpex/tests/config_override.rs`, `crates/apl-cpex/tests/visitor_e2e.rs`, `crates/apl-cpex/tests/capability_gating.rs` (3 sites), `crates/apl-pdp-cel/tests/visitor_cel_config.rs`, `crates/apl-pdp-cedar-direct/tests/visitor_pdp_config.rs`. Consider adding `Default`/a builder for `AplOptions` so future field additions don't break literals. +- Test: `crates/apl-cpex/tests/config_override.rs` (or a new `tests/session_store_config.rs`) + +**Approach:** +- `SessionStoreFactory`: `kind() -> &str`, `build(&serde_yaml::Value) -> Result<Arc<dyn SessionStore>, Box<dyn Error>>` — exact shape of `PdpFactory`. +- `AplOptions.session_store_factories: Vec<Arc<dyn SessionStoreFactory>>`, registered into the visitor like `pdp_factories`. Empty list → memory default, so existing `AplOptions::in_process()` callers are unaffected. +- **Primary mechanism (simplest correct):** `visit_global` runs strictly before `visit_route`/`install_handler` during the config walk (confirmed: `visitor.rs:307` vs `381+`, ordering in `manager.rs`). So `visit_global` parses the optional `global.apl.session_store { kind, ... }`, looks up the factory, builds the store, and **swaps the visitor's own `session_store` field**; `install_handler` then clones the already-selected `Arc<dyn SessionStore>` into each handler by value, exactly as today. No per-request indirection and no handler-signature change. 
+- **`ArcSwap` is reserved for live config-reload only** — re-targeting *already-installed* handlers after a swap. v0 does not support session-store live-reload (see Deferred to Implementation); if added later, use `Arc<ArcSwap<Arc<dyn SessionStore>>>` (the inner `Arc` is required because `ArcSwap`'s payload type must be `Sized`, which a bare `dyn SessionStore` is not) on a shared handle the handler reads. Do not pay the per-request `ArcSwap` load cost in v0. +- No block present → default memory store (R3). Unknown `kind` / malformed block → `VisitorError` failing `load_config_yaml`. + +**Technical design:** see High-Level Technical Design sequence diagram (directional). + +**Patterns to follow:** `build_pdp_from_config` + `register_pdp_factory` in `visitor.rs`; `PdpFactory` in `apl-core/src/step.rs`. + +**Test scenarios:** +- Happy path: no `session_store` block → memory store active; load/append unchanged. Covers AE3. +- Happy path: a `kind: <fake>` block (a test-only factory kind) → the fake store is selected and observed at request time. Covers AE5 (structure; Valkey-specific selection verified in U7). +- Error path: unknown `kind` → config load fails with a clear error. +- Edge case: handle defaults to memory before the walk; after the walk the swapped store is visible to a freshly installed handler. + +**Verification:** A config-selected fake store receives `append_labels`/`load_labels` calls during an end-to-end route; default path unchanged. + +--- + +- U4. **`apl-session-valkey` crate: connection layer + config** + +**Goal:** Scaffold the new feature-gated crate with config parsing and an internal redis-rs/deadpool connection module over rustls. 
+ +**Requirements:** R10, R11, R13, R14 + +**Dependencies:** None (parallel with U1–U3) + +**Files:** +- Create: `crates/apl-session-valkey/Cargo.toml`, `src/lib.rs`, `src/config.rs`, `src/error.rs`, `src/connection.rs` +- Modify: root `Cargo.toml` (add to `members`, **not** `default-members`) +- Test: unit tests in `src/config.rs` (`#[cfg(test)]`) + +**Approach:** +- `redis = { version = "1.2", default-features = false, features = ["aio","tokio-comp","tokio-rustls-comp","connection-manager"] }`, `deadpool-redis = { version = "0.23", default-features = false, features = ["rt_tokio_1","tokio-rustls-comp"] }`, `thiserror`/`serde`/`serde_yaml`/`tracing`/`async-trait` from workspace. Document the rustls/no-native-tls rationale in `Cargo.toml` (mirror oauth/jwt comments). +- `config.rs`: parse endpoint (URL or host:port), TLS settings (TLS required when host is non-localhost — reject plaintext non-localhost at parse time, R10), auth (password/ACL, sourced from config/env), key prefix, TTL (optional; default off), and timeout/retry knobs with safe defaults. `BuildError` (thiserror) for malformed config. +- `connection.rs`: build the deadpool pool with connect + response timeouts and a jittered, budgeted reconnect policy. Internal module only (R14) — no public connection-layer API. +- File headers on every file (Location/Copyright/SPDX/Authors), `*.workspace = true` package fields. + +**Patterns to follow:** `crates/apl-pdp-cel/Cargo.toml` + layout; `reqwest` rustls feature lines in `apl-delegator-oauth`/`apl-identity-jwt`; `apl-cedarling` exclusion in root `Cargo.toml`. + +**Test scenarios:** +- Happy path: a well-formed YAML block parses into the config struct with expected endpoint/TLS/prefix/TTL. +- Edge case: non-localhost endpoint without TLS → `BuildError` (R10). +- Edge case: missing endpoint / unknown key → `BuildError`. +- Error path: TTL present but unparseable → `BuildError`. 
+ +**Verification:** `cargo build -p apl-session-valkey` succeeds; `cargo build` (default-members) is unaffected; config unit tests pass; no native-tls/openssl in the dep tree (`cargo tree` shows rustls only). + +--- + +- U5. **`ValkeySessionStore`: atomic union, TTL, fail-closed mapping** + +**Goal:** Implement the trait against Valkey with an atomic SADD+EXPIRE, SMEMBERS load, the prefixed-SHA-256 key schema, and the R5/R15 error mapping. + +**Requirements:** R1, R5, R6, R7, R8, R9, R15, R16, R17 + +**Dependencies:** U1, U4 + +**Files:** +- Create: `crates/apl-session-valkey/src/store.rs` +- Test: `crates/apl-session-valkey/tests/valkey_store_integration.rs` (U7 owns the harness; basic per-method assertions can start here) + +**Approach:** +- Key = `<key_prefix>:<sha256(session_id)>` (the configured key prefix followed by the SHA-256 of the session id); SET value-space. +- `append_labels`: `pipe().atomic().sadd(key, members).ignore()` and, when TTL configured, `.expire(key, ttl).ignore()` — one MULTI/EXEC round trip (R16); inspect EXEC replies and map any error to `Err`. +- `load_labels`: `SMEMBERS key` → `Ok(set)`; on configured TTL, refresh via `EXPIRE` that is **fail-open for the request** — return the read labels `Ok` and alarm on refresh failure (the read succeeded; see Key Technical Decisions). `Ok(empty)` for a missing key (R15). +- Error mapping (R5): `is_timeout|is_io_error|is_connection_dropped` or `ErrorKind::Parse|UnexpectedReturnType` → `SessionStoreError`. Primary-only connection (R6 — no replica routing). +- Startup self-checks: `CONFIG GET maxmemory-policy` → warn if not `noeviction` (R9); warn if configured TTL < declared max session-identity lifetime (R17). Exact timing per Deferred-to-Implementation. + +**Execution note:** Implement append/load test-first against the U7 container harness. + +**Patterns to follow:** redis-rs `pipe().atomic()` and `RedisError` predicates (External References); R15/R5 mapping sketch from the brainstorm. + +**Test scenarios:** +- Happy path: append then load round-trips the union (single node). 
Covers AE4 (single-node leg). +- Edge case: unknown session → `Ok(empty)` (key-miss, not error). Covers R15. +- Edge case: concurrent appends from two connections → final SMEMBERS is the full union (regression test for client-side RMW). Covers R16. +- Error path: unreachable endpoint → `Err` (fail-closed). Covers AE1/R5. +- Error path: reachable but undecodable reply → `Err`, not empty. Concrete mechanism: pre-seed the key as a non-SET type so `SMEMBERS`/EXEC returns `WRONGTYPE` → maps to `ErrorKind::UnexpectedReturnType`/`Parse` → `Err`. Covers R5 (closes the gap that fake-store and unreachable-container tests both miss). +- TTL: append/load at sub-TTL refreshes expiry; SADD alone does not reset a refreshed TTL unexpectedly. Covers AE2. +- TTL refresh fail-open: load succeeds but the `EXPIRE` refresh fails (e.g. ACL-denied EXPIRE) → returns `Ok(labels)` plus an alarm, not `Err`. + +**Verification:** Integration tests (U7) pass against a live container; error mapping distinguishes key-miss from connection/protocol failure. + +--- + +- U6. **Valkey factory + feature-gated FFI wiring** + +**Goal:** Implement `ValkeySessionStoreFactory` and wire it into `cpex-ffi` behind the `valkey` cargo feature. + +**Requirements:** R2, R13; AE5 + +**Dependencies:** U3, U5 + +**Files:** +- Create: `crates/apl-session-valkey/src/factory.rs` (+ `pub const KIND = "valkey"` in `lib.rs`) +- Modify: `crates/cpex-ffi/Cargo.toml` (optional dep + `valkey = ["dep:apl-session-valkey"]`) +- Modify: `crates/cpex-ffi/src/apl.rs` (`#[cfg(feature = "valkey")]` register the factory into `session_store_factories`) +- Test: `crates/apl-session-valkey/tests/valkey_store_integration.rs` (selection path) + +**Approach:** +- `ValkeySessionStoreFactory`: `kind() = "valkey"`, `build(cfg)` parses config (U4) and constructs `Arc`. 
+- FFI: add the optional dependency and feature exactly like `cedarling`; under `#[cfg(feature = "valkey")]`, push the factory into `opts.session_store_factories` in `cpex_apl_install`. Default (feature off) build is byte-for-byte unchanged. + +**Patterns to follow:** `CelPdpFactory`/`CedarDirectPdpFactory`; `cedarling` feature wiring in `cpex-ffi/Cargo.toml` and `apl.rs`. + +**Test scenarios:** +- Happy path: `kind: valkey` config block + registered factory → `ValkeySessionStore` is the active store end-to-end. Covers AE5. +- Edge case: feature off → no Valkey symbols linked; default FFI build/test unchanged. +- Error path: malformed valkey block → config load fails with a clear error. + +**Verification:** `cargo build -p cpex-ffi` (no features) unchanged; `cargo build -p cpex-ffi --features valkey` links the backend; AE5 passes. + +--- + +- U7. **Integration tests + Valkey container** + +**Goal:** Container-backed integration tests covering the cross-node and failure behaviors, skipping loudly without Docker and enforced in CI. + +**Requirements:** R12; AE2, AE4; verifies R5, R6, R16, R18 + +**Dependencies:** U5, U6 + +**Files:** +- Create: `crates/apl-session-valkey/tests/valkey_store_integration.rs` +- Create: `deploy/valkey-compose.yml` +- Modify: `crates/apl-session-valkey/Cargo.toml` (`[dev-dependencies]` testcontainers-modules valkey, tokio test features) +- Modify: CI workflow (env gate `REQUIRE_VALKEY_TESTS=1`) — note in Documentation if CI config lives outside this repo + +**Approach:** +- `testcontainers-modules` valkey image, tag pinned to the prod version. Tests `#[ignore]` by default, run via a dedicated `--ignored` job. +- Skip-cleanly: when Docker is absent and `REQUIRE_VALKEY_TESTS` is unset → `eprintln!` a loud SKIPPED line and return; when the env var is set (CI) → a `.start()` failure is a hard `panic!`. No silent `Ok(())`. +- Cross-node union: simulate two nodes by two pool connections appending concurrently; assert the union (AE4, R16). 
+ +**Patterns to follow:** `apl-pdp-cel/tests/visitor_cel_config.rs` harness; `mockito` skip-discipline precedent; PR #67 anti-pattern (no silent no-op). + +**Test scenarios:** +- Integration: append on connection A, load on connection B → unioned labels (AE4). +- Integration: TTL refresh on load/append at sub-TTL extends expiry (AE2). +- Integration: `CONFIG GET maxmemory-policy` is asserted `noeviction` in the test container; `evicted_keys == 0`. +- Integration: restricted ACL user denied a command outside its grant (asserts the error). +- Error path: stopped/unreachable container → store returns `Err` (fail-closed, R5). +- Edge case: Docker absent without the env gate → test prints SKIPPED and returns; with the gate set → hard failure. + +**Verification:** Tests pass against a live container locally and in the CI `--ignored` job; the suite cannot green-wash when the container is missing in CI. + +--- + +- U8. **Operator documentation** + +**Goal:** Runbook covering the operator-owned controls the backend depends on. + +**Requirements:** R8, R9, R10 + +**Dependencies:** U4 (config shape stable) + +**Files:** +- Create: `docs/operations/valkey-session-store.md` + +**Approach:** +- Document: `maxmemory-policy noeviction` (+ `evicted_keys == 0` monitoring), least-privilege ACL (`on >secret resetchannels -@all ~taint:v1:* +sadd +smembers +expire +config|get`), TLS/mTLS setup and the non-localhost TLS requirement, the TTL soundness rule (TTL ≥ max session-identity lifetime) and the startup warning, the sliding-TTL refresh-failure alarm (a persistently-failing refresh risks silent key expiry → taint loss), credential handling/rotation (overlap rotation; mTLS auto-reload), HA via a fronting endpoint. +- **Blast radius (state precisely):** the availability tradeoff is "a Valkey outage → fail-closed denial of **session-bearing** requests." 
Anonymous/sessionless traffic (no resolved `session_id`) loads no state and is **not** denied by a store outage — do not overstate it as fleet-wide. + +**Test scenarios:** Test expectation: none — documentation only. + +**Verification:** Runbook covers every operator-owned precondition referenced by R8/R9/R10 and the Key Decisions availability tradeoff. + +--- + +## System-Wide Impact + +- **Interaction graph:** The trait change touches every `SessionStore` consumer — `MemorySessionStore`, `CmfPluginInvoker` (`for_request`, `persist_session`), `AplRouteHandler::invoke`, and all test files constructing `MemorySessionStore` (~10 under `crates/apl-cpex/tests/`, plus `apl-pdp-cel`/`apl-pdp-cedar-direct` visitor tests). The string-typed trait is also the surface future apl-mcp/apl-langgraph bridges inherit — the `Result` becomes part of their contract (intended, R4). +- **Error propagation:** Load error → `Box` out of `invoke` (host failure, pre-decision). Append error → `continue_processing=false` + violation (Deny, post-decision) + distinguished alarm. Build/config error → `VisitorError` failing `load_config_yaml`. +- **State lifecycle risks:** Concurrent cross-node append must be server-side SADD (U5/R16) — client-side RMW would lose labels. TTL refresh-on-load is a write; a refresh failure must not corrupt an already-successful read (R7). +- **API surface parity:** `AplOptions` gains `session_store_factories`; existing callers using `AplOptions::in_process()` are unaffected (empty factory list → memory default). +- **Integration coverage:** Cross-node union, fail-closed-on-unreachable, append-fail-closed→Deny, and key-miss→empty are not provable by unit mocks alone — covered by U2 (fake-store) and U7 (live container). +- **Unchanged invariants:** Monotonic union semantics and unknown-session→empty are preserved across both backends (R15). 
Default (no config) behavior is byte-for-byte unchanged (R3, AE3); default FFI artifact size unchanged when the `valkey` feature is off (R13). + +--- + +## Risks & Dependencies + +| Risk | Mitigation | +|------|------------| +| Trait `Result` change ripples to ~10 test files + bridges; noisy diff | U1 lands the signature + memory adaptation atomically; mechanical `.unwrap()`/`?` updates; CI `--workspace` catches stragglers. | +| Config-seam late-binding subtle (walk order vs request time) | `visit_global` runs strictly before `install_handler`, so the selected store is swapped into the visitor before any handler clones it (no `ArcSwap` in v0); U3 tests the default-then-swap path explicitly. | +| Fail-closed couples availability of session-bearing requests to one Valkey (accepted tradeoff; sessionless traffic is unaffected) | Documented (origin Key Decisions + U8); bounded by connect/command timeouts, ≤1 budgeted retry, circuit breaker → immediate fail-closed. | +| Integration tests silently no-op without Docker (PR #67 lesson) | Loud SKIPPED + CI env gate (`REQUIRE_VALKEY_TESTS=1`) makes a missing container a hard CI failure. | +| Client/dep weight creeping into FFI artifact | `default-features=false` + feature gate + `default-members` exclusion (U4/U6); verify with `cargo tree` and artifact-size check. | +| `noeviction` is operator-owned; the client can't enforce it | Startup `CONFIG GET maxmemory-policy` warning (U5/R9) + runbook + `evicted_keys` monitoring (U8). | +| Config live-reload would swap the store under in-flight requests holding the prior `Arc` | v0 does not support session-store live-reload (Deferred to Implementation); documented limitation. `ArcSwap` handle is the future fix if reload is required. | +| Persistently-failing sliding-TTL refresh silently expires keys → cross-request taint loss | Refresh failure is alarmed (U5); runbook calls out the monitoring signal (U8); `noeviction` does not cover TTL expiry, so the alarm is the control. | + +--- + +## Documentation / Operational Notes + +- New operator runbook `docs/operations/valkey-session-store.md` (U8). 
+- New `deploy/valkey-compose.yml` for local dev/integration. +- CI gains an `--ignored` Valkey integration job with `REQUIRE_VALKEY_TESTS=1`; if CI config lives in another repo, flag the change there. +- After merge, capture the trait-change and client decisions in a durable learnings location (no `docs/solutions/` exists today). + +--- + +## Sources & References + +- **Origin document:** [docs/brainstorms/valkey-session-store-requirements.md](../brainstorms/valkey-session-store-requirements.md) +- Trait + call sites: `crates/apl-cpex/src/session_store.rs`, `crates/apl-cpex/src/cmf_invoker.rs`, `crates/apl-cpex/src/route_handler.rs` +- Factory pattern: `crates/apl-core/src/step.rs`, `crates/apl-cpex/src/visitor.rs`, `crates/apl-cpex/src/register.rs` +- Feature-gate precedent: `crates/cpex-ffi/Cargo.toml`, root `Cargo.toml`, `crates/cpex-ffi/src/apl.rs` +- Reference crate: `crates/apl-pdp-cel/` (layout, factory, error, tests) +- rustls discipline: `crates/apl-delegator-oauth/Cargo.toml`, `crates/apl-identity-jwt/Cargo.toml` +- External: redis-rs (docs.rs/redis), deadpool-redis, Valkey docs (transactions/expire/eviction/replication/acl/tls), testcontainers-modules, AWS Builders' Library (timeouts/retries/circuit-breaker)