From 8fc14b1dbd7dda0b4737044511d06c0cbb2798c5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 24 Jun 2026 17:58:29 -0700 Subject: [PATCH 1/2] Add Brev profiler CLI flow --- Cargo.lock | 461 +++++++++++++++++++++++++++++++++++++++++++-- Cargo.toml | 1 + README.md | 28 ++- docs/profiling.md | 55 +++++- src/cmd/mod.rs | 76 ++++++-- src/cmd/submit.rs | 111 +++++++++-- src/service/mod.rs | 309 +++++++++++++++++++++++++++++- 7 files changed, 994 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bf83f27..3f6d376 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,17 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -88,6 +99,15 @@ version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -142,11 +162,26 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bumpalo" 
-version = "3.17.0" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + +[[package]] +name = "byteorder" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" @@ -154,6 +189,25 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cassowary" version = "0.3.0" @@ -175,6 +229,8 @@ version = "1.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -209,6 +265,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "clap" version = "4.5.36" @@ -278,6 +344,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "core-foundation" version = "0.9.4" @@ -294,6 +366,45 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crossterm" version = "0.27.0" @@ -319,6 +430,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "ctrlc" version = "3.4.6" @@ -329,6 +450,40 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "deflate64" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2" + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" + +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + [[package]] name = "dirs" version = "5.0.1" @@ -398,6 +553,16 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -471,6 +636,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -489,9 +664,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasi 0.14.2+wasi-0.2.4", + "wasm-bindgen", ] [[package]] @@ -536,6 +713,15 @@ version = "0.5.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "home" version = "0.5.11" @@ -790,6 +976,15 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -837,7 +1032,7 @@ dependencies = [ "combine", "jni-sys", "log", - "thiserror", + "thiserror 1.0.69", "walkdir", "windows-sys 0.45.0", ] @@ -848,6 +1043,16 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.2", + "libc", +] + [[package]] name = "js-sys" version = "0.3.77" @@ -898,9 +1103,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.27" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad" [[package]] name = "lru" @@ -911,6 +1116,27 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + 
"byteorder", + "crc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "malloc_buf" version = "0.0.6" @@ -949,6 +1175,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -992,6 +1219,12 @@ dependencies = [ "libc", ] +[[package]] +name = "num-conv" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441" + [[package]] name = "num-traits" version = "0.2.19" @@ -1060,6 +1293,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -1078,6 +1321,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + [[package]] name = "popcorn-cli" version = "0.0.0-dev" @@ -1101,8 +1350,15 @@ dependencies = [ "tokio", "urlencoding", "webbrowser", + "zip", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "proc-macro2" version = "1.0.94" @@ -1170,7 +1426,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1318,18 +1574,28 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -1373,6 +1639,17 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1409,6 +1686,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + [[package]] name = "slab" version = "0.4.9" @@ -1484,6 +1767,12 @@ dependencies = [ "syn", ] 
+[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.100" @@ -1552,7 +1841,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", ] [[package]] @@ -1566,6 +1864,36 @@ dependencies = [ "syn", ] +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "time" +version = "0.3.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85c17d80feb7334b40c484e45ed1a5273dfd8bfda537c3be2e74a06a6686f327" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde_core", + "time-core", +] + +[[package]] +name = "time-core" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" + [[package]] name = "tinystr" version = "0.7.6" @@ -1659,6 +1987,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + [[package]] name = 
"unicase" version = "2.8.1" @@ -1741,6 +2075,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" @@ -2214,6 +2554,15 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.7.5" @@ -2259,6 +2608,26 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zeroize" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c50655cbb0fe3fc43170059e702f1ce5e19b84cec58dc87b037a09935c2f328" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zerovec" version = "0.10.4" @@ -2280,3 +2649,73 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zip" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +dependencies = [ + "aes", + "arbitrary", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "deflate64", + "displaydoc", + "flate2", + "getrandom 0.3.2", + "hmac", + "indexmap", + "lzma-rs", + "memchr", 
+ "pbkdf2", + "sha1", + "thiserror 2.0.18", + "time", + "xz2", + "zeroize", + "zopfli", + "zstd", +] + +[[package]] +name = "zopfli" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index d8d6c05..8780959 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ chrono = "0.4" urlencoding = "2.1.3" bytes = "1.11.1" futures-util = "0.3.31" +zip = "2.2.2" [dev-dependencies] tempfile = "3.10" diff --git a/README.md b/README.md index 12e13f2..24e9cc4 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,27 @@ Tested on linux and mac but should just work on Windows as well. ## New: Nsight Compute Profiling -Profile your kernels with `--mode profile` and get detailed metrics. Currently only available for the NVFP4 Blackwell competition (Modal, which we use for other competitions, does not support NCU). See [docs/profiling.md](docs/profiling.md) for details. +Profile your kernels with `--mode profile` and get detailed metrics. Modal does not expose NCU, so Modal-ranked competitions can use the Brev-backed B200 profiler. 
See [docs/profiling.md](docs/profiling.md) for details. + +For GPU Mode competitions that normally rank on Modal, you can request a Brev B200 Nsight Compute run without changing your `submission.py`: + +```bash +POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn-cli submission.py --profile-brev +``` + +`--profile-brev` requires `POPCORN_BREV_PROFILER_URL` or `BREV_PROFILER_URL`. +Do not point it at a public shared service until the Brev worker runs +untrusted submissions in a per-job container or equivalent locked-down +environment with no SSH keys, operator secrets, or other users' submissions +mounted. + +The CLI downloads and extracts the `.ncu-rep` file, prints a clickable terminal +link to the report, and ends with a macOS command that opens it in Nsight +Compute: + +```bash +open -a "NVIDIA Nsight Compute" profile.0-.../profile.ncu-rep +``` ## [NEW] Submit To The Linear Algebra Competition @@ -102,6 +122,12 @@ popcorn submit solution.py # Direct submission with all options popcorn submit --leaderboard grayscale_v2 --gpu A100 --mode leaderboard solution.py +# Nsight Compute profile on the GPU Mode Brev B200 +POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn submit --leaderboard grayscale_v2 --profile-brev solution.py + +# Profile one benchmark shape, useful for quick QR demos +POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn submit --leaderboard qr --profile-brev --benchmark-index 0 solution.py + # Plain output mode (no TUI, good for CI/scripts) popcorn submit --no-tui --leaderboard grayscale_v2 --gpu A100 --mode test solution.py diff --git a/docs/profiling.md b/docs/profiling.md index 45601c6..f8c7c78 100644 --- a/docs/profiling.md +++ b/docs/profiling.md @@ -2,7 +2,7 @@ Profile your kernels directly from the CLI and get detailed Nsight Compute metrics. This is particularly useful for the NVIDIA NVFP4 Blackwell competition where you need to optimize tensor core utilization. 
-**Note:** Profiling is currently only available for the NVFP4 Blackwell competition. Modal, which we use for other competitions, does not support NCU. +**Note:** Modal does not expose NCU. For Modal-ranked competitions, use the Brev-backed B200 profiler below. ## Quick Start @@ -10,6 +10,30 @@ Profile your kernels directly from the CLI and get detailed Nsight Compute metri popcorn-cli submit submission.py --leaderboard nvfp4_dual_gemm --gpu NVIDIA --mode profile --no-tui ``` +For competitions whose ranked runs use Modal, use the Brev-backed B200 profiler: + +```bash +POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn-cli submission.py --profile-brev +``` + +`--profile-brev` requires `POPCORN_BREV_PROFILER_URL` or `BREV_PROFILER_URL`. +The endpoint should be a local/staging profiler or a hardened shared service. +Do not expose a shared Brev profiler to untrusted users until submissions run +in a per-job container or equivalent locked-down environment with no SSH keys, +operator secrets, or other users' submissions mounted. + +This uses the `#!POPCORN leaderboard ...` directive in `submission.py`. 
If the file does not include a leaderboard directive, pass one explicitly: + +```bash +POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn-cli submit submission.py --leaderboard grayscale_v2 --profile-brev +``` + +For a quick single-shape QR profile: + +```bash +POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn-cli submit submission.py --leaderboard qr --profile-brev --benchmark-index 0 +``` + ## Expected Output The profiler returns three key metric tables for each benchmark: @@ -46,20 +70,39 @@ Stall Barrier inst 0.75 ## Trace Files -After profiling, a zip file is saved to your current directory: +After profiling, a zip file is saved to your current directory and the `.ncu-rep` +file is extracted next to it: ``` -profile_20260113_031052_result0_profile0.zip +profile.0-batch-20-n-32-cond-1-seed-43214.zip +profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep ``` This contains a `.ncu-rep` file (the full Nsight Compute report): ``` -$ unzip -l profile_20260113_031052_result0_profile0.zip +$ unzip -l profile.0-batch-20-n-32-cond-1-seed-43214.zip Length Date Time Name --------- ---------- ----- ---- 2178383 01-13-2026 03:10 profile.ncu-rep ``` -You can open this file in the Nsight Compute GUI for detailed analysis: +The CLI prints a clickable terminal link to the extracted report and makes the +last line a macOS command that opens it in Nsight Compute: ```bash -ncu-ui profile.ncu-rep +open -a "NVIDIA Nsight Compute" profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep ``` + +## Operator Notes + +The CLI does not assume a Brev provider username or home directory. Configure +the profiler service with explicit paths, or derive them from `$HOME` on the +Brev machine, instead of hardcoding paths such as `/home/`. + +For SSH access, prefer a dedicated restricted SSH key for the profiler proxy. +If you use the Brev CLI to maintain host metadata, run `brev refresh` once and +then use normal `ssh`/`scp` against the refreshed host alias. 
Avoid putting +`brev shell` or `brev copy` in per-job paths because they refresh each time. + +The Brev worker should run each untrusted `submission.py` inside a container or +similarly isolated runtime before a public profiler endpoint is enabled. +Container isolation is not a complete sandbox, but it materially reduces the +risk of submissions reading host secrets, SSH keys, or other submissions. diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index 21b72bb..a2b7a92 100644 --- a/src/cmd/mod.rs +++ b/src/cmd/mod.rs @@ -62,6 +62,14 @@ pub struct Cli { #[arg(long)] pub mode: Option, + /// Profile on the GPU Mode Brev B200 and save the Nsight Compute trace locally + #[arg(long)] + pub profile_brev: bool, + + /// Optional: Profile a single benchmark index when using --profile-brev + #[arg(long)] + pub benchmark_index: Option, + // Optional: Specify output file #[arg(short, long)] pub output: Option, @@ -137,6 +145,14 @@ enum Commands { #[arg(long)] mode: Option, + /// Profile on the GPU Mode Brev B200 and save the Nsight Compute trace locally + #[arg(long)] + profile_brev: bool, + + /// Optional: Profile a single benchmark index when using --profile-brev + #[arg(long)] + benchmark_index: Option, + // Optional: Specify output file #[arg(short, long)] output: Option, @@ -184,6 +200,8 @@ pub async fn execute(cli: Cli) -> Result<()> { gpu, leaderboard, mode, + profile_brev, + benchmark_index, output, no_tui, }) => { @@ -198,23 +216,34 @@ pub async fn execute(cli: Cli) -> Result<()> { // Use filepath from Submit command first, fallback to top-level filepath let final_filepath = filepath.or(cli.filepath); + let final_gpu = if profile_brev { + Some("B200_Brev".to_string()) + } else { + gpu + }; + let final_mode = if profile_brev { + Some("profile".to_string()) + } else { + mode + }; - if no_tui { + if no_tui || profile_brev { submit::run_submit_plain( final_filepath, // Resolved filepath - gpu, // From Submit command + final_gpu, // From Submit command leaderboard, // From 
Submit command - mode, // From Submit command + final_mode, // From Submit command cli_id, + benchmark_index.or(cli.benchmark_index), output, // From Submit command ) .await } else { submit::run_submit_tui( final_filepath, // Resolved filepath - gpu, // From Submit command + final_gpu, // From Submit command leaderboard, // From Submit command - mode, // From Submit command + final_mode, // From Submit command cli_id, output, // From Submit command ) @@ -269,7 +298,9 @@ pub async fn execute(cli: Cli) -> Result<()> { } None => { // Check if any of the submission-related flags were used at the top level - if cli.gpu.is_some() || cli.leaderboard.is_some() || cli.mode.is_some() { + if !cli.profile_brev + && (cli.gpu.is_some() || cli.leaderboard.is_some() || cli.mode.is_some()) + { return Err(anyhow!( "Please use the 'submit' subcommand when specifying submission options:\n\ popcorn-cli submit [--gpu GPU] [--leaderboard LEADERBOARD] [--mode MODE] FILEPATH" @@ -287,16 +318,29 @@ pub async fn execute(cli: Cli) -> Result<()> { ) })?; - // Run TUI with only filepath, no other options - submit::run_submit_tui( - Some(top_level_filepath), - None, // No GPU option - None, // No leaderboard option - None, // No mode option - cli_id, - None, // No output option - ) - .await + if cli.profile_brev { + submit::run_submit_plain( + Some(top_level_filepath), + Some("B200_Brev".to_string()), + cli.leaderboard, + Some("profile".to_string()), + cli_id, + cli.benchmark_index, + cli.output, + ) + .await + } else { + // Run TUI with only filepath, no other options + submit::run_submit_tui( + Some(top_level_filepath), + None, // No GPU option + None, // No leaderboard option + None, // No mode option + cli_id, + None, // No output option + ) + .await + } } else { Err(anyhow!( "No command or submission file specified. Use --help for usage." 
diff --git a/src/cmd/submit.rs b/src/cmd/submit.rs index ec8e6fb..29364f6 100644 --- a/src/cmd/submit.rs +++ b/src/cmd/submit.rs @@ -9,6 +9,7 @@ use ratatui::prelude::*; use ratatui::style::{Color, Style, Stylize}; use ratatui::text::{Line, Span}; use ratatui::widgets::{Block, Borders, List, ListItem, ListState}; +use serde_json::Value; use tokio::task::JoinHandle; use crate::models::{AppState, GpuItem, LeaderboardItem, SubmissionModeItem}; @@ -686,6 +687,7 @@ pub async fn run_submit_plain( leaderboard: Option, mode: Option, cli_id: String, + benchmark_index: Option, output: Option, ) -> Result<()> { let file_to_submit = match filepath { @@ -747,18 +749,34 @@ pub async fn run_submit_plain( // Create client and submit let client = service::create_client(Some(cli_id))?; - let result = service::submit_solution( - &client, - &file_to_submit, - &file_content, - &final_leaderboard, - &final_gpu, - &final_mode, - Some(Box::new(|msg| { - eprintln!("{}", msg); - })), - ) - .await?; + let result = if final_mode.eq_ignore_ascii_case("profile") + && final_gpu.eq_ignore_ascii_case("B200_Brev") + { + service::profile_brev_solution( + &client, + &file_to_submit, + &file_content, + &final_leaderboard, + benchmark_index, + Some(Box::new(|msg| { + eprintln!("{}", msg); + })), + ) + .await? + } else { + service::submit_solution( + &client, + &file_to_submit, + &file_content, + &final_leaderboard, + &final_gpu, + &final_mode, + Some(Box::new(|msg| { + eprintln!("{}", msg); + })), + ) + .await? 
+ }; // Clean up the result text let trimmed = result.trim(); @@ -768,6 +786,8 @@ pub async fn run_submit_plain( trimmed }; + let profile_report_links = profile_report_links(content); + let content = content.replace("\\n", "\n"); // Write to file if output is specified @@ -783,6 +803,73 @@ pub async fn run_submit_plain( // Print to stdout println!("\n{}", content); + print_profile_report_links(profile_report_links); Ok(()) } + +#[derive(Debug)] +struct ProfileReportLink { + file_url: String, + label: String, + open_command: String, +} + +fn profile_report_links(content: &str) -> Vec<ProfileReportLink> { + let Ok(result) = serde_json::from_str::<Value>(content) else { + return Vec::new(); + }; + let Some(artifacts) = result + .get("downloaded_artifacts") + .and_then(|value| value.as_array()) + else { + return Vec::new(); + }; + + let mut links = Vec::new(); + for artifact in artifacts { + let Some(reports) = artifact.get("reports").and_then(|value| value.as_array()) else { + continue; + }; + for report in reports { + let Some(file_url) = report.get("file_url").and_then(|value| value.as_str()) else { + continue; + }; + let label = report + .get("path") + .and_then(|value| value.as_str()) + .unwrap_or("profile.ncu-rep"); + let open_command = report + .get("open_command") + .and_then(|value| value.as_str()) + .map(ToOwned::to_owned) + .unwrap_or_else(|| { + format!("open -a \"NVIDIA Nsight Compute\" {}", shell_quote(label)) + }); + links.push(ProfileReportLink { + file_url: file_url.to_string(), + label: label.to_string(), + open_command, + }); + } + } + links +} + +fn print_profile_report_links(links: Vec<ProfileReportLink>) { + for link in links { + println!( + "\nOpen in Nsight Compute: {}", + terminal_link(&link.file_url, &link.label) + ); + println!("{}", link.open_command); + } +} + +fn shell_quote(value: &str) -> String { + format!("'{}'", value.replace('\'', "'\\''")) +} + +fn terminal_link(url: &str, label: &str) -> String { + format!("\x1b]8;;{}\x1b\\{}\x1b]8;;\x1b\\", url, label) +} diff --git
a/src/service/mod.rs b/src/service/mod.rs index 07a43d2..2ecb96a 100644 --- a/src/service/mod.rs +++ b/src/service/mod.rs @@ -6,10 +6,13 @@ use reqwest::multipart::{Form, Part}; use reqwest::Client; use serde_json::Value; use std::env; -use std::path::Path; +use std::fs::File as StdFile; +use std::io::Cursor; +use std::path::{Path, PathBuf}; use std::time::Duration; use tokio::io::AsyncWriteExt; use tokio::time::sleep; +use zip::ZipArchive; use crate::models::{ GpuItem, LeaderboardItem, SubmissionDetails, SubmissionJobStatus, SubmissionRun, @@ -602,6 +605,310 @@ pub async fn submit_solution>( .await } +pub async fn profile_brev_solution>( + client: &Client, + filepath: P, + file_content: &[u8], + leaderboard: &str, + benchmark_index: Option, + on_log: Option>, +) -> Result { + let base_url = env::var("POPCORN_BREV_PROFILER_URL") + .or_else(|_| env::var("BREV_PROFILER_URL")) + .map_err(|_| { + anyhow!( + "POPCORN_BREV_PROFILER_URL or BREV_PROFILER_URL is not set. Configure a hardened Brev profiler endpoint before using --profile-brev." + ) + })?; + let base_url = base_url.trim_end_matches('/'); + + let filename = filepath + .as_ref() + .file_name() + .ok_or_else(|| anyhow!("Invalid filepath"))? + .to_string_lossy(); + + let part = Part::bytes(file_content.to_vec()).file_name(filename.to_string()); + let mut form = Form::new() + .part("file", part) + .text("leaderboard", leaderboard.to_string()); + if let Some(index) = benchmark_index { + form = form.text("benchmark_index", index.to_string()); + } + + let resp = client + .post(format!("{}/profile", base_url)) + .multipart(form) + .timeout(Duration::from_secs(60)) + .send() + .await?; + + let status = resp.status(); + if !status.is_success() { + return Err(anyhow!( + "Profiler returned status {}: {}", + status, + response_error_text(resp).await? 
+ )); + } + + let accepted: Value = resp.json().await?; + let job_id = accepted + .get("job_id") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Profiler did not return a job_id"))? + .to_string(); + + if let Some(ref cb) = on_log { + cb(format!( + "Profile job {} accepted. Waiting for results...", + job_id + )); + } + + let mut elapsed = 0; + loop { + let resp = match client + .get(format!("{}/jobs/{}", base_url, job_id)) + .timeout(Duration::from_secs(30)) + .send() + .await + { + Ok(resp) => resp, + Err(err) => { + if elapsed >= SUBMISSION_POLL_TIMEOUT_SECONDS { + return Err(err.into()); + } + if let Some(ref cb) = on_log { + cb(format!( + "Profile job {} status poll failed: {}. Retrying...", + job_id, err + )); + } + sleep(Duration::from_secs(SUBMISSION_POLL_INTERVAL_SECONDS)).await; + elapsed += SUBMISSION_POLL_INTERVAL_SECONDS; + continue; + } + }; + + let status = resp.status(); + if !status.is_success() { + return Err(anyhow!( + "Profiler status returned {}: {}", + status, + response_error_text(resp).await? 
+ )); + } + + let job: Value = resp.json().await?; + let job_status = job + .get("status") + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + let queue_position = job.get("queue_position").and_then(|v| v.as_i64()); + + if let Some(ref cb) = on_log { + match queue_position { + Some(pos) => cb(format!( + "Profile job {} status: {} (queue position {}, {}s)", + job_id, job_status, pos, elapsed + )), + None => cb(format!( + "Profile job {} status: {} ({}s)", + job_id, job_status, elapsed + )), + } + } + + match job_status { + "succeeded" => { + let artifacts = download_profile_artifacts(client, base_url, &job).await?; + let mut result = job; + result["downloaded_artifacts"] = Value::Array( + artifacts + .iter() + .map(|artifact| artifact.to_json()) + .collect(), + ); + return serde_json::to_string_pretty(&result) + .map_err(|e| anyhow!("Failed to format profile result: {}", e)); + } + "failed" | "timed_out" => { + let error = job + .get("error") + .and_then(|v| v.as_str()) + .unwrap_or("No error details were provided"); + return Err(anyhow!("Profile job {} {}: {}", job_id, job_status, error)); + } + _ => {} + } + + if elapsed >= SUBMISSION_POLL_TIMEOUT_SECONDS { + return Err(anyhow!( + "Timed out waiting for profile job {} after {} seconds", + job_id, + SUBMISSION_POLL_TIMEOUT_SECONDS + )); + } + + sleep(Duration::from_secs(SUBMISSION_POLL_INTERVAL_SECONDS)).await; + elapsed += SUBMISSION_POLL_INTERVAL_SECONDS; + } +} + +#[derive(Debug)] +struct DownloadedProfileArtifact { + zip_path: PathBuf, + reports: Vec, +} + +impl DownloadedProfileArtifact { + fn to_json(&self) -> Value { + let reports: Vec = self + .reports + .iter() + .map(|path| { + serde_json::json!({ + "path": path.display().to_string(), + "file_url": file_url(path), + "open_command": format!( + "open -a \"NVIDIA Nsight Compute\" {}", + shell_quote(&path.display().to_string()) + ), + "ncu_ui_command": format!("ncu-ui {}", shell_quote(&path.display().to_string())), + }) + }) + .collect(); + + 
serde_json::json!({ + "zip_path": self.zip_path.display().to_string(), + "reports": reports, + }) + } +} + +async fn download_profile_artifacts( + client: &Client, + base_url: &str, + job: &Value, +) -> Result> { + let artifacts = job + .get("artifacts") + .and_then(|v| v.as_array()) + .ok_or_else(|| anyhow!("Profiler job did not include artifacts"))?; + + let mut saved = Vec::new(); + for artifact in artifacts { + let name = artifact + .get("name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Profiler artifact missing name"))?; + let url = artifact + .get("url") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Profiler artifact missing url"))?; + let artifact_url = if url.starts_with("http://") || url.starts_with("https://") { + url.to_string() + } else { + format!("{}{}", base_url, url) + }; + let bytes = client + .get(artifact_url) + .timeout(Duration::from_secs(120)) + .send() + .await? + .error_for_status()? + .bytes() + .await?; + let zip_path = PathBuf::from(name); + std::fs::write(&zip_path, bytes.as_ref()) + .map_err(|e| anyhow!("Failed to write profile artifact {}: {}", name, e))?; + let reports = extract_ncu_reports(&zip_path, bytes.as_ref())?; + saved.push(DownloadedProfileArtifact { zip_path, reports }); + } + Ok(saved) +} + +fn extract_ncu_reports(zip_path: &Path, bytes: &[u8]) -> Result> { + let mut archive = ZipArchive::new(Cursor::new(bytes)).map_err(|e| { + anyhow!( + "Failed to read profile artifact {}: {}", + zip_path.display(), + e + ) + })?; + let extract_dir = zip_path.with_extension(""); + std::fs::create_dir_all(&extract_dir).map_err(|e| { + anyhow!( + "Failed to create profile report directory {}: {}", + extract_dir.display(), + e + ) + })?; + + let mut reports = Vec::new(); + for idx in 0..archive.len() { + let mut entry = archive.by_index(idx).map_err(|e| { + anyhow!( + "Failed to read profile artifact entry in {}: {}", + zip_path.display(), + e + ) + })?; + if !entry.name().ends_with(".ncu-rep") { + continue; + } + + let 
file_name = Path::new(entry.name()) + .file_name() + .ok_or_else(|| anyhow!("Profile artifact contains an invalid report path"))?; + let mut report_path = extract_dir.join(file_name); + if report_path.exists() { + let stem = report_path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("profile"); + report_path = extract_dir.join(format!("{}-{}.ncu-rep", stem, idx)); + } + + let mut output = StdFile::create(&report_path) + .map_err(|e| anyhow!("Failed to create {}: {}", report_path.display(), e))?; + std::io::copy(&mut entry, &mut output) + .map_err(|e| anyhow!("Failed to extract {}: {}", report_path.display(), e))?; + reports.push(report_path); + } + Ok(reports) +} + +fn file_url(path: &Path) -> String { + let absolute = path + .canonicalize() + .unwrap_or_else(|_| path.to_path_buf()) + .display() + .to_string(); + format!( + "file://{}", + urlencoding::encode(&absolute).replace("%2F", "/") + ) +} + +fn shell_quote(value: &str) -> String { + format!("'{}'", value.replace('\'', "'\\''")) +} + +async fn response_error_text(resp: reqwest::Response) -> Result { + let text = resp.text().await?; + Ok(serde_json::from_str::(&text) + .ok() + .and_then(|v| { + v.get("detail") + .or_else(|| v.get("message")) + .and_then(|d| d.as_str()) + .map(str::to_string) + }) + .unwrap_or(text)) +} + async fn submit_solution_background>( client: &Client, filepath: P, From a9475db773a846ddf3c5d62adce0dc67c43a7e49 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 24 Jun 2026 22:57:02 -0700 Subject: [PATCH 2/2] Document QR v2 NCU profiling flow --- README.md | 25 ++++---- docs/linalg-qr-b200.md | 15 ++++- docs/profiling.md | 129 ++++++++++++++++++----------------------- 3 files changed, 80 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index 24e9cc4..f5a597c 100644 --- a/README.md +++ b/README.md @@ -5,22 +5,19 @@ A command-line interface tool for submitting solutions to the [gpumode.com](http Tested on linux and mac but should just work on Windows as well. 
-## New: Nsight Compute Profiling +## New: QR v2 Nsight Compute Profiling -Profile your kernels with `--mode profile` and get detailed metrics. Modal does not expose NCU, so Modal-ranked competitions can use the Brev-backed B200 profiler. See [docs/profiling.md](docs/profiling.md) for details. +Profile QR v2 submissions on the hosted GPU Mode B200 Nsight Compute service. +See [docs/profiling.md](docs/profiling.md) for a complete copy-paste flow. -For GPU Mode competitions that normally rank on Modal, you can request a Brev B200 Nsight Compute run without changing your `submission.py`: +Quick QR v2 example: ```bash -POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn-cli submission.py --profile-brev +curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py +export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run +popcorn submit submission.py --leaderboard qr_v2 --profile-brev --benchmark-index 0 --no-tui ``` -`--profile-brev` requires `POPCORN_BREV_PROFILER_URL` or `BREV_PROFILER_URL`. -Do not point it at a public shared service until the Brev worker runs -untrusted submissions in a per-job container or equivalent locked-down -environment with no SSH keys, operator secrets, or other users' submissions -mounted. 
- The CLI downloads and extracts the `.ncu-rep` file, prints a clickable terminal link to the report, and ends with a macOS command that opens it in Nsight Compute: @@ -122,11 +119,11 @@ popcorn submit solution.py # Direct submission with all options popcorn submit --leaderboard grayscale_v2 --gpu A100 --mode leaderboard solution.py -# Nsight Compute profile on the GPU Mode Brev B200 -POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn submit --leaderboard grayscale_v2 --profile-brev solution.py +# Nsight Compute profile on the hosted GPU Mode B200 profiler +POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run popcorn submit --leaderboard qr_v2 --profile-brev solution.py -# Profile one benchmark shape, useful for quick QR demos -POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn submit --leaderboard qr --profile-brev --benchmark-index 0 solution.py +# Profile one QR v2 benchmark shape +POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run popcorn submit --leaderboard qr_v2 --profile-brev --benchmark-index 0 solution.py # Plain output mode (no TUI, good for CI/scripts) popcorn submit --no-tui --leaderboard grayscale_v2 --gpu A100 --mode test solution.py diff --git a/docs/linalg-qr-b200.md b/docs/linalg-qr-b200.md index c63719c..258aea2 100644 --- a/docs/linalg-qr-b200.md +++ b/docs/linalg-qr-b200.md @@ -1,4 +1,4 @@ -# Submit To The Linear Algebra QR Competition +# Submit To The Linear Algebra QR v2 Competition First install and register Popcorn: @@ -7,7 +7,7 @@ curl -fsSL https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/install.s popcorn register discord ``` -Get the starter B200 QR submission: +Get the starter B200 QR v2 submission: ```bash curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py @@ -19,6 +19,17 @@ Run a correctness test: popcorn submit --leaderboard qr_v2 --gpu B200 --mode test submission.py ``` +Profile the first 
benchmark shape with Nsight Compute: + +```bash +export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run +popcorn submit --leaderboard qr_v2 --profile-brev --benchmark-index 0 submission.py +``` + +The CLI downloads a `.zip`, extracts `profile.ncu-rep`, and prints an +`open -a "NVIDIA Nsight Compute" ...` command. See +[profiling.md](profiling.md) for the complete QR v2 profiling flow. + Submit to the leaderboard: ```bash diff --git a/docs/profiling.md b/docs/profiling.md index f8c7c78..ac0762d 100644 --- a/docs/profiling.md +++ b/docs/profiling.md @@ -1,108 +1,91 @@ -# Nsight Compute Profiling +# QR v2 Nsight Compute Profiling -Profile your kernels directly from the CLI and get detailed Nsight Compute metrics. This is particularly useful for the NVIDIA NVFP4 Blackwell competition where you need to optimize tensor core utilization. +This profiles the GPU Mode QR v2 problem from `reference-kernels` and downloads +an Nsight Compute `.ncu-rep` report that you can open locally. -**Note:** Modal does not expose NCU. For Modal-ranked competitions, use the Brev-backed B200 profiler below. - -## Quick Start - -```bash -popcorn-cli submit submission.py --leaderboard nvfp4_dual_gemm --gpu NVIDIA --mode profile --no-tui -``` - -For competitions whose ranked runs use Modal, use the Brev-backed B200 profiler: +## 1. Install and Register ```bash -POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn-cli submission.py --profile-brev +curl -fsSL https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/install.sh | bash +popcorn register discord ``` -`--profile-brev` requires `POPCORN_BREV_PROFILER_URL` or `BREV_PROFILER_URL`. -The endpoint should be a local/staging profiler or a hardened shared service. -Do not expose a shared Brev profiler to untrusted users until submissions run -in a per-job container or equivalent locked-down environment with no SSH keys, -operator secrets, or other users' submissions mounted. 
+Restart your terminal if `popcorn` is not found after installation. -This uses the `#!POPCORN leaderboard ...` directive in `submission.py`. If the file does not include a leaderboard directive, pass one explicitly: +## 2. Get the QR v2 Starter Submission ```bash -POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn-cli submit submission.py --leaderboard grayscale_v2 --profile-brev +mkdir -p qr-v2-profile +cd qr-v2-profile +curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py ``` -For a quick single-shape QR profile: +The profiler uses the hosted GPU Mode NCU service: ```bash -POPCORN_BREV_PROFILER_URL=http://127.0.0.1:8765 popcorn-cli submit submission.py --leaderboard qr --profile-brev --benchmark-index 0 +export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run ``` -## Expected Output +## 3. Profile One QR v2 Shape -The profiler returns three key metric tables for each benchmark: +This profiles `benchmarks[0]` from +`reference-kernels/problems/linalg/qr_v2/task.yml`: -**GPU Throughput** - Overall utilization: -``` -Metric Name Metric Unit Metric Value ----------------- ----------- ------------ -Memory [%] % 32.48 -Compute (SM) [%] % 13.23 +```bash +popcorn submit submission.py \ + --leaderboard qr_v2 \ + --profile-brev \ + --benchmark-index 0 \ + --no-tui ``` -**Pipe Utilization** - Which pipelines are active: -``` -Metric Name Metric Unit Metric Value --------------------- ----------- ------------ -TC % 16.67 -TMEM (Tensor Memory) % 15.27 -Tensor (FP) % 12.58 -ALU % 2.38 -TMA % 0.29 -``` +The first QR v2 benchmark shape is: -**Warp State** - Where your warps are stalling: -``` -Metric Name Metric Unit Metric Value ------------------------- ----------- ------------ -Stall Long Scoreboard inst 18.31 -Stall Wait inst 1.88 -Stall Short Scoreboard inst 1.23 -Selected inst 1.00 -Stall Barrier inst 0.75 +```text +batch: 20; n: 32; cond: 1; seed: 43214 ``` -## Trace Files +## 
4. Open the Report -After profiling, a zip file is saved to your current directory and the `.ncu-rep` -file is extracted next to it: -``` +After the run finishes, the CLI downloads and extracts files like: + +```text profile.0-batch-20-n-32-cond-1-seed-43214.zip profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep ``` -This contains a `.ncu-rep` file (the full Nsight Compute report): -``` -$ unzip -l profile.0-batch-20-n-32-cond-1-seed-43214.zip - Length Date Time Name ---------- ---------- ----- ---- - 2178383 01-13-2026 03:10 profile.ncu-rep +The last line printed by the CLI opens the report on macOS: + +```bash +open -a "NVIDIA Nsight Compute" 'profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep' ``` -The CLI prints a clickable terminal link to the extracted report and makes the -last line a macOS command that opens it in Nsight Compute: +## Profile All QR v2 Benchmark Shapes + +Omit `--benchmark-index`: + ```bash -open -a "NVIDIA Nsight Compute" profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep +popcorn submit submission.py \ + --leaderboard qr_v2 \ + --profile-brev \ + --no-tui ``` -## Operator Notes +This profiles every entry in the `benchmarks:` list in QR v2 `task.yml`, not +the `tests:` list. It will produce one zip and one extracted `.ncu-rep` per +benchmark shape. + +## Normal Submit Commands -The CLI does not assume a Brev provider username or home directory. Configure -the profiler service with explicit paths, or derive them from `$HOME` on the -Brev machine, instead of hardcoding paths such as `/home/`. +For correctness testing: -For SSH access, prefer a dedicated restricted SSH key for the profiler proxy. -If you use the Brev CLI to maintain host metadata, run `brev refresh` once and -then use normal `ssh`/`scp` against the refreshed host alias. Avoid putting -`brev shell` or `brev copy` in per-job paths because they refresh each time. 
+```bash +popcorn submit submission.py --leaderboard qr_v2 --gpu B200 --mode test --no-tui +``` + +For leaderboard submission: + +```bash +popcorn submit submission.py --leaderboard qr_v2 --gpu B200 --mode leaderboard --no-tui +``` -The Brev worker should run each untrusted `submission.py` inside a container or -similarly isolated runtime before a public profiler endpoint is enabled. -Container isolation is not a complete sandbox, but it materially reduces the -risk of submissions reading host secrets, SSH keys, or other submissions.