diff --git a/Cargo.lock b/Cargo.lock index bf83f27..3f6d376 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,17 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -88,6 +99,15 @@ version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -142,11 +162,26 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bumpalo" -version = "3.17.0" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + +[[package]] +name = "byteorder" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = 
"bytes" @@ -154,6 +189,25 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cassowary" version = "0.3.0" @@ -175,6 +229,8 @@ version = "1.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -209,6 +265,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "clap" version = "4.5.36" @@ -278,6 +344,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "core-foundation" version = "0.9.4" @@ -294,6 +366,45 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crossterm" version = "0.27.0" @@ -319,6 +430,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "ctrlc" version = "3.4.6" @@ -329,6 +450,40 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "deflate64" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2" + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" + +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + [[package]] name = "dirs" version = "5.0.1" @@ -398,6 +553,16 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -471,6 +636,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -489,9 +664,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasi 0.14.2+wasi-0.2.4", + "wasm-bindgen", ] [[package]] @@ -536,6 +713,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "home" version = "0.5.11" 
@@ -790,6 +976,15 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -837,7 +1032,7 @@ dependencies = [ "combine", "jni-sys", "log", - "thiserror", + "thiserror 1.0.69", "walkdir", "windows-sys 0.45.0", ] @@ -848,6 +1043,16 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.2", + "libc", +] + [[package]] name = "js-sys" version = "0.3.77" @@ -898,9 +1103,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.27" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad" [[package]] name = "lru" @@ -911,6 +1116,27 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + "byteorder", + "crc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "malloc_buf" version = "0.0.6" @@ -949,6 +1175,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -992,6 +1219,12 @@ dependencies = [ "libc", ] +[[package]] +name = "num-conv" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441" + [[package]] name = "num-traits" version = "0.2.19" @@ -1060,6 +1293,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -1078,6 +1321,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + [[package]] name = "popcorn-cli" version = "0.0.0-dev" @@ -1101,8 +1350,15 @@ dependencies = [ "tokio", "urlencoding", "webbrowser", + "zip", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "proc-macro2" version = "1.0.94" @@ -1170,7 +1426,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1318,18 +1574,28 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.219" +version = 
"1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -1373,6 +1639,17 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1409,6 +1686,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + [[package]] name = "slab" version = "0.4.9" @@ -1484,6 +1767,12 @@ dependencies = [ "syn", ] +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.100" @@ -1552,7 +1841,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" 
dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", ] [[package]] @@ -1566,6 +1864,36 @@ dependencies = [ "syn", ] +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "time" +version = "0.3.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85c17d80feb7334b40c484e45ed1a5273dfd8bfda537c3be2e74a06a6686f327" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde_core", + "time-core", +] + +[[package]] +name = "time-core" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" + [[package]] name = "tinystr" version = "0.7.6" @@ -1659,6 +1987,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + [[package]] name = "unicase" version = "2.8.1" @@ -1741,6 +2075,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + 
[[package]] name = "walkdir" version = "2.5.0" @@ -2214,6 +2554,15 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.7.5" @@ -2259,6 +2608,26 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zeroize" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c50655cbb0fe3fc43170059e702f1ce5e19b84cec58dc87b037a09935c2f328" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zerovec" version = "0.10.4" @@ -2280,3 +2649,73 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zip" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +dependencies = [ + "aes", + "arbitrary", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "deflate64", + "displaydoc", + "flate2", + "getrandom 0.3.2", + "hmac", + "indexmap", + "lzma-rs", + "memchr", + "pbkdf2", + "sha1", + "thiserror 2.0.18", + "time", + "xz2", + "zeroize", + "zopfli", + "zstd", +] + +[[package]] +name = "zopfli" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", +] + +[[package]] +name = "zstd" +version = 
"0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index d8d6c05..8780959 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ chrono = "0.4" urlencoding = "2.1.3" bytes = "1.11.1" futures-util = "0.3.31" +zip = "2.2.2" [dev-dependencies] tempfile = "3.10" diff --git a/README.md b/README.md index 12e13f2..f5a597c 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,26 @@ A command-line interface tool for submitting solutions to the [gpumode.com](http Tested on linux and mac but should just work on Windows as well. -## New: Nsight Compute Profiling +## New: QR v2 Nsight Compute Profiling -Profile your kernels with `--mode profile` and get detailed metrics. Currently only available for the NVFP4 Blackwell competition (Modal, which we use for other competitions, does not support NCU). See [docs/profiling.md](docs/profiling.md) for details. +Profile QR v2 submissions on the hosted GPU Mode B200 Nsight Compute service. +See [docs/profiling.md](docs/profiling.md) for a complete copy-paste flow. 
+ +Quick QR v2 example: + +```bash +curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py +export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run +popcorn submit submission.py --leaderboard qr_v2 --profile-brev --benchmark-index 0 --no-tui +``` + +The CLI downloads and extracts the `.ncu-rep` file, prints a clickable terminal +link to the report, and ends with a macOS command that opens it in Nsight +Compute: + +```bash +open -a "NVIDIA Nsight Compute" profile.0-.../profile.ncu-rep +``` ## [NEW] Submit To The Linear Algebra Competition @@ -102,6 +119,12 @@ popcorn submit solution.py # Direct submission with all options popcorn submit --leaderboard grayscale_v2 --gpu A100 --mode leaderboard solution.py +# Nsight Compute profile on the hosted GPU Mode B200 profiler +POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run popcorn submit --leaderboard qr_v2 --profile-brev solution.py + +# Profile one QR v2 benchmark shape +POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run popcorn submit --leaderboard qr_v2 --profile-brev --benchmark-index 0 solution.py + # Plain output mode (no TUI, good for CI/scripts) popcorn submit --no-tui --leaderboard grayscale_v2 --gpu A100 --mode test solution.py diff --git a/docs/linalg-qr-b200.md b/docs/linalg-qr-b200.md index c63719c..258aea2 100644 --- a/docs/linalg-qr-b200.md +++ b/docs/linalg-qr-b200.md @@ -1,4 +1,4 @@ -# Submit To The Linear Algebra QR Competition +# Submit To The Linear Algebra QR v2 Competition First install and register Popcorn: @@ -7,7 +7,7 @@ curl -fsSL https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/install.s popcorn register discord ``` -Get the starter B200 QR submission: +Get the starter B200 QR v2 submission: ```bash curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py @@ -19,6 +19,17 @@ Run a 
correctness test: popcorn submit --leaderboard qr_v2 --gpu B200 --mode test submission.py ``` +Profile the first benchmark shape with Nsight Compute: + +```bash +export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run +popcorn submit --leaderboard qr_v2 --profile-brev --benchmark-index 0 submission.py +``` + +The CLI downloads a `.zip`, extracts `profile.ncu-rep`, and prints an +`open -a "NVIDIA Nsight Compute" ...` command. See +[profiling.md](profiling.md) for the complete QR v2 profiling flow. + Submit to the leaderboard: ```bash diff --git a/docs/profiling.md b/docs/profiling.md index 45601c6..ac0762d 100644 --- a/docs/profiling.md +++ b/docs/profiling.md @@ -1,65 +1,91 @@ -# Nsight Compute Profiling +# QR v2 Nsight Compute Profiling -Profile your kernels directly from the CLI and get detailed Nsight Compute metrics. This is particularly useful for the NVIDIA NVFP4 Blackwell competition where you need to optimize tensor core utilization. +This profiles the GPU Mode QR v2 problem from `reference-kernels` and downloads +an Nsight Compute `.ncu-rep` report that you can open locally. -**Note:** Profiling is currently only available for the NVFP4 Blackwell competition. Modal, which we use for other competitions, does not support NCU. - -## Quick Start +## 1. Install and Register ```bash -popcorn-cli submit submission.py --leaderboard nvfp4_dual_gemm --gpu NVIDIA --mode profile --no-tui +curl -fsSL https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/install.sh | bash +popcorn register discord ``` -## Expected Output +Restart your terminal if `popcorn` is not found after installation. -The profiler returns three key metric tables for each benchmark: +## 2. 
Get the QR v2 Starter Submission -**GPU Throughput** - Overall utilization: -``` -Metric Name Metric Unit Metric Value ----------------- ----------- ------------ -Memory [%] % 32.48 -Compute (SM) [%] % 13.23 +```bash +mkdir -p qr-v2-profile +cd qr-v2-profile +curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py ``` -**Pipe Utilization** - Which pipelines are active: -``` -Metric Name Metric Unit Metric Value --------------------- ----------- ------------ -TC % 16.67 -TMEM (Tensor Memory) % 15.27 -Tensor (FP) % 12.58 -ALU % 2.38 -TMA % 0.29 +The profiler uses the hosted GPU Mode NCU service: + +```bash +export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run ``` -**Warp State** - Where your warps are stalling: +## 3. Profile One QR v2 Shape + +This profiles `benchmarks[0]` from +`reference-kernels/problems/linalg/qr_v2/task.yml`: + +```bash +popcorn submit submission.py \ + --leaderboard qr_v2 \ + --profile-brev \ + --benchmark-index 0 \ + --no-tui ``` -Metric Name Metric Unit Metric Value ------------------------- ----------- ------------ -Stall Long Scoreboard inst 18.31 -Stall Wait inst 1.88 -Stall Short Scoreboard inst 1.23 -Selected inst 1.00 -Stall Barrier inst 0.75 + +The first QR v2 benchmark shape is: + +```text +batch: 20; n: 32; cond: 1; seed: 43214 ``` -## Trace Files +## 4. 
Open the Report + +After the run finishes, the CLI downloads and extracts files like: -After profiling, a zip file is saved to your current directory: +```text +profile.0-batch-20-n-32-cond-1-seed-43214.zip +profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep ``` -profile_20260113_031052_result0_profile0.zip + +The last line printed by the CLI is the command that opens the report on macOS: + +```bash +open -a "NVIDIA Nsight Compute" 'profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep' ``` -This contains a `.ncu-rep` file (the full Nsight Compute report): +## Profile All QR v2 Benchmark Shapes + +Omit `--benchmark-index`: + +```bash +popcorn submit submission.py \ + --leaderboard qr_v2 \ + --profile-brev \ + --no-tui ``` -$ unzip -l profile_20260113_031052_result0_profile0.zip - Length Date Time Name ---------- ---------- ----- ---- - 2178383 01-13-2026 03:10 profile.ncu-rep + +This profiles every entry in the `benchmarks:` list in QR v2 `task.yml`, not +the `tests:` list. It will produce one zip and one extracted `.ncu-rep` per +benchmark shape. 
+ +## Normal Submit Commands + +For correctness testing: + +```bash +popcorn submit submission.py --leaderboard qr_v2 --gpu B200 --mode test --no-tui ``` -You can open this file in the Nsight Compute GUI for detailed analysis: +For leaderboard submission: + ```bash -ncu-ui profile.ncu-rep +popcorn submit submission.py --leaderboard qr_v2 --gpu B200 --mode leaderboard --no-tui ``` + diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index 21b72bb..a2b7a92 100644 --- a/src/cmd/mod.rs +++ b/src/cmd/mod.rs @@ -62,6 +62,14 @@ pub struct Cli { #[arg(long)] pub mode: Option, + /// Profile on the GPU Mode Brev B200 and save the Nsight Compute trace locally + #[arg(long)] + pub profile_brev: bool, + + /// Optional: Profile a single benchmark index when using --profile-brev + #[arg(long)] + pub benchmark_index: Option, + // Optional: Specify output file #[arg(short, long)] pub output: Option, @@ -137,6 +145,14 @@ enum Commands { #[arg(long)] mode: Option, + /// Profile on the GPU Mode Brev B200 and save the Nsight Compute trace locally + #[arg(long)] + profile_brev: bool, + + /// Optional: Profile a single benchmark index when using --profile-brev + #[arg(long)] + benchmark_index: Option, + // Optional: Specify output file #[arg(short, long)] output: Option, @@ -184,6 +200,8 @@ pub async fn execute(cli: Cli) -> Result<()> { gpu, leaderboard, mode, + profile_brev, + benchmark_index, output, no_tui, }) => { @@ -198,23 +216,34 @@ pub async fn execute(cli: Cli) -> Result<()> { // Use filepath from Submit command first, fallback to top-level filepath let final_filepath = filepath.or(cli.filepath); + let final_gpu = if profile_brev { + Some("B200_Brev".to_string()) + } else { + gpu + }; + let final_mode = if profile_brev { + Some("profile".to_string()) + } else { + mode + }; - if no_tui { + if no_tui || profile_brev { submit::run_submit_plain( final_filepath, // Resolved filepath - gpu, // From Submit command + final_gpu, // From Submit command leaderboard, // From Submit command - 
mode, // From Submit command + final_mode, // From Submit command cli_id, + benchmark_index.or(cli.benchmark_index), output, // From Submit command ) .await } else { submit::run_submit_tui( final_filepath, // Resolved filepath - gpu, // From Submit command + final_gpu, // From Submit command leaderboard, // From Submit command - mode, // From Submit command + final_mode, // From Submit command cli_id, output, // From Submit command ) @@ -269,7 +298,9 @@ pub async fn execute(cli: Cli) -> Result<()> { } None => { // Check if any of the submission-related flags were used at the top level - if cli.gpu.is_some() || cli.leaderboard.is_some() || cli.mode.is_some() { + if !cli.profile_brev + && (cli.gpu.is_some() || cli.leaderboard.is_some() || cli.mode.is_some()) + { return Err(anyhow!( "Please use the 'submit' subcommand when specifying submission options:\n\ popcorn-cli submit [--gpu GPU] [--leaderboard LEADERBOARD] [--mode MODE] FILEPATH" @@ -287,16 +318,29 @@ pub async fn execute(cli: Cli) -> Result<()> { ) })?; - // Run TUI with only filepath, no other options - submit::run_submit_tui( - Some(top_level_filepath), - None, // No GPU option - None, // No leaderboard option - None, // No mode option - cli_id, - None, // No output option - ) - .await + if cli.profile_brev { + submit::run_submit_plain( + Some(top_level_filepath), + Some("B200_Brev".to_string()), + cli.leaderboard, + Some("profile".to_string()), + cli_id, + cli.benchmark_index, + cli.output, + ) + .await + } else { + // Run TUI with only filepath, no other options + submit::run_submit_tui( + Some(top_level_filepath), + None, // No GPU option + None, // No leaderboard option + None, // No mode option + cli_id, + None, // No output option + ) + .await + } } else { Err(anyhow!( "No command or submission file specified. Use --help for usage." 
diff --git a/src/cmd/submit.rs b/src/cmd/submit.rs index ec8e6fb..29364f6 100644 --- a/src/cmd/submit.rs +++ b/src/cmd/submit.rs @@ -9,6 +9,7 @@ use ratatui::prelude::*; use ratatui::style::{Color, Style, Stylize}; use ratatui::text::{Line, Span}; use ratatui::widgets::{Block, Borders, List, ListItem, ListState}; +use serde_json::Value; use tokio::task::JoinHandle; use crate::models::{AppState, GpuItem, LeaderboardItem, SubmissionModeItem}; @@ -686,6 +687,7 @@ pub async fn run_submit_plain( leaderboard: Option, mode: Option, cli_id: String, + benchmark_index: Option, output: Option, ) -> Result<()> { let file_to_submit = match filepath { @@ -747,18 +749,34 @@ pub async fn run_submit_plain( // Create client and submit let client = service::create_client(Some(cli_id))?; - let result = service::submit_solution( - &client, - &file_to_submit, - &file_content, - &final_leaderboard, - &final_gpu, - &final_mode, - Some(Box::new(|msg| { - eprintln!("{}", msg); - })), - ) - .await?; + let result = if final_mode.eq_ignore_ascii_case("profile") + && final_gpu.eq_ignore_ascii_case("B200_Brev") + { + service::profile_brev_solution( + &client, + &file_to_submit, + &file_content, + &final_leaderboard, + benchmark_index, + Some(Box::new(|msg| { + eprintln!("{}", msg); + })), + ) + .await? + } else { + service::submit_solution( + &client, + &file_to_submit, + &file_content, + &final_leaderboard, + &final_gpu, + &final_mode, + Some(Box::new(|msg| { + eprintln!("{}", msg); + })), + ) + .await? 
+ }; // Clean up the result text let trimmed = result.trim(); @@ -768,6 +786,8 @@ pub async fn run_submit_plain( trimmed }; + let profile_report_links = profile_report_links(content); + let content = content.replace("\\n", "\n"); // Write to file if output is specified @@ -783,6 +803,73 @@ pub async fn run_submit_plain( // Print to stdout println!("\n{}", content); + print_profile_report_links(profile_report_links); Ok(()) } + +#[derive(Debug)] +struct ProfileReportLink { + file_url: String, + label: String, + open_command: String, +} + +fn profile_report_links(content: &str) -> Vec { + let Ok(result) = serde_json::from_str::(content) else { + return Vec::new(); + }; + let Some(artifacts) = result + .get("downloaded_artifacts") + .and_then(|value| value.as_array()) + else { + return Vec::new(); + }; + + let mut links = Vec::new(); + for artifact in artifacts { + let Some(reports) = artifact.get("reports").and_then(|value| value.as_array()) else { + continue; + }; + for report in reports { + let Some(file_url) = report.get("file_url").and_then(|value| value.as_str()) else { + continue; + }; + let label = report + .get("path") + .and_then(|value| value.as_str()) + .unwrap_or("profile.ncu-rep"); + let open_command = report + .get("open_command") + .and_then(|value| value.as_str()) + .map(ToOwned::to_owned) + .unwrap_or_else(|| { + format!("open -a \"NVIDIA Nsight Compute\" {}", shell_quote(label)) + }); + links.push(ProfileReportLink { + file_url: file_url.to_string(), + label: label.to_string(), + open_command, + }); + } + } + links +} + +fn print_profile_report_links(links: Vec) { + for link in links { + println!( + "\nOpen in Nsight Compute: {}", + terminal_link(&link.file_url, &link.label) + ); + println!("{}", link.open_command); + } +} + +fn shell_quote(value: &str) -> String { + format!("'{}'", value.replace('\'', "'\\''")) +} + +fn terminal_link(url: &str, label: &str) -> String { + format!("\x1b]8;;{}\x1b\\{}\x1b]8;;\x1b\\", url, label) +} diff --git 
a/src/service/mod.rs b/src/service/mod.rs index 07a43d2..2ecb96a 100644 --- a/src/service/mod.rs +++ b/src/service/mod.rs @@ -6,10 +6,13 @@ use reqwest::multipart::{Form, Part}; use reqwest::Client; use serde_json::Value; use std::env; -use std::path::Path; +use std::fs::File as StdFile; +use std::io::Cursor; +use std::path::{Path, PathBuf}; use std::time::Duration; use tokio::io::AsyncWriteExt; use tokio::time::sleep; +use zip::ZipArchive; use crate::models::{ GpuItem, LeaderboardItem, SubmissionDetails, SubmissionJobStatus, SubmissionRun, @@ -602,6 +605,310 @@ pub async fn submit_solution>( .await } +pub async fn profile_brev_solution>( + client: &Client, + filepath: P, + file_content: &[u8], + leaderboard: &str, + benchmark_index: Option, + on_log: Option>, +) -> Result { + let base_url = env::var("POPCORN_BREV_PROFILER_URL") + .or_else(|_| env::var("BREV_PROFILER_URL")) + .map_err(|_| { + anyhow!( + "POPCORN_BREV_PROFILER_URL or BREV_PROFILER_URL is not set. Configure a hardened Brev profiler endpoint before using --profile-brev." + ) + })?; + let base_url = base_url.trim_end_matches('/'); + + let filename = filepath + .as_ref() + .file_name() + .ok_or_else(|| anyhow!("Invalid filepath"))? + .to_string_lossy(); + + let part = Part::bytes(file_content.to_vec()).file_name(filename.to_string()); + let mut form = Form::new() + .part("file", part) + .text("leaderboard", leaderboard.to_string()); + if let Some(index) = benchmark_index { + form = form.text("benchmark_index", index.to_string()); + } + + let resp = client + .post(format!("{}/profile", base_url)) + .multipart(form) + .timeout(Duration::from_secs(60)) + .send() + .await?; + + let status = resp.status(); + if !status.is_success() { + return Err(anyhow!( + "Profiler returned status {}: {}", + status, + response_error_text(resp).await? 
+ )); + } + + let accepted: Value = resp.json().await?; + let job_id = accepted + .get("job_id") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Profiler did not return a job_id"))? + .to_string(); + + if let Some(ref cb) = on_log { + cb(format!( + "Profile job {} accepted. Waiting for results...", + job_id + )); + } + + let mut elapsed = 0; + loop { + let resp = match client + .get(format!("{}/jobs/{}", base_url, job_id)) + .timeout(Duration::from_secs(30)) + .send() + .await + { + Ok(resp) => resp, + Err(err) => { + if elapsed >= SUBMISSION_POLL_TIMEOUT_SECONDS { + return Err(err.into()); + } + if let Some(ref cb) = on_log { + cb(format!( + "Profile job {} status poll failed: {}. Retrying...", + job_id, err + )); + } + sleep(Duration::from_secs(SUBMISSION_POLL_INTERVAL_SECONDS)).await; + elapsed += SUBMISSION_POLL_INTERVAL_SECONDS; + continue; + } + }; + + let status = resp.status(); + if !status.is_success() { + return Err(anyhow!( + "Profiler status returned {}: {}", + status, + response_error_text(resp).await? 
+ )); + } + + let job: Value = resp.json().await?; + let job_status = job + .get("status") + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + let queue_position = job.get("queue_position").and_then(|v| v.as_i64()); + + if let Some(ref cb) = on_log { + match queue_position { + Some(pos) => cb(format!( + "Profile job {} status: {} (queue position {}, {}s)", + job_id, job_status, pos, elapsed + )), + None => cb(format!( + "Profile job {} status: {} ({}s)", + job_id, job_status, elapsed + )), + } + } + + match job_status { + "succeeded" => { + let artifacts = download_profile_artifacts(client, base_url, &job).await?; + let mut result = job; + result["downloaded_artifacts"] = Value::Array( + artifacts + .iter() + .map(|artifact| artifact.to_json()) + .collect(), + ); + return serde_json::to_string_pretty(&result) + .map_err(|e| anyhow!("Failed to format profile result: {}", e)); + } + "failed" | "timed_out" => { + let error = job + .get("error") + .and_then(|v| v.as_str()) + .unwrap_or("No error details were provided"); + return Err(anyhow!("Profile job {} {}: {}", job_id, job_status, error)); + } + _ => {} + } + + if elapsed >= SUBMISSION_POLL_TIMEOUT_SECONDS { + return Err(anyhow!( + "Timed out waiting for profile job {} after {} seconds", + job_id, + SUBMISSION_POLL_TIMEOUT_SECONDS + )); + } + + sleep(Duration::from_secs(SUBMISSION_POLL_INTERVAL_SECONDS)).await; + elapsed += SUBMISSION_POLL_INTERVAL_SECONDS; + } +} + +#[derive(Debug)] +struct DownloadedProfileArtifact { + zip_path: PathBuf, + reports: Vec, +} + +impl DownloadedProfileArtifact { + fn to_json(&self) -> Value { + let reports: Vec = self + .reports + .iter() + .map(|path| { + serde_json::json!({ + "path": path.display().to_string(), + "file_url": file_url(path), + "open_command": format!( + "open -a \"NVIDIA Nsight Compute\" {}", + shell_quote(&path.display().to_string()) + ), + "ncu_ui_command": format!("ncu-ui {}", shell_quote(&path.display().to_string())), + }) + }) + .collect(); + + 
serde_json::json!({ + "zip_path": self.zip_path.display().to_string(), + "reports": reports, + }) + } +} + +async fn download_profile_artifacts( + client: &Client, + base_url: &str, + job: &Value, +) -> Result> { + let artifacts = job + .get("artifacts") + .and_then(|v| v.as_array()) + .ok_or_else(|| anyhow!("Profiler job did not include artifacts"))?; + + let mut saved = Vec::new(); + for artifact in artifacts { + let name = artifact + .get("name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Profiler artifact missing name"))?; + let url = artifact + .get("url") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Profiler artifact missing url"))?; + let artifact_url = if url.starts_with("http://") || url.starts_with("https://") { + url.to_string() + } else { + format!("{}{}", base_url, url) + }; + let bytes = client + .get(artifact_url) + .timeout(Duration::from_secs(120)) + .send() + .await? + .error_for_status()? + .bytes() + .await?; + let zip_path = PathBuf::from(name); + std::fs::write(&zip_path, bytes.as_ref()) + .map_err(|e| anyhow!("Failed to write profile artifact {}: {}", name, e))?; + let reports = extract_ncu_reports(&zip_path, bytes.as_ref())?; + saved.push(DownloadedProfileArtifact { zip_path, reports }); + } + Ok(saved) +} + +fn extract_ncu_reports(zip_path: &Path, bytes: &[u8]) -> Result> { + let mut archive = ZipArchive::new(Cursor::new(bytes)).map_err(|e| { + anyhow!( + "Failed to read profile artifact {}: {}", + zip_path.display(), + e + ) + })?; + let extract_dir = zip_path.with_extension(""); + std::fs::create_dir_all(&extract_dir).map_err(|e| { + anyhow!( + "Failed to create profile report directory {}: {}", + extract_dir.display(), + e + ) + })?; + + let mut reports = Vec::new(); + for idx in 0..archive.len() { + let mut entry = archive.by_index(idx).map_err(|e| { + anyhow!( + "Failed to read profile artifact entry in {}: {}", + zip_path.display(), + e + ) + })?; + if !entry.name().ends_with(".ncu-rep") { + continue; + } + + let 
file_name = Path::new(entry.name()) + .file_name() + .ok_or_else(|| anyhow!("Profile artifact contains an invalid report path"))?; + let mut report_path = extract_dir.join(file_name); + if report_path.exists() { + let stem = report_path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("profile"); + report_path = extract_dir.join(format!("{}-{}.ncu-rep", stem, idx)); + } + + let mut output = StdFile::create(&report_path) + .map_err(|e| anyhow!("Failed to create {}: {}", report_path.display(), e))?; + std::io::copy(&mut entry, &mut output) + .map_err(|e| anyhow!("Failed to extract {}: {}", report_path.display(), e))?; + reports.push(report_path); + } + Ok(reports) +} + +fn file_url(path: &Path) -> String { + let absolute = path + .canonicalize() + .unwrap_or_else(|_| path.to_path_buf()) + .display() + .to_string(); + format!( + "file://{}", + urlencoding::encode(&absolute).replace("%2F", "/") + ) +} + +fn shell_quote(value: &str) -> String { + format!("'{}'", value.replace('\'', "'\\''")) +} + +async fn response_error_text(resp: reqwest::Response) -> Result { + let text = resp.text().await?; + Ok(serde_json::from_str::(&text) + .ok() + .and_then(|v| { + v.get("detail") + .or_else(|| v.get("message")) + .and_then(|d| d.as_str()) + .map(str::to_string) + }) + .unwrap_or(text)) +} + async fn submit_solution_background>( client: &Client, filepath: P,