gpu-mode · msaroufim · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -24,6 +24,7 @@ chrono = "0.4"
 urlencoding = "2.1.3"
 bytes = "1.11.1"
 futures-util = "0.3.31"
+zip = "2.2.2"
 
 [dev-dependencies]
 tempfile = "3.10"
diff --git a/README.md b/README.md
@@ -5,9 +5,26 @@ A command-line interface tool for submitting solutions to the [gpumode.com](http
 
 Tested on linux and mac but should just work on Windows as well.
 
-## New: Nsight Compute Profiling
+## New: QR v2 Nsight Compute Profiling
 
-Profile your kernels with `--mode profile` and get detailed metrics. Currently only available for the NVFP4 Blackwell competition (Modal, which we use for other competitions, does not support NCU). See [docs/profiling.md](docs/profiling.md) for details.
+Profile QR v2 submissions on the hosted GPU Mode B200 Nsight Compute service.
+See [docs/profiling.md](docs/profiling.md) for a complete copy-paste flow.
+
+Quick QR v2 example:
+
+```bash
+curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py
+export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run
+popcorn submit submission.py --leaderboard qr_v2 --profile-brev --benchmark-index 0 --no-tui
+```
+
+The CLI downloads and extracts the `.ncu-rep` file, prints a clickable terminal
+link to the report, and ends with a macOS command that opens it in Nsight
+Compute:
+
+```bash
+open -a "NVIDIA Nsight Compute" profile.0-.../profile.ncu-rep
+```
 
 ## [NEW] Submit To The Linear Algebra Competition
 
@@ -102,6 +119,12 @@ popcorn submit solution.py
 # Direct submission with all options
 popcorn submit --leaderboard grayscale_v2 --gpu A100 --mode leaderboard solution.py
 
+# Nsight Compute profile on the hosted GPU Mode B200 profiler
+POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run popcorn submit --leaderboard qr_v2 --profile-brev solution.py
+
+# Profile one QR v2 benchmark shape
+POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run popcorn submit --leaderboard qr_v2 --profile-brev --benchmark-index 0 solution.py
+
 # Plain output mode (no TUI, good for CI/scripts)
 popcorn submit --no-tui --leaderboard grayscale_v2 --gpu A100 --mode test solution.py
 

diff --git a/docs/linalg-qr-b200.md b/docs/linalg-qr-b200.md
@@ -1,4 +1,4 @@
-# Submit To The Linear Algebra QR Competition
+# Submit To The Linear Algebra QR v2 Competition
 
 First install and register Popcorn:
 
@@ -7,7 +7,7 @@ curl -fsSL https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/install.s
 popcorn register discord
 ```
 
-Get the starter B200 QR submission:
+Get the starter B200 QR v2 submission:
 
 ```bash
 curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py
@@ -19,6 +19,17 @@ Run a correctness test:
 popcorn submit --leaderboard qr_v2 --gpu B200 --mode test submission.py
 ```
 
+Profile the first benchmark shape with Nsight Compute:
+
+```bash
+export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run
+popcorn submit --leaderboard qr_v2 --profile-brev --benchmark-index 0 submission.py
+```
+
+The CLI downloads a `.zip`, extracts `profile.ncu-rep`, and prints an
+`open -a "NVIDIA Nsight Compute" ...` command. See
+[profiling.md](profiling.md) for the complete QR v2 profiling flow.
+
 Submit to the leaderboard:
 
 ```bash

diff --git a/docs/profiling.md b/docs/profiling.md
@@ -1,65 +1,91 @@
-# Nsight Compute Profiling
+# QR v2 Nsight Compute Profiling
 
-Profile your kernels directly from the CLI and get detailed Nsight Compute metrics. This is particularly useful for the NVIDIA NVFP4 Blackwell competition where you need to optimize tensor core utilization.
+This guide walks through profiling the GPU Mode QR v2 problem from `reference-kernels`
+and downloading an Nsight Compute `.ncu-rep` report that you can open locally.
 
-**Note:** Profiling is currently only available for the NVFP4 Blackwell competition. Modal, which we use for other competitions, does not support NCU.
-
-## Quick Start
+## 1. Install and Register
 
 ```bash
-popcorn-cli submit submission.py --leaderboard nvfp4_dual_gemm --gpu NVIDIA --mode profile --no-tui
+curl -fsSL https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/install.sh | bash
+popcorn register discord
 ```
 
-## Expected Output
+Restart your terminal if `popcorn` is not found after installation.
 
-The profiler returns three key metric tables for each benchmark:
+## 2. Get the QR v2 Starter Submission
 
-**GPU Throughput** - Overall utilization:
-```
-Metric Name      Metric Unit Metric Value
----------------- ----------- ------------
-Memory [%]                 %        32.48
-Compute (SM) [%]           %        13.23
+```bash
+mkdir -p qr-v2-profile
+cd qr-v2-profile
+curl -O https://raw.githubusercontent.com/gpu-mode/reference-kernels/main/problems/linalg/qr_v2/submission.py
 ```
 
-**Pipe Utilization** - Which pipelines are active:
-```
-Metric Name          Metric Unit Metric Value
--------------------- ----------- ------------
-TC                             %        16.67
-TMEM (Tensor Memory)           %        15.27
-Tensor (FP)                    %        12.58
-ALU                            %         2.38
-TMA                            %         0.29
+The profiler uses the hosted GPU Mode NCU service:
+
+```bash
+export POPCORN_BREV_PROFILER_URL=https://http--brev-profiler-proxy--dxfjds728w5v.code.run
 ```
 
-**Warp State** - Where your warps are stalling:
+## 3. Profile One QR v2 Shape
+
+This profiles `benchmarks[0]` from
+`reference-kernels/problems/linalg/qr_v2/task.yml`:
+
+```bash
+popcorn submit submission.py \
+  --leaderboard qr_v2 \
+  --profile-brev \
+  --benchmark-index 0 \
+  --no-tui
 ```
-Metric Name              Metric Unit Metric Value
------------------------- ----------- ------------
-Stall Long Scoreboard           inst        18.31
-Stall Wait                      inst         1.88
-Stall Short Scoreboard          inst         1.23
-Selected                        inst         1.00
-Stall Barrier                   inst         0.75
+
+The first QR v2 benchmark shape is:
+
+```text
+batch: 20; n: 32; cond: 1; seed: 43214
 ```
 
-## Trace Files
+## 4. Open the Report
+
+After the run finishes, the CLI downloads and extracts files like:
 
-After profiling, a zip file is saved to your current directory:
+```text
+profile.0-batch-20-n-32-cond-1-seed-43214.zip
+profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep
 ```
-profile_20260113_031052_result0_profile0.zip
+
+The last line printed by the CLI is a command that opens the report on macOS:
+
+```bash
+open -a "NVIDIA Nsight Compute" 'profile.0-batch-20-n-32-cond-1-seed-43214/profile.ncu-rep'
 ```
 
-This contains a `.ncu-rep` file (the full Nsight Compute report):
+## Profile All QR v2 Benchmark Shapes
+
+Omit `--benchmark-index`:
+
+```bash
+popcorn submit submission.py \
+  --leaderboard qr_v2 \
+  --profile-brev \
+  --no-tui
 ```
-$ unzip -l profile_20260113_031052_result0_profile0.zip
-  Length      Date    Time    Name
----------  ---------- -----   ----
-  2178383  01-13-2026 03:10   profile.ncu-rep
+
+This profiles every entry in the `benchmarks:` list in QR v2 `task.yml`, not
+the `tests:` list. It will produce one zip and one extracted `.ncu-rep` per
+benchmark shape.
+
+## Normal Submit Commands
+
+For correctness testing:
+
+```bash
+popcorn submit submission.py --leaderboard qr_v2 --gpu B200 --mode test --no-tui
 ```
 
-You can open this file in the Nsight Compute GUI for detailed analysis:
+For leaderboard submission:
+
 ```bash
-ncu-ui profile.ncu-rep
+popcorn submit submission.py --leaderboard qr_v2 --gpu B200 --mode leaderboard --no-tui
 ```
+
diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs
@@ -62,6 +62,14 @@ pub struct Cli {
     #[arg(long)]
     pub mode: Option<String>,
 
+    /// Profile on the GPU Mode Brev B200 and save the Nsight Compute trace locally
+    #[arg(long)]
+    pub profile_brev: bool,
+
+    /// Optional: Profile a single benchmark index when using --profile-brev
+    #[arg(long)]
+    pub benchmark_index: Option<usize>,
+
     // Optional: Specify output file
     #[arg(short, long)]
     pub output: Option<String>,
@@ -137,6 +145,14 @@ enum Commands {
         #[arg(long)]
         mode: Option<String>,
 
+        /// Profile on the GPU Mode Brev B200 and save the Nsight Compute trace locally
+        #[arg(long)]
+        profile_brev: bool,
+
+        /// Optional: Profile a single benchmark index when using --profile-brev
+        #[arg(long)]
+        benchmark_index: Option<usize>,
+
         // Optional: Specify output file
         #[arg(short, long)]
         output: Option<String>,
@@ -184,6 +200,8 @@ pub async fn execute(cli: Cli) -> Result<()> {
             gpu,
             leaderboard,
             mode,
+            profile_brev,
+            benchmark_index,
             output,
             no_tui,
         }) => {
@@ -198,23 +216,34 @@ pub async fn execute(cli: Cli) -> Result<()> {
 
             // Use filepath from Submit command first, fallback to top-level filepath
             let final_filepath = filepath.or(cli.filepath);
+            let final_gpu = if profile_brev {
+                Some("B200_Brev".to_string())
+            } else {
+                gpu
+            };
+            let final_mode = if profile_brev {
+                Some("profile".to_string())
+            } else {
+                mode
+            };
 
-            if no_tui {
+            if no_tui || profile_brev {
                 submit::run_submit_plain(
                     final_filepath, // Resolved filepath
-                    gpu,            // From Submit command
+                    final_gpu,      // Submit's --gpu, or "B200_Brev" when --profile-brev is set
                     leaderboard,    // From Submit command
-                    mode,           // From Submit command
+                    final_mode,     // Submit's --mode, or "profile" when --profile-brev is set
                     cli_id,
+                    benchmark_index.or(cli.benchmark_index),
                     output, // From Submit command
                 )
                 .await
             } else {
                 submit::run_submit_tui(
                     final_filepath, // Resolved filepath
-                    gpu,            // From Submit command
+                    final_gpu,      // From Submit command
                     leaderboard,    // From Submit command
-                    mode,           // From Submit command
+                    final_mode,     // From Submit command
                     cli_id,
                     output, // From Submit command
                 )
@@ -269,7 +298,9 @@ pub async fn execute(cli: Cli) -> Result<()> {
         }
         None => {
             // Check if any of the submission-related flags were used at the top level
-            if cli.gpu.is_some() || cli.leaderboard.is_some() || cli.mode.is_some() {
+            if !cli.profile_brev
+                && (cli.gpu.is_some() || cli.leaderboard.is_some() || cli.mode.is_some())
+            {
                 return Err(anyhow!(
                     "Please use the 'submit' subcommand when specifying submission options:\n\
                     popcorn-cli submit [--gpu GPU] [--leaderboard LEADERBOARD] [--mode MODE] FILEPATH"
@@ -287,16 +318,29 @@ pub async fn execute(cli: Cli) -> Result<()> {
                     )
                 })?;
 
-                // Run TUI with only filepath, no other options
-                submit::run_submit_tui(
-                    Some(top_level_filepath),
-                    None, // No GPU option
-                    None, // No leaderboard option
-                    None, // No mode option
-                    cli_id,
-                    None, // No output option
-                )
-                .await
+                if cli.profile_brev {
+                    submit::run_submit_plain(
+                        Some(top_level_filepath),
+                        Some("B200_Brev".to_string()),
+                        cli.leaderboard,
+                        Some("profile".to_string()),
+                        cli_id,
+                        cli.benchmark_index,
+                        cli.output,
+                    )
+                    .await
+                } else {
+                    // Run TUI with only filepath, no other options
+                    submit::run_submit_tui(
+                        Some(top_level_filepath),
+                        None, // No GPU option
+                        None, // No leaderboard option
+                        None, // No mode option
+                        cli_id,
+                        None, // No output option
+                    )
+                    .await
+                }
             } else {
                 Err(anyhow!(
                     "No command or submission file specified. Use --help for usage."