diff --git a/results/research/k3_gpu_beta_e2e_memory_context.json b/results/research/k3_gpu_beta_e2e_memory_context.json new file mode 100644 index 00000000..af665fb5 --- /dev/null +++ b/results/research/k3_gpu_beta_e2e_memory_context.json @@ -0,0 +1,92 @@ +{ + "kind": "k3_e2e_gpu_bench", + "config": { + "verifier_id": "google/gemma-4-26B-A4B-it", + "drafter_id": "z-lab/gemma-4-26B-A4B-it-DFlash", + "f_theta_dir": "results/research/f_theta_v5_s5_sliding", + "sink_size": 4, + "window_size": 64, + "gen_tokens": 16, + "n_samples": 3, + "haystack_lines": [ + 160, + 320 + ] + }, + "verifier_dims": { + "num_hidden_layers": 30, + "num_key_value_heads": 8, + "head_dim": 256, + "sliding_window": 1024 + }, + "env": { + "gpu": "NVIDIA H200", + "torch": "2.12.0+cu130" + }, + "results": [ + { + "haystack_lines": 160, + "prompt_tokens": { + "min": 3238, + "max": 3238 + }, + "ar": { + "decode_tokens_per_s": 16.314, + "prefill_s_mean": 0.3291, + "kv_bytes_final": 733061120, + "peak_mem_bytes": 57767987712, + "recall": 1.0, + "decode_tokens": 48 + }, + "restored": { + "decode_tokens_per_s": 16.523, + "prefill_s_mean": 0.704, + "resident_kv_bytes": 16711680, + "resident_window_tokens": 68, + "effective_context_tokens": 3254, + "peak_mem_bytes": 59053841408, + "recall": 1.0, + "decode_tokens": 48 + }, + "comparison": { + "kv_memory_saving_x": 43.9, + "ar_kv_mb": 733.06, + "restored_resident_kv_mb": 16.71, + "context_compression_x": 47.9, + "throughput_ratio_restored_over_ar": 1.013 + } + }, + { + "haystack_lines": 320, + "prompt_tokens": { + "min": 6438, + "max": 6438 + }, + "ar": { + "decode_tokens_per_s": 16.632, + "prefill_s_mean": 0.6351, + "kv_bytes_final": 1453957120, + "peak_mem_bytes": 63769947648, + "recall": 1.0, + "decode_tokens": 48 + }, + "restored": { + "decode_tokens_per_s": 16.527, + "prefill_s_mean": 1.3956, + "resident_kv_bytes": 16711680, + "resident_window_tokens": 68, + "effective_context_tokens": 6454, + "peak_mem_bytes": 65653019136, + "recall": 1.0, + "decode_tokens": 48 + }, + "comparison": { + "kv_memory_saving_x": 87.0, + "ar_kv_mb": 1453.96, + "restored_resident_kv_mb": 16.71, + "context_compression_x": 94.9, + "throughput_ratio_restored_over_ar": 0.994 + } + } + ] +} \ No newline at end of file diff --git a/results/research/k3_gpu_beta_fused_throughput.json b/results/research/k3_gpu_beta_fused_throughput.json new file mode 100644 index 00000000..a94e55ac --- /dev/null +++ b/results/research/k3_gpu_beta_fused_throughput.json @@ -0,0 +1,321 @@ +{ + "kind": "k3_specdecode_gpu_bench", + "config": { + "verifier_id": "google/gemma-4-26B-A4B-it", + "drafter_id": "z-lab/gemma-4-26B-A4B-it-DFlash", + "f_theta_dir": "results/research/f_theta_v5_s5_sliding", + "haystack_lines": 160, + "n_samples": 3, + "max_new_tokens": 64, + "block_size": 16, + "sink": 4, + "window": 64, + "seed": 0, + "skip_unfused": true, + "output": "results/research/k3_specdecode_gpu_main.json" + }, + "env": { + "gpu": "NVIDIA H200", + "torch": "2.12.0+cu130" + }, + "prompt_tokens": { + "min": 3238, + "max": 3238 + }, + "ar_incremental": { + "decode_tokens_per_s_mean": 16.125, + "recall": 1.0 + }, + "restored_pertoken": { + "decode_tokens_per_s_mean": 16.297, + "recall": 1.0 + }, + "restored_specdecode": { + "skipped": true, + "decode_tokens_per_s_mean": null, + "mean_accept_len": 0.0, + "recall": 0.0, + "per_sample": [ + { + "decode_tokens_per_s": null, + "mean_accept_len": 0.0, + "time_breakdown_s": { + "aux_clean_forward": 0.0, + "drafter": 0.0, + "incremental_verify": 0.0 + }, + "tokens": [] + }, + { + "decode_tokens_per_s": null, + "mean_accept_len": 0.0, + "time_breakdown_s": { + "aux_clean_forward": 0.0, + "drafter": 0.0, + "incremental_verify": 0.0 + }, + "tokens": [] + }, + { + "decode_tokens_per_s": null, + "mean_accept_len": 0.0, + "time_breakdown_s": { + "aux_clean_forward": 0.0, + "drafter": 0.0, + "incremental_verify": 0.0 + }, + "tokens": [] + } + ] + }, + "restored_specdecode_fused": { + "decode_tokens_per_s_mean": 28.937, + "mean_accept_len": 3.32, + "time_breakdown_s_mean": { + "drafter_cached": 0.319, + "incremental_verify": 1.894, + "ctx_kv_extend": 0.034 + }, + "recall": 1.0, + "per_sample": [ + { + "tokens": [ + 818, + 6789, + 3393, + 563, + 5213, + 28487, + 1618, + 236772, + 236832, + 236828, + 236819, + 236771, + 84750, + 106, + 106, + 107, + 45518, + 107, + 101, + 818, + 6789, + 3393, + 563, + 5213, + 28487, + 1618, + 236772, + 236832, + 236828, + 236819, + 236771, + 84750, + 106, + 106, + 107, + 45518, + 107, + 101, + 818, + 6789, + 3393, + 563, + 5213, + 28487, + 1618, + 236772, + 236832, + 236828, + 236819, + 236771, + 84750, + 106, + 106, + 106, + 45518, + 107, + 101, + 818, + 6789, + 3393, + 563, + 5213, + 28487, + 1618 + ], + "decode_s": 1.8752116989344358, + "prefill_s": 0.757, + "decode_tokens_per_s": 34.129, + "time_breakdown_s": { + "drafter_cached": 0.073, + "incremental_verify": 1.769, + "ctx_kv_extend": 0.032 + }, + "blocks": 14, + "mean_accept_len": 3.64, + "decode_tokens": 64 + }, + { + "tokens": [ + 818, + 6789, + 3393, + 563, + 5213, + 236777, + 59790, + 236772, + 236828, + 236819, + 236825, + 236770, + 84750, + 106, + 106, + 45518, + 107, + 101, + 818, + 6789, + 3393, + 563, + 5213, + 236777, + 59790, + 236772, + 236828, + 236819, + 236825, + 236770, + 84750, + 106, + 106, + 106, + 107, + 45518, + 107, + 101, + 818, + 6789, + 3393, + 563, + 5213, + 236777, + 59790, + 236772, + 236828, + 236819, + 236825, + 236770, + 84750, + 106, + 106, + 106, + 107, + 45518, + 107, + 101, + 818, + 6789, + 3393, + 563, + 5213, + 236777 + ], + "decode_s": 2.321441221050918, + "prefill_s": 0.757, + "decode_tokens_per_s": 27.569, + "time_breakdown_s": { + "drafter_cached": 0.382, + "incremental_verify": 1.904, + "ctx_kv_extend": 0.034 + }, + "blocks": 15, + "mean_accept_len": 3.33, + "decode_tokens": 64 + }, + { + "tokens": [ + 818, + 6789, + 3393, + 563, + 5213, + 4989, + 26742, + 236772, + 236825, + 236828, + 236825, + 236825, + 84750, + 106, + 106, + 45518, + 107, + 101, + 4989, + 26742, + 236772, + 236825, + 236828, + 236825, + 236825, + 106, + 106, + 106, + 107, + 45518, + 107, + 101, + 818, + 6789, + 3393, + 563, + 5213, + 4989, + 26742, + 236772, + 236825, + 236828, + 236825, + 236825, + 84750, + 106, + 106, + 106, + 106, + 45518, + 107, + 101, + 4989, + 26742, + 236772, + 236825, + 236828, + 236825, + 236825, + 106, + 106, + 106, + 45518, + 107 + ], + "decode_s": 2.5486192000098526, + "prefill_s": 0.757, + "decode_tokens_per_s": 25.112, + "time_breakdown_s": { + "drafter_cached": 0.502, + "incremental_verify": 2.008, + "ctx_kv_extend": 0.037 + }, + "blocks": 16, + "mean_accept_len": 3.0, + "decode_tokens": 64 + } + ], + "speedup_over_ar_x": 1.79 + } +} \ No newline at end of file